[PYTHON] Remove and retrieve sequences from a FASTA file according to an ID list file

Postscript: 2018/06/29

You can do the same with seqkit grep

https://github.com/shenwei356/seqkit

Extract and delete sequences from a FASTA file

Delete the sequences whose IDs are listed in the ID list file, or extract only those sequences.

fasta_extract.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Use the FASTA record ID as the key and write the matching sequences to standard output

import sys
from Bio import SeqIO


fasta_in = sys.argv[1]  # First argument: FASTA file to filter
query = sys.argv[2]     # Second argument: text file with one record ID per line

# Read the ID list once, up front. Re-opening the query file for every FASTA
# record leaked one file handle per record and rescanned the whole list each
# time; a set also makes each lookup constant-time.
with open(query, "r") as query_file:
    wanted_ids = {line.rstrip() for line in query_file}

# SeqIO.parse yields records one at a time, so the FASTA file is streamed
# rather than loaded into memory as a whole.
for record in SeqIO.parse(fasta_in, 'fasta'):
    # Exact match between the record ID and a line of the query file.
    if record.id.rstrip() in wanted_ids:
        # Each selected record is written exactly once, even when its ID
        # appears several times in the query file. str() makes the
        # concatenation independent of how Seq implements "+".
        print('>' + record.description + '\n' + str(record.seq))

fasta_remove.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Use the FASTA record ID as the key and write the sequences that did NOT match to standard output

import sys
from Bio import SeqIO


fasta_in = sys.argv[1]  # First argument: FASTA file to filter
query = sys.argv[2]     # Second argument: text file with one record ID per line

# Read the ID list once, up front. Re-opening the query file for every FASTA
# record leaked one file handle per record and rescanned the whole list each
# time; a set also makes each lookup constant-time.
with open(query, "r") as query_file:
    excluded_ids = {line.rstrip() for line in query_file}

# SeqIO.parse yields records one at a time, so the FASTA file is streamed
# rather than loaded into memory as a whole.
for record in SeqIO.parse(fasta_in, 'fasta'):
    # Keep only the records whose ID is NOT listed in the query file
    # (exact match).
    if record.id.rstrip() not in excluded_ids:
        # str() makes the concatenation independent of how Seq implements "+".
        print('>' + record.description + '\n' + str(record.seq))

fasta_extract_cont.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Use the FASTA record ID as the key and write the matching sequences to standard output (partial-match version)

import sys
from Bio import SeqIO


fasta_in = sys.argv[1]  # First argument: FASTA file to filter
query = sys.argv[2]     # Second argument: text file with one keyword per line

# Read the keyword list once, up front, instead of re-opening (and never
# closing) the query file for every FASTA record.
# Blank lines are dropped: the empty string is a substring of every ID, so a
# single empty line in the query file would otherwise select every record.
with open(query, "r") as query_file:
    stripped_lines = (line.rstrip() for line in query_file)
    keywords = [keyword for keyword in stripped_lines if keyword]

# SeqIO.parse yields records one at a time, so the FASTA file is streamed
# rather than loaded into memory as a whole.
for record in SeqIO.parse(fasta_in, 'fasta'):
    record_id = record.id.rstrip()
    # Partial match: the record is selected when any keyword occurs inside
    # its ID. any() stops at the first hit, so a record matching several
    # keywords is still written only once.
    if any(keyword in record_id for keyword in keywords):
        # str() makes the concatenation independent of how Seq implements "+".
        print('>' + record.description + '\n' + str(record.seq))

fasta_remove_cont.py


#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Use the FASTA record ID as the key and write the sequences that did NOT match to standard output (partial-match version)

import sys
from Bio import SeqIO


fasta_in = sys.argv[1]  # First argument: FASTA file to filter
query = sys.argv[2]     # Second argument: text file with one keyword per line

# Read the keyword list once, up front, instead of re-opening (and never
# closing) the query file for every FASTA record.
# Blank lines are dropped: the empty string is a substring of every ID, so a
# single empty line in the query file would otherwise remove every record.
with open(query, "r") as query_file:
    stripped_lines = (line.rstrip() for line in query_file)
    keywords = [keyword for keyword in stripped_lines if keyword]

# SeqIO.parse yields records one at a time, so the FASTA file is streamed
# rather than loaded into memory as a whole.
for record in SeqIO.parse(fasta_in, 'fasta'):
    record_id = record.id.rstrip()
    # Partial match: a record is dropped when any keyword occurs inside its
    # ID; everything else is written out unchanged.
    if not any(keyword in record_id for keyword in keywords):
        # str() makes the concatenation independent of how Seq implements "+".
        print('>' + record.description + '\n' + str(record.seq))

Version that searches the description line instead of the ID

fasta_remove_V3.py



#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from Bio import SeqIO


fasta_in = sys.argv[1]  # First argument: FASTA file to filter
query = sys.argv[2]     # Second argument: text file with one search keyword per line

# Read the keyword list once, up front, instead of re-opening (and never
# closing) the query file for every FASTA record.
# Blank lines are dropped: the empty string is a substring of every
# description, so a single empty line in the query file would otherwise
# remove every record.
with open(query, "r") as query_file:
    stripped_lines = (line.rstrip() for line in query_file)
    keywords = [keyword for keyword in stripped_lines if keyword]

# SeqIO.parse yields records one at a time, so the FASTA file is streamed
# rather than loaded into memory as a whole.
for record in SeqIO.parse(fasta_in, 'fasta'):
    # This variant searches record.description rather than record.id.
    description = record.description.rstrip()
    # Partial match: a record is dropped when any keyword occurs inside its
    # description; everything else is written out unchanged.
    if not any(keyword in description for keyword in keywords):
        # str() makes the concatenation independent of how Seq implements "+".
        print('>' + record.description + '\n' + str(record.seq))

Recommended Posts

Remove and retrieve arrays from fasta according to the ID list file
[Python] How to remove duplicate values from the list
Use Boto3 to retrieve over 1000 Prefixes from S3's file list
Coloring points according to the distance from the regression curve
Python --Read data from a numeric data file to find the covariance matrix, eigenvalues, and eigenvectors
Points to note when deleting multiple elements from the List
Extract the value closest to a value from a Python list element
Read the old Gakushin DC application Word file (.doc) from Python and try to operate it.
Dot according to the image
Remove the frame from the image
Output the key list included in S3 Bucket to a file
It's faster to add than to join and extend the list, right?
Repeat with While. Scripts to Tweet and search from the terminal
How to compare lists and retrieve common elements in a list
If you remove the list to be looped, you will get terrible.
How to get followers and followers from python using the Mastodon API
[EC2] How to install and download chromedriver from the command line
Script to organize LDOS and PDOS from VASP output file DOSCAR
Learn Bayesian statistics from the basics to learn the M-H and HMC methods
Dig the directory and create a list of directory paths + file names
How to remove duplicates from a Python list while preserving order.