Nous avons créé un script qui convertit le format GFF3, souvent utilisé pour décrire les annotations génomiques, vers le format d'annotation MSS (Mass Submission System) requis pour l'enregistrement de données génomiques auprès de DDBJ.
GFF2MSS.py genome.gff
Utilisez-le sous la forme ci-dessus. Soyez prudent lorsque vous l'utilisez, car le débogage n'est pas terminé. Seules les régions codant pour des protéines (CDS) sont prises en charge. Veuillez compléter vous-même la partie (section COMMON) qui décrit les informations du déposant.
Un exemple du fichier GFF3 attendu est donné plus bas (example.gff). Il est conseillé de trier le fichier au préalable avec gff3sort (https://github.com/billzt/gff3sort).
GFF2MSS.py
#!/usr/bin/python
# coding: UTF-8
import sys
from Bio import SeqIO
from Bio import Seq
from BCBio import GFF
def sequence_region_end(regions, contig_id, fallback_index):
    """Return the end coordinate of the sequence-region entry for a contig.

    ``regions`` is the list stored under ``rec.annotations["sequence-region"]``.
    Each entry is indexed as ``(contig_id, start, end)``; this layout is
    assumed from the ``##sequence-region unitig_0 1 307079`` directive and
    from the script reading index 2 as the end -- TODO confirm against the
    installed bcbio-gff version.

    The entry whose first field equals ``contig_id`` is preferred, so the
    result does not depend on records being yielded in the same order as the
    directives.  When no entry matches, the entry at ``fallback_index`` is
    used, which is what the script did originally.

    Raises IndexError when neither lookup succeeds.
    """
    for region in regions:
        if (isinstance(region, (tuple, list)) and len(region) >= 3
                and region[0] == contig_id):
            return region[2]
    return regions[fallback_index][2]


def format_location(cds_ranges, strand):
    """Build the location string of one CDS feature.

    ``cds_ranges`` is a sequence of ``(start, end)`` pairs in output order.
    One is added to every start before printing (presumably because the
    parsed locations are 0-based, half-open -- verify against Biopython).
    Several segments are wrapped in ``join(...)`` and a strand of ``-1``
    wraps the whole expression in ``complement(...)``.
    """
    cds_ranges = list(cds_ranges)
    location = ",".join(
        str(start + 1) + ".." + str(end) for start, end in cds_ranges
    )
    if len(cds_ranges) > 1:
        location = "join(" + location + ")"
    if strand == -1:
        location = "complement(" + location + ")"
    return location


def write_source(out, contig_id, region_end):
    """Write the ``source`` feature lines that introduce a contig."""
    out.write(
        contig_id + "\t" + "source" + "\t" + str(1) + ".." + str(region_end)
        + "\t" + "ff_definition" + "\t" + "@@[organism]@@ DNA, contig: "
        + contig_id + "\n"
    )
    out.write("\t" + "\t" + "\t" + "note" + "\t" + "contig: " + contig_id + "\n")


def write_cds_features(rec, out):
    """Write one CDS feature per transcript found under the record's genes.

    Only sub-features of type ``CDS`` contribute to the location, so exon or
    UTR children no longer end up inside the ``join(...)``.  A transcript
    without any CDS child is skipped; previously it produced a CDS line with
    an empty location and an undefined or stale ``transl_table``.

    Raises KeyError when a coding transcript lacks the ``ID`` or ``Note``
    attribute (``Note`` is used as the locus_tag, as in the example GFF3).
    """
    for gene_f in rec.features:
        for mrna_f in gene_f.sub_features:
            cds_list = [f for f in mrna_f.sub_features if f.type == "CDS"]
            if not cds_list:
                continue

            # Gene-level information carried by the transcript line.
            mrna_id = mrna_f.qualifiers["ID"]
            locus_tag_id = mrna_f.qualifiers["Note"]
            product_name = mrna_f.qualifiers.get("product", ["Unknown_product"])
            # As before, the value of the last CDS segment wins.
            transl_table = cds_list[-1].qualifiers.get("transl_table", ["1"])

            ranges = [(c.location.start, c.location.end) for c in cds_list]
            out.write(
                "\tCDS\t" + format_location(ranges, mrna_f.strand)
                + "\tcodon_start\t1" + "\n"
            )
            out.write("\t\t\t" + "locus_tag\t" + locus_tag_id[0] + "\n")
            out.write("\t\t\t" + "note\t" + mrna_id[0] + "\n")
            out.write("\t\t\t" + "product\t" + product_name[0] + "\n")
            out.write("\t\t\t" + "transl_table\t" + transl_table[0] + "\n")


def main(argv):
    """Convert the GFF3 file named in ``argv[1]`` to MSS lines on stdout.

    Returns the process exit status: 0 on success, 2 when the input file
    argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("Usage: GFF2MSS.py genome.gff\n")
        return 2

    pre_contig = ""
    contig_count = 0
    # The context manager closes the file even when parsing raises.
    with open(argv[1]) as in_handle:
        for rec in GFF.parse(in_handle):
            now_contig = rec.id
            regions = rec.annotations["sequence-region"]
            region_end = sequence_region_end(regions, now_contig, contig_count)
            if pre_contig != now_contig:
                write_source(sys.stdout, now_contig, region_end)
                pre_contig = now_contig
            write_cds_features(rec, sys.stdout)
            contig_count += 1
            # Kept from the original script: stop once as many records as
            # there are sequence-region directives have been processed.
            if contig_count == len(regions):
                break
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
example.gff
##sequence-region unitig_0 1 307079
unitig_0 . gene 1137 4305 . - . ID=LOCUS_TAG_0000100;Note=g35293
unitig_0 . mRNA 1137 4305 . - . ID=g35293.t1;Note=LOCUS_TAG_0000100;Parent=LOCUS_TAG_0000100;product=hypothetical protein
unitig_0 . CDS 1137 1462 . - 1 ID=g35293.t1_1;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2201 2583 . - 1 ID=g35293.t1_2;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2900 3031 . - 1 ID=g35293.t1_3;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3381 3597 . - 1 ID=g35293.t1_4;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3666 4073 . - 1 ID=g35293.t1_5;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 4278 4305 . - 1 ID=g35293.t1_6;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . gene 4844 5880 . - . ID=LOCUS_TAG_0000200;Note=g35294
unitig_0 . mRNA 4844 5880 . - . ID=g35294.t1;Note=LOCUS_TAG_0000200;Parent=LOCUS_TAG_0000200;product=hypothetical protein
unitig_0 . CDS 4844 5544 . - 1 ID=g35294.t1_1;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . CDS 5628 5880 . - 1 ID=g35294.t1_2;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . gene 6030 6836 . - . ID=LOCUS_TAG_0000300;Note=g35295
unitig_0 . mRNA 6030 6836 . - . ID=g35295.t1;Note=LOCUS_TAG_0000300;Parent=LOCUS_TAG_0000300;product=hypothetical protein
unitig_0 . CDS 6030 6335 . - 1 ID=g35295.t1_1;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6427 6468 . - 1 ID=g35295.t1_2;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6539 6739 . - 1 ID=g35295.t1_3;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6819 6836 . - 1 ID=g35295.t1_4;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . gene 13014 14367 . + . ID=LOCUS_TAG_0000400;Note=g35296
unitig_0 . mRNA 13014 14367 . + . ID=g35296.t1;Note=LOCUS_TAG_0000400;Parent=LOCUS_TAG_0000400;product=hypothetical protein
unitig_0 . CDS 13014 13016 . + 1 ID=g35296.t1_1;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13106 13201 . + 1 ID=g35296.t1_2;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13271 13316 . + 1 ID=g35296.t1_3;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13439 13638 . + 1 ID=g35296.t1_4;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13716 13893 . + 1 ID=g35296.t1_5;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13961 14119 . + 1 ID=g35296.t1_6;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 14186 14367 . + 1 ID=g35296.t1_7;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
Recommended Posts