Wir haben ein Skript erstellt, das das GFF3-Format, das häufig zur Beschreibung genomischer Annotationen verwendet wird, in das Annotationsformat (MSS) konvertiert, das für die Registrierung genomischer Daten bei DDBJ erforderlich ist.
GFF2MSS.py genome.gff
Bitte rufen Sie das Skript in der oben gezeigten Form auf. Verwenden Sie es mit Vorsicht, da die Fehlerbehandlung nicht vollständig ist. Es unterstützt nur proteinkodierende Regionen (CDS). Bitte erstellen Sie den Teil, der die Registrierungsinformationen beschreibt (COMMON-Abschnitt), von Hand.
Unten finden Sie ein Beispiel für die erwartete GFF3-Datei. Es empfiehlt sich, die Datei vorab mit gff3sort (https://github.com/billzt/gff3sort) zu verarbeiten.
GFF2MSS.py
#!/usr/bin/python
# coding: UTF-8
import sys
from Bio import SeqIO
from Bio import Seq
from BCBio import GFF
def format_location(spans, strand):
    """Build a feature-table location string from 1-based inclusive spans.

    spans  -- sequence of (start, end) pairs, in the order they should appear.
    strand -- -1 wraps the result in ``complement(...)``; any other value
              leaves it unwrapped.

    More than one span is wrapped in ``join(...)``.
    """
    text = ",".join("{0}..{1}".format(start, end) for start, end in spans)
    if len(spans) > 1:
        text = "join(" + text + ")"
    if strand == -1:
        text = "complement(" + text + ")"
    return text


def region_end_for(regions, contig_id):
    """Return the end coordinate of the sequence-region entry for contig_id.

    regions is the list stored under the "sequence-region" annotation; each
    usable entry is a (seqid, start, end) sequence.  Returns None when no
    entry names contig_id.
    """
    for region in regions:
        if (isinstance(region, (tuple, list)) and len(region) >= 3
                and region[0] == contig_id):
            return region[2]
    return None


def mrna_lines(mRNA_f):
    """Return the output lines describing the CDS of one mRNA feature.

    Only sub-features whose type is "CDS" contribute to the location, so
    exon/UTR children do not end up in the join.  An mRNA without any CDS
    child yields no lines.

    Raises KeyError when the mRNA has no "ID" or "Note" attribute; "Note" is
    the attribute this script reads the locus_tag from.
    """
    cds_features = [f for f in mRNA_f.sub_features if f.type == "CDS"]
    if not cds_features:
        return []

    mRNA_ID = mRNA_f.qualifiers["ID"]
    locus_tag_ID = mRNA_f.qualifiers["Note"]
    product_name = mRNA_f.qualifiers.get("product", ["Unknown_product"])
    # As before, the value on the last CDS wins; default to table 1.
    transl_table = cds_features[-1].qualifiers.get("transl_table", ["1"])

    # location.start is 0-based, the output is 1-based inclusive.
    spans = [(int(f.location.start) + 1, int(f.location.end))
             for f in cds_features]
    location = format_location(spans, mRNA_f.strand)

    return [
        "\tCDS\t" + location + "\tcodon_start\t1",
        "\t\t\t" + "locus_tag\t" + locus_tag_ID[0],
        "\t\t\t" + "note\t" + mRNA_ID[0],
        "\t\t\t" + "product\t" + product_name[0],
        "\t\t\t" + "transl_table\t" + transl_table[0],
    ]


def main(argv=None):
    """Convert the GFF3 file named in argv[1] to feature-table lines on stdout.

    argv defaults to sys.argv.  Returns 0 on success and 1 when the input
    file argument is missing.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.stderr.write("Usage: GFF2MSS.py genome.gff\n")
        return 1

    PreContig = ""
    # The context manager closes the file even if parsing raises.
    with open(args[1]) as in_handle:
        for rec in GFF.parse(in_handle):
            NowContig = rec.id
            # Look the contig up by name instead of by running index, so the
            # length stays correct when records and ##sequence-region
            # directives are not in the same order.
            regions = rec.annotations.get("sequence-region", [])
            region_end = region_end_for(regions, NowContig)
            if region_end is None:
                sys.stderr.write(
                    "Warning: no ##sequence-region for " + str(NowContig)
                    + "; record skipped\n")
                continue

            if PreContig != NowContig:
                print(NowContig + "\t" + "source" + "\t" + str(1) + ".."
                      + str(region_end) + "\t" + "ff_definition" + "\t"
                      + "@@[organism]@@ DNA, contig: " + NowContig)
                print("\t" + "\t" + "\t" + "note" + "\t" + "contig: "
                      + NowContig)
                PreContig = NowContig

            for gene_f in rec.features:
                for mRNA_f in gene_f.sub_features:
                    for line in mrna_lines(mRNA_f):
                        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())
example.gff
##sequence-region unitig_0 1 307079
unitig_0 . gene 1137 4305 . - . ID=LOCUS_TAG_0000100;Note=g35293
unitig_0 . mRNA 1137 4305 . - . ID=g35293.t1;Note=LOCUS_TAG_0000100;Parent=LOCUS_TAG_0000100;product=hypothetical protein
unitig_0 . CDS 1137 1462 . - 1 ID=g35293.t1_1;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2201 2583 . - 1 ID=g35293.t1_2;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 2900 3031 . - 1 ID=g35293.t1_3;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3381 3597 . - 1 ID=g35293.t1_4;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 3666 4073 . - 1 ID=g35293.t1_5;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . CDS 4278 4305 . - 1 ID=g35293.t1_6;Name=LOCUS_TAG_0000100;Note=g35293.t1;Parent=g35293.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.497
unitig_0 . gene 4844 5880 . - . ID=LOCUS_TAG_0000200;Note=g35294
unitig_0 . mRNA 4844 5880 . - . ID=g35294.t1;Note=LOCUS_TAG_0000200;Parent=LOCUS_TAG_0000200;product=hypothetical protein
unitig_0 . CDS 4844 5544 . - 1 ID=g35294.t1_1;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . CDS 5628 5880 . - 1 ID=g35294.t1_2;Name=LOCUS_TAG_0000200;Note=g35294.t1;Parent=g35294.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.317
unitig_0 . gene 6030 6836 . - . ID=LOCUS_TAG_0000300;Note=g35295
unitig_0 . mRNA 6030 6836 . - . ID=g35295.t1;Note=LOCUS_TAG_0000300;Parent=LOCUS_TAG_0000300;product=hypothetical protein
unitig_0 . CDS 6030 6335 . - 1 ID=g35295.t1_1;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6427 6468 . - 1 ID=g35295.t1_2;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6539 6739 . - 1 ID=g35295.t1_3;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . CDS 6819 6836 . - 1 ID=g35295.t1_4;Name=LOCUS_TAG_0000300;Note=g35295.t1;Parent=g35295.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.188
unitig_0 . gene 13014 14367 . + . ID=LOCUS_TAG_0000400;Note=g35296
unitig_0 . mRNA 13014 14367 . + . ID=g35296.t1;Note=LOCUS_TAG_0000400;Parent=LOCUS_TAG_0000400;product=hypothetical protein
unitig_0 . CDS 13014 13016 . + 1 ID=g35296.t1_1;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13106 13201 . + 1 ID=g35296.t1_2;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13271 13316 . + 1 ID=g35296.t1_3;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13439 13638 . + 1 ID=g35296.t1_4;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13716 13893 . + 1 ID=g35296.t1_5;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 13961 14119 . + 1 ID=g35296.t1_6;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
unitig_0 . CDS 14186 14367 . + 1 ID=g35296.t1_7;Name=LOCUS_TAG_0000400;Note=g35296.t1;Parent=g35296.t1;codon_start=1;product=hypothetical protein;transl_table=1;translation=length.287
Recommended Posts