Source code for padmet.utils.connection.gbk_to_faa

# -*- coding: utf-8 -*-
"""
Description:
    convert GBK to FAA with Bio package

::

    usage:
        padmet gbk_to_faa    --gbk=FILE --output=FILE [--qualifier=STR] [-v]

    option:
        -h --help    Show help.
        --gbk=FILE    path to the gbk file.
        --output=FILE    path to the output, a FAA file.
        --qualifier=STR    the qualifier of the gene id [default: locus_tag].
        -v   print info
"""
import docopt
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

try:
    # Import to be compatible with biopython version lesser than 1.78
    from Bio.Alphabet.IUPAC import protein
except ImportError:
    # Exception to be compatible with biopython version superior to 1.78
    protein = None


[docs] def command_help(): """ Show help for analysis command. """ print(docopt.docopt(__doc__))
[docs] def gbk_to_faa_cli(command_args): args = docopt.docopt(__doc__, argv=command_args) gbk_file = args["--gbk"] output = args["--output"] qualifier = args["--qualifier"] verbose = args["-v"] gbk_to_faa(gbk_file, output, qualifier, verbose)
[docs] def gbk_to_faa(gbk_file, output, qualifier='locus_tag', verbose=True): """ convert GBK to FAA with Bio package Parameters ---------- gbk_file: str path to the gbk file output: str path to the output, a FAA file qualifier: str he qualifier of the gene id verbose: bool if True print information """ if not os.path.exists(gbk_file): raise FileNotFoundError("No Genbank file (--gbk/gbk_file) accessible at " + gbk_file) fasta_records = [] fasta_ids = {} with open(gbk_file, "r") as gbk: for seq_record in SeqIO.parse(gbk, "genbank"): seq_feature_cds = (seq_feature for seq_feature in seq_record.features if seq_feature.type == "CDS") for seq_feature in seq_feature_cds: try: fasta_id = seq_feature.qualifiers[qualifier][0] if fasta_id not in fasta_ids: # Keep compatibility with biopython version lesser than 1.78 if protein: fasta_record = SeqRecord(Seq(seq_feature.qualifiers['translation'][0], protein), id=fasta_id, description=fasta_id) else: fasta_record = SeqRecord(Seq(seq_feature.qualifiers['translation'][0]), id=fasta_id, description=fasta_id) fasta_records.append(fasta_record) fasta_ids[fasta_id] = 1 else: fasta_ids[fasta_id] += 1 isoform_id = fasta_id + '_isoform' + str(fasta_ids[fasta_id]) # Keep compatibility with biopython version lesser than 1.78 if protein: fasta_record = SeqRecord(Seq(seq_feature.qualifiers['translation'][0], protein), id=isoform_id, description=fasta_id) else: fasta_record = SeqRecord(Seq(seq_feature.qualifiers['translation'][0]), id=isoform_id, description=fasta_id) fasta_records.append(fasta_record) except KeyError: if verbose: try: print("locus without Translation: "+seq_feature.qualifiers['locus_tag'][0]) except KeyError: print("locus without Translation: "+seq_feature.qualifiers.get('gene',["Unknown"])[0]) SeqIO.write(fasta_records, output, "fasta")