Source code for padmet.utils.connection.check_orthology_input

# -*- coding: utf-8 -*-
"""
Description:
    Before running orthology based reconstruction it is necessary to check if the metabolic network 
    and the proteome of the model organism use the same ids for genes (or at least more than a given cutoff).
    To only check this. Use the 2nd usage.

    If the genes ids are not the same, it is necessary to use a dictionnary of genes ids associating
    the genes ids from the proteome to the genes ids from the metabolic network.

    To create the correct proteome from the dictionnnary, use the 3nd usage
    Finnaly by using the 1st usage, it is possible to:

        1/ Check model_faa and model_metabolic for a given cutoff

        2/ if under the cutoff, convert model_faa to the correct one with dict_ids_file

        3/ if still under, SystemExit()

::

    usage:
        padmet check_orthology_input    --model_metabolic=FILE    --model_faa=FILE    [--cutoff=FLOAT] [--dict_ids_file=FILE] --output=FILE    [-v]
        padmet check_orthology_input    --model_metabolic=FILE    --model_faa=FILE    [--cutoff=FLOAT] [-v]
        padmet check_orthology_input    --model_faa=FILE    --dict_ids_file=FILE    --output=FILE [-v]

    option:
        -h --help    Show help.
        --model_metabolic=FILE    pathname to the metabolic network of the model (sbml).
        --model_faa=FILE    pathname to the proteome of the model (faa)
        --cutoff=FLOAT    cutoff [0:1] for comparing model_metabolic and model_faa. [default: 0.70].
        --dict_ids_file=FILE    pathname to the dict associating genes ids from the model_metabolic to the model_faa. line = gene_id_in_metabolic_network\tgene_id_in_faa
        --output=FILE    output of get_valid_faa (a faa) or get_dict_ids (a dictionnary of gene ids in tsv)
        -v   print info
"""
import docopt
import re
import itertools
import libsbml
import os

from Bio import SeqIO
from padmet.utils import sbmlPlugin as sp



[docs]
def command_help():
    """
    Show help for analysis command.
    """
    print(docopt.docopt(__doc__))




[docs]
def check_orthology_input_cli(command_args):
    args = docopt.docopt(__doc__, argv=command_args)
    model_metabolic = args["--model_metabolic"]
    model_faa = args["--model_faa"]
    dict_ids_file = args["--dict_ids_file"]
    output = args["--output"]
    verbose = args["-v"]
    cutoff = float(args["--cutoff"])
    check_orthology_input(model_metabolic, model_faa, dict_ids_file, output, verbose, cutoff)




[docs]
def check_orthology_input(model_metabolic, model_faa, dict_ids_file, output, verbose, cutoff):
    """
    #TODO
    """
    if model_metabolic is not None:
        if not os.path.exists(model_metabolic):
            raise FileNotFoundError("No SBML file accessible (--model_metabolic/model_metabolic) at " + model_metabolic)

        if not os.path.exists(model_faa):
            raise FileNotFoundError("No fasta file accessible (--model_faa/model_faa) at " + model_faa)

        if verbose: print("check genes ids model_metablic vs model_faa")
        #if true: more than cutoff% match
        if check_ids(model_metabolic, model_faa, cutoff, verbose):
            return True
        elif dict_ids_file is not None:
            if verbose: print("creating a valid FAA with the dictionnary")
            get_valid_faa(model_faa, dict_ids_file, output)
            if verbose: print("check genes ids")
            if check_ids(model_metabolic, output, cutoff, verbose):
                return True
            else:
                raise SystemExit("Also with the dictionnary, genes ids in the metabolic network and the FAA are not the same")
        else:
                raise SystemExit("Change the cutoff or use an other dictionnary")
                                
    elif dict_ids_file is not None:
        if verbose: print("creating a valid FAA with the dictionnary")
        get_valid_faa(model_faa, dict_ids_file, output)




[docs]
def check_ids(model_metabolic, model_faa, cutoff, verbose=False):
    """
    check if genes ids of model_metabolic = model_faa for a given cutoff
    faa genes ids are in the first line of each sequence: >GENE_ID ....
    metabolic netowkrs genes ids are in note section, GENE_ASSOCIATION: gene_id-1 or gene_id-2

    Parameters
    ----------
    model_metabolic: str
        path to sbml file
    model_faa: str
        path to fasta faa file
    cutoff: int
        cutoff genes ids from model found in faa
    verbose: bool
        verbose
        
    Returns
    -------
    bool    
        True if same ids, if verbose, print % of genes under cutoff
    """
    reader = libsbml.SBMLReader()
    document = reader.readSBML(model_metabolic)
    model = document.getModel()
    document.getNumErrors()
    listOfReactions = model.getListOfReactions()
    #convert to set
    model_metabolic_ids = set(itertools.chain.from_iterable([sp.parseGeneAssoc(geneAssoc) 
    for geneAssoc in (sp.parseNotes(r).get("GENE_ASSOCIATION",[None])[0] for r in listOfReactions)
    if geneAssoc is not None]))
    
    with open(model_faa, "r") as f:
        model_faa_ids = set([record.id for record in SeqIO.parse(f, "fasta")])

    diff_genes = model_metabolic_ids.difference(model_faa_ids)
    try:
        diff_genes_ratio = float(len(diff_genes))/float(len(model_metabolic_ids))
    except ZeroDivisionError:
        raise SystemExit("No genes found in model metabolic")
    #if all model_metabolic_ids are in model_faa_ids
    if diff_genes_ratio == 0:
        if verbose: print("all genes of the model_metabolic are in the model_faa")
        return True
    #if not check if the nb is sup-equal to the cutoff
    elif diff_genes_ratio <= float(1-cutoff):
        if verbose: print("Only %.2f%% genes of the model_metabolic are not in the model_faa" % (diff_genes_ratio*100))
        return True
    else:
        if verbose: 
            print("%s%% genes of the model_metabolic are not in the model_faa" % (diff_genes_ratio*100))
            print(";".join(diff_genes))
        return False



[docs]
def get_valid_faa(model_faa, dict_ids_file, output):
    """
    create a new faa from the model_faa by converting the gene id with the dict_ids
    dict_ids: line = origin_id new_gene_id, sep = \t

    Parameters
    ----------
    model_faa: str
        path to faa file
    dict_ids_file: str
        path to file containing link old to new ids
    output: str
        path to new faa file
    """
    regex_origin_id = re.compile("^>(\w*)")
    with open(dict_ids_file, 'r') as f:
        dict_ids = dict([(line.split("\t")) for line in f.read().splitlines()])

    with open(output, 'w') as o:
        with open(model_faa, 'r') as f:
            for line in f.readlines():
                line = line.replace('"','')
                if line.startswith(">"):
                    origin_id = regex_origin_id.search(line).group(1)
                    new_gene_id = dict_ids.get(origin_id,origin_id)
                    line = line.replace(origin_id, new_gene_id)
                o.write(line)