# -*- coding: utf-8 -*-
"""
Description:
Before running orthology based reconstruction it is necessary to check if the metabolic network
and the proteome of the model organism use the same ids for genes (or at least more than a given cutoff).
To only check this. Use the 2nd usage.
If the genes ids are not the same, it is necessary to use a dictionnary of genes ids associating
the genes ids from the proteome to the genes ids from the metabolic network.
To create the correct proteome from the dictionnnary, use the 3nd usage
Finnaly by using the 1st usage, it is possible to:
1/ Check model_faa and model_metabolic for a given cutoff
2/ if under the cutoff, convert model_faa to the correct one with dict_ids_file
3/ if still under, SystemExit()
::
usage:
padmet check_orthology_input --model_metabolic=FILE --model_faa=FILE [--cutoff=FLOAT] [--dict_ids_file=FILE] --output=FILE [-v]
padmet check_orthology_input --model_metabolic=FILE --model_faa=FILE [--cutoff=FLOAT] [-v]
padmet check_orthology_input --model_faa=FILE --dict_ids_file=FILE --output=FILE [-v]
option:
-h --help Show help.
--model_metabolic=FILE pathname to the metabolic network of the model (sbml).
--model_faa=FILE pathname to the proteome of the model (faa)
--cutoff=FLOAT cutoff [0:1] for comparing model_metabolic and model_faa. [default: 0.70].
--dict_ids_file=FILE pathname to the dict associating genes ids from the model_metabolic to the model_faa. line = gene_id_in_metabolic_network\tgene_id_in_faa
--output=FILE output of get_valid_faa (a faa) or get_dict_ids (a dictionnary of gene ids in tsv)
-v print info
"""
import docopt
import re
import itertools
import libsbml
import os
from Bio import SeqIO
from padmet.utils import sbmlPlugin as sp
[docs]
def command_help():
"""
Show help for analysis command.
"""
print(docopt.docopt(__doc__))
[docs]
def check_ids(model_metabolic, model_faa, cutoff, verbose=False):
"""
check if genes ids of model_metabolic = model_faa for a given cutoff
faa genes ids are in the first line of each sequence: >GENE_ID ....
metabolic netowkrs genes ids are in note section, GENE_ASSOCIATION: gene_id-1 or gene_id-2
Parameters
----------
model_metabolic: str
path to sbml file
model_faa: str
path to fasta faa file
cutoff: int
cutoff genes ids from model found in faa
verbose: bool
verbose
Returns
-------
bool
True if same ids, if verbose, print % of genes under cutoff
"""
reader = libsbml.SBMLReader()
document = reader.readSBML(model_metabolic)
model = document.getModel()
document.getNumErrors()
listOfReactions = model.getListOfReactions()
#convert to set
model_metabolic_ids = set(itertools.chain.from_iterable([sp.parseGeneAssoc(geneAssoc)
for geneAssoc in (sp.parseNotes(r).get("GENE_ASSOCIATION",[None])[0] for r in listOfReactions)
if geneAssoc is not None]))
with open(model_faa, "r") as f:
model_faa_ids = set([record.id for record in SeqIO.parse(f, "fasta")])
diff_genes = model_metabolic_ids.difference(model_faa_ids)
try:
diff_genes_ratio = float(len(diff_genes))/float(len(model_metabolic_ids))
except ZeroDivisionError:
raise SystemExit("No genes found in model metabolic")
#if all model_metabolic_ids are in model_faa_ids
if diff_genes_ratio == 0:
if verbose: print("all genes of the model_metabolic are in the model_faa")
return True
#if not check if the nb is sup-equal to the cutoff
elif diff_genes_ratio <= float(1-cutoff):
if verbose: print("Only %.2f%% genes of the model_metabolic are not in the model_faa" % (diff_genes_ratio*100))
return True
else:
if verbose:
print("%s%% genes of the model_metabolic are not in the model_faa" % (diff_genes_ratio*100))
print(";".join(diff_genes))
return False
[docs]
def get_valid_faa(model_faa, dict_ids_file, output):
"""
create a new faa from the model_faa by converting the gene id with the dict_ids
dict_ids: line = origin_id new_gene_id, sep = \t
Parameters
----------
model_faa: str
path to faa file
dict_ids_file: str
path to file containing link old to new ids
output: str
path to new faa file
"""
regex_origin_id = re.compile("^>(\w*)")
with open(dict_ids_file, 'r') as f:
dict_ids = dict([(line.split("\t")) for line in f.read().splitlines()])
with open(output, 'w') as o:
with open(model_faa, 'r') as f:
for line in f.readlines():
line = line.replace('"','')
if line.startswith(">"):
origin_id = regex_origin_id.search(line).group(1)
new_gene_id = dict_ids.get(origin_id,origin_id)
line = line.replace(origin_id, new_gene_id)
o.write(line)