Source code for padmet.utils.exploration.compare_sbml

# -*- coding: utf-8 -*-
"""
Description:
    compare reactions in 1-n or 2 sbml.

    Returns if a reaction is missing

    And if a reaction with the same id is using different species or different reversibility

::

    usage:
        padmet compare_sbml --sbml=FILES/DIR --output=DIR

    option:
        -h --help    Show help.
        --sbml FILES/DIR    pathname of the sbml files, sep all files by ',', ex: /path/sbml1.sbml;/path/sbml2.sbml OR a folder
        --output DIR    pathname of the output folder
"""
import docopt
import csv
import os
import re

from cobra.io.sbml import read_sbml_model
from padmet.utils.sbmlPlugin import convert_from_coded_id
from padmet.utils.gbr import compile_input



[docs]
def command_help():
    """
    Show help for analysis command.
    """
    print(docopt.docopt(__doc__))




[docs]
def compare_sbml_cli(command_args):
    args = docopt.docopt(__doc__, argv=command_args)
    sbml_path = args["--sbml"]
    output_folder = args["--output"]
    compare_multiple_sbml(sbml_path, output_folder)




[docs]
def compare_multiple_sbml(sbml_path, output_folder):
    """
    Compare 1-n sbml, create two output files reactions.tsv and metabolites.tsv
    with the reactions/metabolites in each sbml

    Parameters
    ----------
    sbml_path: str
        path to a folder containing sbmls or multiple sbml paths separated by a ','
    output_folder: str
        path to the output folder
    """
    if not os.path.exists(output_folder):
        print("Creating %s" %output_folder)
        os.makedirs(output_folder)
    else:
        print("%s already exist, old comparison output folders will be overwritten" %output_folder)

    if os.path.isdir(sbml_path):
        if not os.path.exists(sbml_path):
            raise FileNotFoundError("No SBML directory (--sbml/sbml_path) accessible at " + sbml_path)
        all_files = [os.path.join(sbml_path, f) for f in next(os.walk(sbml_path))[2]]
    else:
        all_files = sbml_path.split(",")
        for sbml_file in all_files:
            if not os.path.exists(sbml_file):
                raise FileNotFoundError("No SBML file (--sbml/sbml_path) accessible at " + sbml_file)

    species_columns = [os.path.splitext(os.path.basename(all_file))[0] for all_file in sorted(all_files)]
    gene_columns = [os.path.splitext(os.path.basename(all_file))[0] + '_genes_assoc (sep=;)' for all_file in sorted(all_files)]
    all_reactions = {}
    all_compounds = []
    reactions = {}
    compounds = {}
    for sbml_file in all_files:
        sbml_1 = read_sbml_model(sbml_file)
        reactions[sbml_file] = sbml_1.reactions
        for rxn in sbml_1.reactions:
            if rxn.id not in all_reactions:
                all_reactions[rxn.id] = rxn
        compounds[sbml_file] = [metabolite.id for metabolite in sbml_1.metabolites]
        all_compounds.extend([metabolite.id for metabolite in sbml_1.metabolites])

    all_compounds = set(all_compounds)

    reaction_file = output_folder + '/reactions.tsv'
    reaction_file_rows = []

    for reaction_id in all_reactions:
        reaction_presents = []
        reaction_genes = []
        row = [reaction_id]
        for sbml_file in sorted(all_files):
            if reaction_id in [rxn.id for rxn in reactions[sbml_file]]:
                reaction_presents.append(1)
            else:
                reaction_presents.append(0)
            if reaction_id in reactions[sbml_file]:
                species_reaction = reactions[sbml_file].get_by_id(reaction_id)
                if 'GENE_ASSOCIATION' in species_reaction.notes:
                    ga_for_gbr = species_reaction.notes['GENE_ASSOCIATION']
                    ga_for_gbr = re.sub(r" or " , "|", ga_for_gbr)
                    ga_for_gbr = re.sub(r" and " , "&", ga_for_gbr)
                    ga_for_gbr = re.sub(r"\s" , "", ga_for_gbr)
                    if re.findall("\||&", ga_for_gbr):
                        to_compare_ga_subsets = list(compile_input(ga_for_gbr))
                        genes = []
                        for to_compare_subset in to_compare_ga_subsets:
                            for gene in to_compare_subset:
                                genes.append(gene)
                    else:
                        genes  = [ga_for_gbr.replace('(', '').replace(')', '')]
                    reaction_genes.append(';'.join(genes))
                else:
                    reaction_genes.append('')
            else:
                reaction_genes.append('')

        row = row + reaction_presents + reaction_genes
        row.append(all_reactions[reaction_id].reaction)

        reaction_file_rows.append(row)

    with open(reaction_file, 'w') as output_reaction:
        csvwriter = csv.writer(output_reaction, delimiter='\t')
        csvwriter.writerow(['reaction', *species_columns, *gene_columns, '_formula'])
        csvwriter.writerows(reaction_file_rows)

    compounds_file = output_folder + '/metabolites.tsv'
    compounds_rows = []

    for compound_id in all_compounds:
        row = [compound_id]
        for sbml_file in sorted(all_files):
            if compound_id in compounds[sbml_file]:
                row.append(1)
            else:
                row.append(0)
        compounds_rows.append(row)

    with open(compounds_file, 'w') as output_compound:
        csvwriter = csv.writer(output_compound, delimiter='\t')
        csvwriter.writerow(['metabolite', *sorted(all_files)])
        csvwriter.writerows(compounds_rows)




[docs]
def compare_sbml(sbml1_path, sbml2_path):
    """
    Compare 2 sbml, print nb of metabolites and reactions.
    If reaction missing print reaction id, and reaction formula.

    Parameters
    ----------
    sbml1_path: str
        path to the first sbml file to compare
    sbml2_path: str
        path to the second sbml file to compare
    """

    if not os.path.exists(sbml1_path):
        raise FileNotFoundError("No SBML file or directory (sbml1_path) accessible at " + sbml1_path)

    if not os.path.exists(sbml2_path):
        raise FileNotFoundError("No SBML file or directory (sbml2_path) accessible at " + sbml2_path)

    sbml_1 = read_sbml_model(sbml1_path)
    sbml_2 = read_sbml_model(sbml2_path)
    
    print("sbml1:")
    print("metabolites: %s" %(len(sbml_1.metabolites)))
    print("reactions: %s" %(len(sbml_1.reactions)))
    print("sbml2:")
    print("metabolites: %s" %(len(sbml_2.metabolites)))
    print("reactions: %s" %(len(sbml_2.reactions)))
    not_in1 = [i for i in sbml_2.reactions if i not in sbml_1.reactions]
    print("reactions not in sbml1: %s" %len(not_in1))
    for i in not_in1:
        print("\t%s" %i)
    not_in2 = [i for i in sbml_1.reactions if i not in sbml_2.reactions]
    print("reactions not in sbml2: %s" %len(not_in2))
    for j in not_in2:
        print("\t%s" %j)
    
    all_diff = set()
    for rxn1 in sbml_1.reactions:
        rxn_id = rxn1.id
        try:
            rxn2 = sbml_2.reactions.get_by_id(rxn_id)
            same_cpd, same_rev =  compare_rxn(rxn1, rxn2)
            if rxn_id not in all_diff:
                if not same_cpd:
                    print("%s use different species" %rxn_id)
                if not same_rev:
                    print("%s use different reversibility" %rxn_id)
                all_diff.add(rxn_id)
        except KeyError:
            pass
                
    for rxn2 in sbml_2.reactions:
        rxn_id = rxn2.id
        try:
            rxn1 = sbml_1.reactions.get_by_id(rxn_id)
            same_cpd, same_rev =  compare_rxn(rxn1, rxn2)
            if rxn_id not in all_diff:
                if not same_cpd:
                    print("%s use different species" %rxn_id)
                if not same_rev:
                    print("%s use different reversibility" %rxn_id)
                all_diff.add(rxn_id)
        except KeyError:
            pass



[docs]
def compare_rxn(rxn1,rxn2):
    """
    compare two cobra reaction object and return (same_cpd, same_rev)
    same_cpd: bool, if true means same compounds consumed and produced
    same_reve: bool, if true means same direction of reaction (reversible or not)

    Parameters
    ----------
    rxn1: cobra.model.reaction
        reaction as cobra object
    rxn2: cobra.model.reaction
        reaction as cobra object

    Returns
    -------
    tuple:
        (same_cpd (bool), same_rev (bool))
    """
    same_cpd, same_rev = False, False
    rxn1_dict,rxn2_dict = {}, {}
    for k,v in list(rxn1.metabolites.items()):
        rxn1_dict[k.id] = v
    for k,v in list(rxn2.metabolites.items()):
        rxn2_dict[k.id] = v
    if rxn1_dict == rxn2_dict: 
        same_cpd = True
    if rxn1.reversibility == rxn2.reversibility:

       same_rev = True

    return (same_cpd, same_rev)