Source code for padmet.utils.exploration.convert_sbml_db

# -*- coding: utf-8 -*-
r"""
Description:
    This tool is use the MetaNetX database to check or convert a sbml. Flat files
    from MetaNetx are required to run this tool. They can be found in the aureme workflow
    or from the MetaNetx website.
    To use the tool set:
        mnx_folder= the path to a folder containing MetaNetx flat files.
        the files must be named as 'reac_xref.tsv' and 'chem_xref.tsv'
        or set manually the different path of the flat files with:
            mnx_reac= path to the flat file for reactions
            
            mnx_chem= path to the flat file for chemical compounds (species)

    To check the database used in a sbml:
        to check all element of sbml (reaction and species) set:
            to--map=all
        to check only reaction of sbml set:
            to--map=reaction
        to check only species of sbml set:
            to--map=species

    To map a sbml and obtain a file of mapping ids to a given database set:
        to-map:
            as previously explained
        db_out:
            the name of the database target: ['metacyc', 'bigg', 'kegg'] only
        output:
            the path to the output file
        
        For a given sbml using a specific database.
        
        Return a dictionnary of mapping.
        
        the output is a file with line = reaction_id/or species in sbml, reaction_id/species in db_out database
        
        ex:
            For a sbml based on kegg database, db_out=metacyc: the output file will contains for ex:
        R02283 ACETYLORNTRANSAM-RXN

::

    usage:
        padmet convert_sbml_db --mnx_reac=FILE --mnx_chem=FILE --sbml=FILE --to-map=STR [-v]
        padmet convert_sbml_db --mnx_folder=DIR --sbml=FILE --to-map=STR [-v]
        padmet convert_sbml_db --mnx_folder=DIR --sbml=FILE --output=FILE --db_out=ID --to-map=STR [-v]
        padmet convert_sbml_db --mnx_reac=FILE --mnx_chem=FILE --sbml=FILE --output=FILE --db_out=ID --to-map=STR [-v]

    options:
        -h --help     Show help.
        --to-map=STR     select the part of the sbml to check or convert, must be in ['all', 'reaction', 'species']
        --mnx_reac=FILE     path to the MetaNetX file for reactions
        --mnx_chem=FILE     path to the MetaNetX file for compounds
        --sbml=FILE     path to the sbml file to convert
        --output=FILE     path to the file containing the mapping, sep = "\t"
        --db_out=FILE     id of the output database in ["BIGG","METACYC","KEGG"]
        -v     verbose.
"""
import docopt
import libsbml
import os

from padmet.utils.sbmlPlugin import get_all_decoded_version



[docs]
def command_help():
    """
    Show help for analysis command.
    """
    print(docopt.docopt(__doc__))




[docs]
def convert_sbml_db_cli():
    args = docopt.docopt(__doc__, argv=command_args)
    verbose = args["-v"]
    to_map = args["--to-map"]
    mnx_folder = args["--mnx_folder"]
    mnx_reac_file = args["--mnx_reac"]
    mnx_chem_file = args["--mnx_chem"]
    sbml_file = args["--sbml"]

    if args["--db_out"]:
        db_out = args["--db_out"].upper()
        output = args["--output"]
        map_sbml(sbml_file, to_map, db_out, output, verbose, mnx_reac_file, mnx_chem_file, mnx_folder)
    else:
        db_select, db_found = check_sbml_db(sbml_file, to_map, verbose, mnx_reac_file, mnx_chem_file, mnx_folder)
        print("Best matching database: %s" %db_select)
        print(db_found)




[docs]
def check_sbml_db(sbml_file, to_map, verbose = False, mnx_reac_file = None, mnx_chem_file = None, mnx_folder = None):
    """
    Check sbml database of a given sbml.

    Parameters
    ----------
    sbml_file: str
        path to the sbml file to convert
    to_map: str
        select the part of the sbml to check must be in ['all', 'reaction', 'species']
    verbose: bool
        if true: more info during process
    mnx_reac_file: str
        path to the flat file for reactions (can be None if given mnx_folder)
    mnx_chem_file: str
        path to the flat file for chemical compounds (species) (can be None if given mnx_folder)
    mnx_folder: str
        the path to a folder containing MetaNetx flat files

    Returns
    -------
    tuple:
        (name of the best matching database, dict of matching)
    """
    if to_map not in ["all", "reaction", "species"]:
        raise ValueError("%s must be in [all, reaction, species]" %to_map)
    if mnx_folder:
        mnx_reac_file = os.path.join(mnx_folder, "reac_xref.tsv")
        mnx_chem_file = os.path.join(mnx_folder, "chem_xref.tsv")

    if mnx_reac_file:
        if not os.path.exists(mnx_reac_file):
            raise FileNotFoundError("No MetaNetX file for reactions accessible at " + mnx_reac_file)

    if mnx_chem_file:
        if not os.path.exists(mnx_chem_file):
            raise FileNotFoundError("No MetaNetX file for compounds accessible at " + mnx_chem_file)

    if not os.path.exists(sbml_file):
        raise FileNotFoundError("No SBML file accessible at " + sbml_file)

    reader = libsbml.SBMLReader()
    document = reader.readSBML(sbml_file)
    for i in range(document.getNumErrors()):
        print(document.getError(i).getMessage())
    model = document.getModel()
    listOfReactions = model.getListOfReactions()
    listOfSpecies = model.getListOfSpecies()

    unknown_db = "Unknown"
    db_found = {unknown_db: 0}
    if to_map == "all":
        db_found["total_reaction_species"] = 0
    if verbose:
        print("Check from which database is this sbml:")
    if to_map in ["all", "reaction"]:
        db_found['total_reaction'] = 0
        with open(mnx_reac_file, "r") as f:
            dict_reaction_id_db = dict([(line.split("\t")[0].split(":")[1], line.split("\t")[0].split(":")[0])  for line in f.read().splitlines()
            if not line.startswith("#") and ":" in line.split("\t")[0]])

        for reaction in listOfReactions:
            reaction_id = reaction.id
            db_found['total_reaction'] += 1
            if to_map == "all":
                db_found["total_reaction_species"] += 1
            all_reaction_id_decoded = get_all_decoded_version(reaction_id, "reaction")
   
            for reaction_id_decoded in all_reaction_id_decoded:
                db_match = dict_reaction_id_db.get(reaction_id_decoded, unknown_db)
                if db_match != unknown_db:
                    break
            if verbose:
                if db_match != unknown_db:
                    print("reaction: original id: %s; decoded id:%s; db_match:%s" %(reaction_id, reaction_id_decoded, db_match))
                else:
                    print("reaction: original id:%s; all decoded id:%s; no db found" %(reaction_id, all_reaction_id_decoded))
            try:
                db_found[db_match] += 1
            except KeyError:
                db_found[db_match] = 1

    if to_map in ["all", "species"]:
        db_found["total_species"] = 0
        with open(mnx_chem_file, "r") as f:
            dict_species_id_db = dict([(line.split("\t")[0].split(":")[1], line.split("\t")[0].split(":")[0])  for line in f.read().splitlines()
            if not line.startswith("#") and ":" in line.split("\t")[0]])

        for species in listOfSpecies:
            species_id = species.id
            db_found['total_species'] += 1
            if to_map == "all":
                db_found["total_reaction_species"] += 1

            all_species_id_decoded = get_all_decoded_version(species_id, "species")
    
            for species_id_decoded in all_species_id_decoded:
                db_match = dict_species_id_db.get(species_id_decoded, unknown_db)
                if db_match != unknown_db:
                    break
            if verbose:
                if db_match != unknown_db:
                    print("species: original id: %s; decoded id:%s; db_match:%s" %(species_id, species_id_decoded, db_match))
                else:
                    print("species: original id:%s; all decoded id:%s; no db found" %(species_id, all_species_id_decoded))
            try:
                db_found[db_match] += 1
            except KeyError:
                db_found[db_match] = 1

    if to_map == "all":
        db_select = [k for k, v in list(db_found.items())
                     if v == max([j for i,j in list(db_found.items()) if i != 'total_reaction_species'])][0]
    elif to_map == "reaction":
        db_select = [k for k, v in list(db_found.items())
                     if v == max([j for i,j in list(db_found.items()) if i != 'total_reaction'])][0]
    elif to_map == "species":
        db_select = [k for k, v in list(db_found.items())
                     if v == max([j for i,j in list(db_found.items()) if i != 'total_species'])][0]

    return (db_select, db_found)



[docs]
def map_sbml(sbml_file, to_map, db_out, output, verbose = False, mnx_reac_file = None, mnx_chem_file = None, mnx_folder = None):
    """
    map a sbml and obtain a file of mapping ids to a given database.

    Parameters
    ----------
    sbml_file: str
        path to the sbml file to convert
    to_map: str
        select the part of the sbml to check must be in ['all', 'reaction', 'species']
    db_out: str
        the name of the database target: ['metacyc', 'bigg', 'kegg'] only
    output: str
        path to the file containing the mapping, sep = "\t"
    verbose: bool
        if true: more info during process
    mnx_reac_file: str
        path to the flat file for reactions (can be None if given mnx_folder)
    mnx_chem_file: str
        path to the flat file for chemical compounds (species) (can be None if given mnx_folder)
    mnx_folder: str
        the path to a folder containing MetaNetx flat files

    Returns
    -------
    tuple:
        (name of the best matching database, dict of matching)

    """
    map_from_cpd = False
    if to_map not in ["all", "reaction", "species"]:
        raise ValueError("%s must be in [all, reaction, species]" %to_map)
    if mnx_folder:
        mnx_reac_file = os.path.join(mnx_folder, "reac_xref.tsv")
        mnx_chem_file = os.path.join(mnx_folder, "chem_xref.tsv")

    if mnx_reac_file:
        if not os.path.exists(mnx_reac_file):
            raise FileNotFoundError("No MetaNetX file for reactions accessible at " + mnx_reac_file)

    if mnx_chem_file:
        if not os.path.exists(mnx_chem_file):
            raise FileNotFoundError("No MetaNetX file for compounds accessible at " + mnx_chem_file)

    if not os.path.exists(sbml_file):
        raise FileNotFoundError("No SBML file accessible at " + sbml_file)

    reader = libsbml.SBMLReader()
    document = reader.readSBML(sbml_file)
    for i in range(document.getNumErrors()):
        print(document.getError(i).getMessage())
    model = document.getModel()
    listOfReactions = model.getListOfReactions()
    listOfSpecies = model.getListOfSpecies()
                
    if db_out not in ["BIGG","METACYC","KEGG"]:
        raise ValueError('Please choose a database id in ["BIGG","METACYC","KEGG"]')


    #k: orignial id, v = ref id
    dict_sbml_id_mapped_id = {}
    if to_map in ["all", "reaction"]:
        #For reactions: k = MNXid, v = {k=db_id,v=[list of ids]}
        mnx_reac_dict = mnx_reader(mnx_reac_file, db_out)
        reaction_with_more_one_mapping = 0
        count_nb_stric_reac_mapped = 0

        for reaction in listOfReactions:
            reaction_id = reaction.id
            match_ids = None
            all_reaction_id_decoded = get_all_decoded_version(reaction_id, "reaction")
            for reaction_id_decoded in all_reaction_id_decoded:
                if not match_ids:
                    #first check in intern dict mapping
                    match_ids = intern_mapping(reaction_id_decoded, db_out, "reaction")
                    if match_ids:
                        dict_sbml_id_mapped_id[reaction_id] = match_ids
                        count_nb_stric_reac_mapped += 1
                        if verbose:
                            print("reaction: original id:%s; decoded id:%s; match_with:%s from intern mapping" %(reaction_id, reaction_id_decoded, match_ids))
                    #check if in mnx_reac_dict
                    else:
                        match_ids = get_from_mnx(mnx_reac_dict, reaction_id_decoded, db_out)
                        if match_ids:
                            if len(match_ids) > 1: 
                                reaction_with_more_one_mapping += 1
                                if verbose:
                                    print("reaction: original id:%s; decoded id:%s; More than one mapping:%s" %(reaction_id, reaction_id_decoded, match_ids))
                            else:
                                dict_sbml_id_mapped_id[reaction_id] = match_ids[0]
                                count_nb_stric_reac_mapped += 1
                                if verbose:
                                    print("reaction: original id:%s; decoded id:%s; match_with:%s from MNX mapping" %(reaction_id, reaction_id_decoded, match_ids[0]))
            if verbose and not match_ids:
                print("reaction: original id:%s; all decoded id:%s; not mapped" %(reaction_id, all_reaction_id_decoded))

    if to_map in ["all", "species"]:
        #For species: k = MNXid, v = {k=db_id,v=[list of ids]} 
        mnx_chem_dict = mnx_reader(mnx_chem_file, db_out)
        species_with_more_one_mapping = 0
        count_nb_stric_species_mapped = 0
  
        for species in listOfSpecies:
            species_id = species.id
            match_ids = None
            all_species_id_decoded = get_all_decoded_version(species_id, "species")
    
            for species_id_decoded in all_species_id_decoded:
                #first check in intern dict mapping
                match_ids = intern_mapping(species_id_decoded, db_out, "species")
        
                if match_ids:
                    dict_sbml_id_mapped_id[species_id] = match_ids
                    count_nb_stric_species_mapped += 1
                    if verbose:
                        print("species: original id:%s; decoded id:%s; match_with:%s from intern mapping" %(species_id, species_id_decoded, match_ids))
                    break
                #check if in mnx_chem_dict
                else:
                    match_ids = get_from_mnx(mnx_chem_dict, species_id_decoded, db_out)
                    if match_ids:
                        if len(match_ids) > 1:
                            species_with_more_one_mapping += 1
                            if verbose:
                                print("species: original id:%s; decoded id:%s; More than one mapping:%s" %(species_id, species_id_decoded, match_ids))
                        else: 
                            dict_sbml_id_mapped_id[species_id] = match_ids[0]
                            count_nb_stric_species_mapped += 1
                            if verbose:
                                print("species: original id:%s; decoded id:%s; match_with:%s from MNX mapping" %(species_id, species_id_decoded, match_ids))
            if verbose and not match_ids:
                print("species: original id:%s; all decoded id:%s; not mapped" %(species_id, all_species_id_decoded))

    if map_from_cpd:
        reaction_mapped_with_cpds = []

        #for all non mapped rxn, check if able to map all speices
        for sbml_rxn in [i for i in listOfReactions if i.id not in list(dict_sbml_id_mapped_id.keys())]:
            all_cpds = set([r.getSpecies() for r in sbml_rxn.getListOfReactants()] + [r.getSpecies() for r in sbml_rxn.getListOfProducts()])
            match_cpd_in_rxn = set([cpd_id for cpd_id in all_cpds if cpd_id in list(dict_sbml_id_mapped_id.keys())])
    
            if len(match_cpd_in_rxn) == len(all_cpds):
                reaction_mapped_with_cpds.append(sbml_rxn.id)

    if verbose:
        print("#######")
        if to_map in ["all", "reaction"]:
            print("Mapped reactions: %s/%s" %(count_nb_stric_reac_mapped,len(listOfReactions)))
            print("Reactions with more than one mapping: %s" %reaction_with_more_one_mapping)
        if to_map in ["all", "species"]:
            print("Mapped species: %s/%s" %(count_nb_stric_species_mapped,len(listOfSpecies)))
            print("Species with more than one mapping: %s" %species_with_more_one_mapping)

        if map_from_cpd:
            print("Mapped reactions from species: %s" %(len(reaction_mapped_with_cpds)))
            for i in reaction_mapped_with_cpds:
                print("\t%s" %i)
            print("Total reactions mapped:%s/%s" %(count_nb_stric_reac_mapped+len(reaction_mapped_with_cpds),len(listOfReactions)))
        print("#######")

    with open(output, 'w') as f:
        for k,v in list(dict_sbml_id_mapped_id.items()):
            f.write(k+"\t"+v+"\n")



[docs]
def get_from_mnx(mnx_dict, element_id, db_out):
    """
    #TODO
    """
    match_ids = None
    for map_dict in list(mnx_dict.values()):
        #print(map_dict)
        all_element_id = []
        [all_element_id.extend(i) for i in list(map_dict.values())]
        if element_id in all_element_id:
            match_ids = map_dict[db_out]
            break
    return match_ids

    


[docs]
def mnx_reader(input_file, db_out):
    """
    #TODO
    """
    with open(input_file, "r") as f:
        dataInArray = [line.split("\t")[:2] for line in f.read().splitlines() if not line.startswith("#")]

    #print list_data[:10]
    mnx_dict = dict()
    all_mnxs = set([k for v,k in dataInArray])
    mnx_dict = dict([(k, dict()) for k in all_mnxs])
    #print a[:10]
    #print mnx_dict.items()[:10]
    for v,k in dataInArray:
        try:
            db, _id = v.split(":")
        except ValueError:
            db = v.split(":")[0]
            _id = ":".join(v.split(":")[1:])
        db = db.upper()
        try:
            mnx_dict[k][db].append(_id)
        except KeyError:
            mnx_dict[k][db] = [_id]
    #clean ids not in db_out
    for k,v in list(mnx_dict.items()):
        if db_out not in list(v.keys()) or len(list(v.keys())) == 1:
            mnx_dict.pop(k)
    """
    for v in mnx_dict.values():
        try:
            rxn_db_out = v[db_out]
            all_mapp_rxn = []
            [all_mapp_rxn.extend(i) for i in v.values()]
    """
    return mnx_dict




[docs]
def intern_mapping(id_to_map, db_out, _type):
    """
    #TODO
    """
    
    intern_reac_dict = {
    "UNIQ_ID_1":{"METACYC":["RXN-6382"],"KEGG":["R00904"],"UNKNOWN":["APor"]},
    "UNIQ_ID_2":{"METACYC":["CARNOSINE-SYNTHASE-RXN"],"UNKNOWN":["HAL"]},
    "UNIQ_ID_3":{"METACYC":["ALANINE-AMINOTRANSFERASE-RXN"],"KEGG":["R00258"]},
    "UNIQ_ID_4":{"METACYC":["ASPAMINOTRANS-RXN"],"KEGG":["R00355"],"BIGG":["ASPATh"]},
    "UNIQ_ID_5":{"METACYC":["PHEAMINOTRANS-RXN"],"KEGG":["R00694"],"BIGG":["POAT","POATh","POATm"]},
    "UNIQ_ID_6":{"METACYC":["ORNCARBAMTRANSFER-RXN"],"KEGG":["R01398"],"BIGG":["OCT","OCTh","OCTm"]},
    "UNIQ_ID_7":{"METACYC":["3PGAREARR-RXN"],"KEGG":["R01518"],"BIGG":["PGM","PGMf","PGMm"]},
    "UNIQ_ID_8":{"METACYC":["6PGLUCONDEHYDROG-RXN"],"KEGG":["R01528"],"BIGG":["PGDHh"]},
    "UNIQ_ID_9":{"METACYC":["4-HYDROXY-2-KETOPIMELATE-LYSIS-RXN"],"KEGG":["R01645"]},
    "UNIQ_ID_10":{"METACYC":["AMINEPHEN-RXN"],"KEGG":["R02613"]},
    "UNIQ_ID_11":{"METACYC":["SHIKIMATE-5-DEHYDROGENASE-RXN"],"KEGG":["R02413"]},
    "UNIQ_ID_12":{"METACYC":["CARBODEHYDRAT-RXN"],"BIGG":["HCO3E","HCO3Em","HCO3Ehi"]}
    }
    
    intern_chem_dict = {
    "UNIQ_ID_1":{"METACYC":["WATER"],"KEGG":["C00001"],"BIGG":["h2o"],"UNKNOWN":["H2O"]},
    "UNIQ_ID_2":{"METACYC":["2-KETOGLUTARATE"],"BIGG":["akg"],"KEGG":["C00026"]},
    "UNIQ_ID_3":{"METACYC":["Pi"],"BIGG":["pi"],"KEGG":["C00009"]},
    "UNIQ_ID_4":{"METACYC":["OXALACETIC_ACID"],"BIGG":["oaa"],"KEGG":["C00036"]},

    "UNIQ_ID_5":{"METACYC":["HS"],"BIGG":["h2s"]},
    "UNIQ_ID_6":{"METACYC":["NITRATE"],"BIGG":["no3"]},
    "UNIQ_ID_7":{"METACYC":["HEME_O"],"BIGG":["heme0"]},
    "UNIQ_ID_8":{"METACYC":["FORMATE"],"BIGG":["for"]},
    "UNIQ_ID_9":{"METACYC":["GLN-tRNAs"],"BIGG":["trnagln"]},
    "UNIQ_ID_10":{"METACYC":["CO-A"],"BIGG":["coa"]},
    "UNIQ_ID_11":{"METACYC":["ADENOSINE"],"BIGG":["adn"]},
    "UNIQ_ID_12":{"METACYC":["ADENINE"],"BIGG":["ade"]},
    "UNIQ_ID_13":{"METACYC":["CYTIDINE"],"BIGG":["cytd"]},
    "UNIQ_ID_14":{"METACYC":["CYTIDINE"],"BIGG":["dtdp"]},
    "UNIQ_ID_15":{"METACYC":["DIHYDRO-NEO-PTERIN"],"BIGG":["dhnpt"]},
    "UNIQ_ID_16":{"METACYC":["SUCROSE-6P"],"BIGG":["suc6p"]},
    "UNIQ_ID_17":{"METACYC":["ALKYL-SN-GLYCERO-PHOSPHOETHANOLAMINE"],"BIGG":["g3pe"]},
    "UNIQ_ID_18":{"METACYC":["Retinols"],"BIGG":["retinol"]},
    "UNIQ_ID_19":{"METACYC":["PHENYL-PYRUVATE"],"BIGG":["phpyr"]},
    "UNIQ_ID_20":{"METACYC":["CPD-6082"],"BIGG":["bamppald"]},
    "UNIQ_ID_21":{"METACYC":["S-3-HYDROXYBUTANOYL-COA"],"BIGG":["3hbcoa"]},
    "UNIQ_ID_22":{"METACYC":["Thiopurines"],"BIGG":["6mpur"]},
    "UNIQ_ID_23":{"METACYC":["N-ACETYL-D-GLUCOSAMINE-6-P"],"BIGG":["acgam6p"]},
    "UNIQ_ID_24":{"METACYC":["CHLOROPHYLL-B"],"BIGG":["chlb"]},
    "UNIQ_ID_25":{"METACYC":["FORMALDEHYDE"],"BIGG":["fald"]},
    "UNIQ_ID_26":{"METACYC":["GLC-1-P"],"BIGG":["g1p"]},
    "UNIQ_ID_27":{"METACYC":["NA+"],"BIGG":["na1"]},

    "UNIQ_ID_28":{"METACYC":["ADP"],"KEGG":["C00008"]},
    "UNIQ_ID_29":{"METACYC":["L-DELTA1-PYRROLINE_5-CARBOXYLATE"],"KEGG":["C03912"]},
    "UNIQ_ID_30":{"METACYC":["AMMONIA"],"KEGG":["C00014"]},
    "UNIQ_ID_31":{"METACYC":["PPI"],"KEGG":["C00013"]},
    "UNIQ_ID_32":{"METACYC":["NADP"],"KEGG":["C00006"]},
    "UNIQ_ID_33":{"METACYC":["NADPH"],"KEGG":["C00005"]},

    "UNIQ_ID_34":{"METACYC":["GLY"],"UNKNOWN":["Glycine"]},
    "UNIQ_ID_35":{"METACYC":["METOH"],"UNKNOWN":["Methanol"]},
    "UNIQ_ID_36":{"METACYC":["PPI"],"UNKNOWN":["Pyrophosphate"]},
    }


    if _type == "reaction":
        for mapp_dict in list(intern_reac_dict.values()):
            all_reaction_id = []
            [all_reaction_id.extend(i) for i in list(mapp_dict.values())]
            if id_to_map in all_reaction_id:
                return mapp_dict.get(db_out, [None])[0]

    elif _type == "species":
        for mapp_dict in list(intern_chem_dict.values()):
            all_species_id = []
            [all_species_id.extend(i) for i in list(mapp_dict.values())]
            if id_to_map in all_species_id:
                return mapp_dict.get(db_out, [None])[0]

    return None