Source code for padmet.utils.exploration.compare_padmet

# -*- coding: utf-8 -*-
"""
Description:
    Compare 1-n padmet files and create an output folder with the following files:
    genes.tsv:
        fieldnames = [gene, padmet_a, padmet_b, padmet_a_rxn_assoc, padmet_b_rxn_assoc]
        line = [gene-a, 1 (if in padmet_a), 1 (if in padmet_b), 'rxn-1;rxn-2' (reactions associated with gene-a in padmet_a), 'rxn-2']
    reactions.tsv:
        fieldnames = [reaction, padmet_a, padmet_b, padmet_a_genes_assoc, padmet_b_genes_assoc, padmet_a_formula, padmet_b_formula]
        line = [rxn-1, 1 (if in padmet_a), 1 (if in padmet_b), 'gene-a;gene-b', 'gene-a', 'cpd-1 + cpd-2 => cpd-3', 'cpd-1 + cpd-2 => cpd-3']
    pathways.tsv:
        fieldnames = [pathway, padmet_a_completion_rate, padmet_b_completion_rate, padmet_a_rxn_assoc, padmet_b_rxn_assoc]
        line = [pwy-a, 0.80, 0.30, 'rxn-a;rxn-b', 'rxn-a']
    metabolites.tsv:
        fieldnames = [metabolite, padmet_a_rxn_consume, padmet_a_rxn_produce, padmet_b_rxn_consume, padmet_b_rxn_produce]
        line = [cpd-1, rxn-1, '', rxn-1, '']

::

    usage:
        padmet compare_padmet --padmet=FILES/DIR --output=DIR [--padmetRef=FILE] [--cpu INT] [-v]

    option:
        -h --help    Show help.
        --padmet=FILES/DIR    pathname of the padmet files, separate the files with ',', e.g. /path/padmet1.padmet,/path/padmet2.padmet, OR a folder
        --output=DIR    pathname of the output folder
        --padmetRef=FILE    pathname of the reference database in padmet format
        --cpu INT    number of CPUs to use in multiprocessing
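
    example:
        # illustrative invocation; the file paths and the reference database below are hypothetical
        padmet compare_padmet --padmet=/path/padmet1.padmet,/path/padmet2.padmet --output=compare_output --padmetRef=/path/metacyc.padmet --cpu 2 -v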
"""
import docopt
import csv
import os

from multiprocessing import Pool
from padmet.classes import PadmetRef, PadmetSpec


def command_help():
    """
    Show help for the analysis command.
    """
    print(docopt.docopt(__doc__))


def compare_padmet_cli(command_args):
    args = docopt.docopt(__doc__, argv=command_args)
    output = args["--output"]
    verbose = args["-v"]
    if args["--padmetRef"]:
        padmetRef = PadmetRef(args["--padmetRef"])
    else:
        padmetRef = None
    padmet_path = args["--padmet"]
    number_cpu = args["--cpu"]

    compare_padmet(padmet_path, output, padmetRef, verbose, number_cpu)


def extract_information_padmet(file_path, padmetRef, verbose):
    count_elem, dict_genes, dict_rxns, dict_pwys, dict_cpds = {}, {}, {}, {}, {}
    #count_elem: k = basename_file, v = dict(k' in ['genes','reactions','pathways','metabolites'], v' = count of elements)
    #ex: count_elem = {'file_1': {'genes': 1500, 'reactions': 1000, 'pathways': 1000, 'metabolites': 1000}}
    #dict_genes: k = gene_id, v = dict(k' = basename_file, v' = ';'-joined reactions associated)
    #ex: dict_genes = {'gene_a': {'file_1': 'rxn_1;rxn_2'}}
    #dict_rxns: k = rxn_id, v = dict(k' = basename_file, v' = dict(k'' in ['genes_associated','formula']))
    #ex: dict_rxns = {'rxn_a': {'file_1': {'genes_associated': 'gene_a;gene_b', 'formula': '1 x[c] + 2 y[c] => 3 z[c]'}}}
    #dict_pwys: k = pwy_id, v = dict(k' = basename_file, v' = dict(k'' in ['ratio','rxn_associated']))
    #ex: dict_pwys = {'pwy_a': {'file_1': {'ratio': '2/9', 'rxn_associated': 'rxn_a;rxn_b'}}}
    #dict_cpds: k = cpd_id, v = dict(k' = basename_file, v' = dict(k'' in ['rxn_consume','rxn_produce']))
    #ex: dict_cpds = {'cpd_1': {'file_1': {'rxn_consume': 'rxn_1', 'rxn_produce': ''}}}
    padmet = PadmetSpec(file_path)
    basename_file = os.path.basename(file_path).replace(".padmet","")
    log_padmet = ""
    log_padmet += "reading %s\n" %basename_file

    count_elem[basename_file] = {"genes":0, "reactions":0, "pathways":0, "metabolites":0}

    #genes
    all_genes = set([node.id for node in list(padmet.dicOfNode.values()) if node.type == "gene"])
    count_elem[basename_file]["genes"] = len(all_genes)
    log_padmet += "\t%s genes...\n" %len(all_genes)
    for gene_id in all_genes:
        #get for each gene the reactions associated to it and add them in dict_genes: dict_genes[gene_id] = {padmet_file_name: ';'-joined reactions associated}
        rxn_assoc = set([rlt.id_in for rlt in padmet.dicOfRelationOut.get(gene_id,[]) if rlt.type == "is_linked_to"])
        try:
            dict_genes[gene_id][basename_file] = ";".join(rxn_assoc)
        except KeyError:
            dict_genes[gene_id] = {basename_file: ";".join(rxn_assoc)}

    #reactions
    all_rxns = set([node.id for node in list(padmet.dicOfNode.values()) if node.type == "reaction"])
    count_elem[basename_file]["reactions"] = len(all_rxns)
    log_padmet += "\t%s reactions...\n" %len(all_rxns)
    for rxn_id in all_rxns:
        try:
            dict_rxns[rxn_id][basename_file] = {"genes_associated":"", "formula":""}
        except KeyError:
            dict_rxns[rxn_id] = {basename_file: {"genes_associated":"", "formula":""}}
        #get for each rxn the genes associated to it and add them in dict_rxns: dict_rxns[rxn_id] = {padmet_file_name: {'genes_associated': ';'-joined genes}}
        genes_assoc = set([rlt.id_out for rlt in padmet.dicOfRelationIn.get(rxn_id,[]) if rlt.type == "is_linked_to"])
        dict_rxns[rxn_id][basename_file]["genes_associated"] = ";".join(genes_assoc)
        #create formula
        direction = padmet.dicOfNode[rxn_id].misc["DIRECTION"][0]
        if direction == "UNKNOWN":
            direction = " =>/<=> "
        elif direction == "REVERSIBLE":
            direction = " <=> "
        elif direction == "LEFT-TO-RIGHT":
            direction = " => "
        #reactants and products: list of str: ['stoichio cpd_id[cpd_compartment]', ...]
        reactants = [rlt.misc["STOICHIOMETRY"][0]+" "+rlt.id_out+"["+rlt.misc["COMPARTMENT"][0]+"]" for rlt in padmet.dicOfRelationIn.get(rxn_id, []) if rlt.type == "consumes"]
        products = [rlt.misc["STOICHIOMETRY"][0]+" "+rlt.id_out+"["+rlt.misc["COMPARTMENT"][0]+"]" for rlt in padmet.dicOfRelationIn.get(rxn_id, []) if rlt.type == "produces"]
        #join each list with ' + ' and the direction in the center
        formula = " + ".join(reactants)+direction+" + ".join(products)
        dict_rxns[rxn_id][basename_file]["formula"] = formula

    #pathways
    all_pwys = set()
    for rxn_id in all_rxns:
        pwy_in = set([rlt.id_out for rlt in padmet.dicOfRelationIn[rxn_id] if rlt.type == "is_in_pathway"])
        all_pwys.update(pwy_in)
    count_elem[basename_file]["pathways"] = len(all_pwys)
    log_padmet += "\t%s pathways...\n" %len(all_pwys)
    for pwy_id in all_pwys:
        #get for each pwy the reactions associated to it and add them in dict_pwys: dict_pwys[pwy_id] = {padmet_file_name: {'ratio': ..., 'rxn_associated': ...}}
        try:
            dict_pwys[pwy_id][basename_file] = {"ratio":"", "rxn_associated":""}
        except KeyError:
            dict_pwys[pwy_id] = {basename_file: {"ratio":"", "rxn_associated":""}}
        in_rxns = set([rlt.id_in for rlt in padmet.dicOfRelationOut.get(pwy_id,[]) if rlt.type == "is_in_pathway"])
        if padmetRef:
            all_rxns = set([rlt.id_in for rlt in padmetRef.dicOfRelationOut.get(pwy_id,[]) if rlt.type == "is_in_pathway"])
            dict_pwys[pwy_id][basename_file]["ratio"] = str(len(in_rxns))+"/"+str(len(all_rxns))
        else:
            dict_pwys[pwy_id][basename_file]["ratio"] = "NA"
        dict_pwys[pwy_id][basename_file]["rxn_associated"] = ";".join(in_rxns)

    #metabolites
    all_cpd = set([rlt.id_out for rlt in padmet.getAllRelation() if rlt.type in ["consumes","produces"]])
    count_elem[basename_file]["metabolites"] = len(all_cpd)
    log_padmet += "\t%s metabolites...\n" %len(all_cpd)
    for cpd_id in all_cpd:
        #get for each cpd the reactions consuming/producing the cpd
        rxn_consume = set([rlt.id_in for rlt in padmet.dicOfRelationOut[cpd_id] if rlt.type == "consumes"])
        if rxn_consume:
            rxn_consume = ";".join(rxn_consume)
        else:
            rxn_consume = ""
        rxn_produce = set([rlt.id_in for rlt in padmet.dicOfRelationOut[cpd_id] if rlt.type == "produces"])
        if rxn_produce:
            rxn_produce = ";".join(rxn_produce)
        else:
            rxn_produce = ""
        try:
            dict_cpds[cpd_id][basename_file] = {"rxn_consume":rxn_consume, "rxn_produce":rxn_produce}
        except KeyError:
            dict_cpds[cpd_id] = {basename_file: {"rxn_consume":rxn_consume, "rxn_produce":rxn_produce}}

    if verbose:
        print(log_padmet)

    return dict_genes, dict_rxns, dict_pwys, dict_cpds
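
# Example of a formula string built above (hypothetical identifiers and stoichiometries):
# a LEFT-TO-RIGHT reaction consuming '1 cpd-1[c]' and '2 cpd-2[c]' and producing
# '1 cpd-3[c]' yields "1 cpd-1[c] + 2 cpd-2[c] => 1 cpd-3[c]".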


def merge_dicts(element_dict, tmp_dict):
    for gene_id in tmp_dict:
        if gene_id in element_dict:
            for basename_file in tmp_dict[gene_id]:
                element_dict[gene_id][basename_file] = tmp_dict[gene_id][basename_file]
        else:
            element_dict[gene_id] = {}
            for basename_file in tmp_dict[gene_id]:
                element_dict[gene_id][basename_file] = tmp_dict[gene_id][basename_file]
    return element_dict
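
# Illustrative behaviour of merge_dicts (hypothetical data): merging
# {'gene_a': {'padmet_2': 'rxn_3'}} into {'gene_a': {'padmet_1': 'rxn_1;rxn_2'}}
# yields {'gene_a': {'padmet_1': 'rxn_1;rxn_2', 'padmet_2': 'rxn_3'}}.
# The same helper is reused below for the reaction, pathway and metabolite dictionaries.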


def compare_padmet(padmet_path, output, padmetRef=None, verbose=False, number_cpu=None):
    """
    Compare 1-n padmet files and create an output folder with the following files:
    genes.tsv:
        fieldnames = [gene, padmet_a, padmet_b, padmet_a_rxn_assoc, padmet_b_rxn_assoc]
        line = [gene-a, 1 (if in padmet_a), 1 (if in padmet_b), 'rxn-1;rxn-2' (reactions associated with gene-a in padmet_a), 'rxn-2']
    reactions.tsv:
        fieldnames = [reaction, padmet_a, padmet_b, padmet_a_genes_assoc, padmet_b_genes_assoc, padmet_a_formula, padmet_b_formula]
        line = [rxn-1, 1 (if in padmet_a), 1 (if in padmet_b), 'gene-a;gene-b', 'gene-a', 'cpd-1 + cpd-2 => cpd-3', 'cpd-1 + cpd-2 => cpd-3']
    pathways.tsv:
        fieldnames = [pathway, padmet_a_completion_rate, padmet_b_completion_rate, padmet_a_rxn_assoc, padmet_b_rxn_assoc]
        line = [pwy-a, 0.80, 0.30, 'rxn-a;rxn-b', 'rxn-a']
    metabolites.tsv:
        fieldnames = [metabolite, padmet_a_rxn_consume, padmet_a_rxn_produce, padmet_b_rxn_consume, padmet_b_rxn_produce]
        line = [cpd-1, rxn-1, '', rxn-1, '']

    Parameters
    ----------
    padmet_path: str
        pathname of the padmet files, separate the files with ',', e.g. /path/padmet1.padmet,/path/padmet2.padmet, OR a folder
    output: str
        pathname of the output folder
    padmetRef: padmet.classes.PadmetRef
        padmet containing the database of reference, needed to calculate the pathway completion rate
    verbose: bool
        if True print information
    number_cpu: int or str
        number of CPUs to use in multiprocessing
    """
    dict_genes, dict_rxns, dict_pwys, dict_cpds = {}, {}, {}, {}

    if not os.path.exists(output):
        if verbose:
            print("Creating %s" %output)
        os.makedirs(output)
    else:
        if verbose:
            print("%s already exists, old comparison output files will be overwritten" %output)

    if os.path.isdir(padmet_path):
        all_files = [os.path.join(padmet_path, f) for f in next(os.walk(padmet_path))[2]]
    else:
        all_files = padmet_path.split(",")

    if len(all_files) < 2:
        raise ValueError("You must specify at least 2 files in order to make a comparison")

    if verbose:
        print("%s padmet files to compare:" %len(all_files))
        for f in all_files:
            print("\t%s" %os.path.basename(f))

    if number_cpu:
        try:
            number_cpu_to_use = int(number_cpu)
        except ValueError:
            raise ValueError('The number of CPU must be an integer.')
    else:
        number_cpu_to_use = 1

    compare_pool = Pool(processes=number_cpu_to_use)

    multiprocessin_datas = []
    for padmet_file in all_files:
        multiprocessin_datas.append([padmet_file, padmetRef, verbose])

    result_rxns_dicts = compare_pool.starmap(extract_information_padmet, multiprocessin_datas)

    for result_rxns_dict in result_rxns_dicts:
        tmp_dict_genes = result_rxns_dict[0]
        tmp_dict_rxns = result_rxns_dict[1]
        tmp_dict_pwys = result_rxns_dict[2]
        tmp_dict_cpds = result_rxns_dict[3]
        dict_genes = merge_dicts(dict_genes, tmp_dict_genes)
        dict_rxns = merge_dicts(dict_rxns, tmp_dict_rxns)
        dict_pwys = merge_dicts(dict_pwys, tmp_dict_pwys)
        dict_cpds = merge_dicts(dict_cpds, tmp_dict_cpds)

    compare_pool.close()
    compare_pool.join()

    #create files
    all_basename_files = [os.path.basename(file_path).replace(".padmet","") for file_path in all_files]

    #genes
    #gene file header: gene_id, base_file_1, base_file_n, base_file_1_rxn_assoc (sep=;), base_file_n_rxn_assoc (sep=;)
    genes_file = os.path.join(output,"genes.tsv")
    if verbose:
        print("creating %s" %genes_file)
    with open(genes_file, 'w') as csvfile:
        fieldnames = ['gene'] + all_basename_files + [i+"_rxn_assoc (sep=;)" for i in all_basename_files]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for gene_id, dic_basename_rxn_assoc in list(dict_genes.items()):
            dict_row = {'gene': gene_id}
            for basename_file, rxn_assoc in list(dic_basename_rxn_assoc.items()):
                dict_row.update({basename_file: 1, basename_file+"_rxn_assoc (sep=;)": rxn_assoc})
            writer.writerow(dict_row)

    #reactions
    #reactions file header: rxn_id, base_file_1, base_file_n, base_file_1_genes_assoc (sep=;), base_file_n_genes_assoc (sep=;), base_file_1_formula, base_file_n_formula
    rxns_file = os.path.join(output,"reactions.tsv")
    if verbose:
        print("creating %s" %rxns_file)
    with open(rxns_file, 'w') as csvfile:
        fieldnames = ['reaction'] + all_basename_files + [i+"_genes_assoc (sep=;)" for i in all_basename_files] + [i+"_formula" for i in all_basename_files]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for rxn_id, dict_basename_data in list(dict_rxns.items()):
            dict_row = {'reaction': rxn_id}
            for basename_file, rxn_data in list(dict_basename_data.items()):
                dict_row.update({basename_file: 1, basename_file+"_genes_assoc (sep=;)": rxn_data["genes_associated"], basename_file+"_formula": rxn_data["formula"]})
            for basename_file in all_basename_files:
                if basename_file not in dict_row:
                    dict_row.update({basename_file: 0})
            writer.writerow(dict_row)

    #pathways
    #pathways file header: pwy, base_file_1_completion_rate, base_file_n_completion_rate, base_file_1_rxn_assoc (sep=;), base_file_n_rxn_assoc (sep=;)
    pwys_file = os.path.join(output,"pathways.tsv")
    if verbose:
        print("creating %s" %pwys_file)
    with open(pwys_file, 'w') as csvfile:
        fieldnames = ['pathway'] + [i+"_completion_rate" for i in all_basename_files] + [i+"_rxn_assoc (sep=;)" for i in all_basename_files]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for pwy_id, dict_basename_data in list(dict_pwys.items()):
            dict_row = {'pathway': pwy_id}
            for basename_file, pwy_data in list(dict_basename_data.items()):
                dict_row.update({basename_file+"_completion_rate": pwy_data["ratio"], basename_file+"_rxn_assoc (sep=;)": pwy_data["rxn_associated"]})
            writer.writerow(dict_row)

    #metabolites
    #metabolites file header: cpd, base_file_1_rxn_consume, base_file_n_rxn_consume, base_file_1_rxn_produce, base_file_n_rxn_produce
    cpds_file = os.path.join(output,"metabolites.tsv")
    if verbose:
        print("creating %s" %cpds_file)
    with open(cpds_file, 'w') as csvfile:
        fieldnames = ['metabolite'] + [i+"_rxn_consume" for i in all_basename_files] + [i+"_rxn_produce" for i in all_basename_files]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        for cpd_id, dict_basename_data in dict_cpds.items():
            dict_row = {'metabolite': cpd_id}
            for basename_file, cpd_data in dict_basename_data.items():
                dict_row.update({basename_file+"_rxn_consume": cpd_data["rxn_consume"], basename_file+"_rxn_produce": cpd_data["rxn_produce"]})
            writer.writerow(dict_row)
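

# Programmatic usage (illustrative sketch; the file paths and the reference
# database below are hypothetical, not shipped with padmet):
#
#     from padmet.classes import PadmetRef
#     from padmet.utils.exploration.compare_padmet import compare_padmet
#
#     padmet_ref = PadmetRef("/path/metacyc.padmet")
#     compare_padmet("/path/padmet1.padmet,/path/padmet2.padmet", "compare_output",
#                    padmetRef=padmet_ref, verbose=True, number_cpu="2")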