Source code for padmet.utils.exploration.padmet_stats

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Description:
    Create a padmet stats file containing the number of pathways, reactions,
    genes and compounds inside the padmet.

    The input is a padmet file or a folder containing multiple padmets.
    
    Create a tsv file named padmet_stats.tsv where the script have been
    launched.

::

    usage:
        padmet padmet_stats --padmet=FILE --output=FOLDER

    option:
        -h --help    Show help.
        -p --padmet=FILE    padmet file or folder containing padmet(s).
        -o --output=FOLDER    path to output folder.
"""

import csv
import docopt
import os

from padmet.classes import PadmetSpec


[docs] def command_help(): """ Show help for analysis command. """ print(docopt.docopt(__doc__))
[docs] def padmet_stats_cli(command_args): args = docopt.docopt(__doc__, argv=command_args) padmet_file_folder = args['--padmet'] output_folder = args['--output'] compute_stats(padmet_file_folder, output_folder)
[docs] def compute_stats(padmet_file_folder, output_folder): """ Count reactions/pathways/compounds/genes in padmet(s). Parameters ---------- padmet_file_folder: str path to the padmet file/folder to analyze output_folder: str path to the output folder """ if os.path.isdir(padmet_file_folder): padmet_type = "dir" elif os.path.isfile(padmet_file_folder): padmet_type = "file" else: raise TypeError("%s is not a dir or a file or is not accessible." %(padmet_file_folder)) if not os.path.exists(output_folder): os.mkdir(output_folder) output_file = open(output_folder + '/padmet_stats.tsv', 'w') output_writer = csv.writer(output_file, delimiter='\t') output_writer.writerow(['padmet_file', 'pathways_with_reactions', 'reactions without class compounds', 'reactions with class compounds', 'reactions_with_gene_association', 'genes', 'compounds', 'class compounds']) orthologs_species = [] orthologs_stats = {} if padmet_type == "dir": padmet_names = [padmet_file.replace('.padmet', '').upper() for padmet_file in os.listdir(padmet_file_folder)] for padmet_file in os.listdir(padmet_file_folder): padmet_path = padmet_file_folder + '/' + padmet_file stats = padmet_stat(padmet_path) output_writer.writerow(stats) ortholog_species_counts = orthology_result(padmet_path, padmet_names) orthologs_stats[padmet_file.replace('.padmet', '').upper()] = ortholog_species_counts for species in ortholog_species_counts: if species not in orthologs_species: orthologs_species.append(species) if padmet_type == "file": stats = padmet_stat(padmet_file_folder) output_writer.writerow(stats) padmet_name = padmet_file_folder.replace('.padmet', '').upper() padmet_names = [padmet_name] ortholog_species_counts = orthology_result(padmet_file_folder, padmet_names) orthologs_stats[padmet_file_folder.replace('.padmet', '').upper()] = ortholog_species_counts for species in ortholog_species_counts: if species not in orthologs_species: orthologs_species.append(species) orthologs_stats_file = open(output_folder + '/padmet_orthologs_stats.tsv', 'w') orthologs_stats_writer = csv.writer(orthologs_stats_file, delimiter='\t') orthologs_stats_writer.writerow(['', *orthologs_species]) for input_species in orthologs_stats: orthologs_stats_writer.writerow([input_species, *[orthologs_stats[input_species][species] for species in orthologs_species]]) output_file.close() orthologs_stats_file.close()
[docs] def padmet_stat(padmet_file): """ Count reactions/pathways/compounds/genes in a padmet file. Parameters ---------- padmet_file: str path to a padmet file Returns ------- list: [path to padmet, number of pathways, number of reactions, number of genes, number of compounds, number of class compounds] """ padmetSpec = PadmetSpec(padmet_file) padmet_name = os.path.basename(padmet_file).replace('.padmet', '') total_pwy_id = set() total_cpd_id = set() all_rxns = [node for node in padmetSpec.dicOfNode.values() if node.type == "reaction"] all_genes = [node for node in padmetSpec.dicOfNode.values() if node.type == "gene"] nb_rxn_with_ga = 0 for rxn_node in all_rxns: total_cpd_id.update([rlt.id_out for rlt in padmetSpec.dicOfRelationIn[rxn_node.id] if rlt.type in ["consumes","produces"]]) # Get all pathways having at least a reaction. Remove superpathways containing only pathways. pathways_ids = set([rlt.id_out for rlt in padmetSpec.dicOfRelationIn[rxn_node.id] if rlt.type == "is_in_pathway"]) if any([rlt for rlt in padmetSpec.dicOfRelationIn[rxn_node.id] if rlt.type == "is_linked_to"]): nb_rxn_with_ga += 1 total_pwy_id.update(pathways_ids) all_pwys = [node for (node_id, node) in padmetSpec.dicOfNode.items() if node_id in total_pwy_id] class_cpds = [node.id for (node_id, node) in padmetSpec.dicOfNode.items() if node_id in total_cpd_id if node.type == "class"] compound_cpds = [node.id for (node_id, node) in padmetSpec.dicOfNode.items() if node_id in total_cpd_id if node.type == "compound"] rxn_with_class_cpds = [] rxn_without_class_cpds = [] for rxn_node in all_rxns: rxn_compounds = [rlt.id_out for rlt in padmetSpec.dicOfRelationIn[rxn_node.id] if rlt.type in ["consumes","produces"]] if len(set(rxn_compounds).intersection(set(class_cpds))) > 0: rxn_with_class_cpds.append(rxn_node) else: rxn_without_class_cpds.append(rxn_node) return [padmet_name, len(all_pwys), len(rxn_without_class_cpds), len(rxn_with_class_cpds), nb_rxn_with_ga, len(all_genes), len(compound_cpds), len(class_cpds)]
[docs] def orthology_result(padmet_file, padmet_names): """ Count reactions/pathways/compounds/genes in a padmet file. Parameters ---------- padmet_file: str path to a padmet file padmet_names: list all the padmet filenames Returns ------- dictionary: Number of reactions given by the other species """ ortholog_species_counts = {} padmetSpec = PadmetSpec(padmet_file) ortholog_reactions = set() for node in padmetSpec.dicOfNode.values(): if node.type == 'suppData': reaction_id = node.id.split('_SuppData_OUTPUT_ORTHOFINDER_')[0] ortholog_reactions.add(reaction_id) ortholog_species = node.id.split('FROM_')[1] if ortholog_species in ortholog_species_counts: ortholog_species_counts[ortholog_species] += 1 else: ortholog_species_counts[ortholog_species] = 1 for species in padmet_names: if species not in ortholog_species_counts: ortholog_species_counts[species] = 0 ortholog_species_counts['Orthology'] = len(ortholog_reactions) return ortholog_species_counts