Source code for padmet.utils.exploration.visu_similarity_gsmn

# -*- coding: utf-8 -*-
"""
Description:
    Visualize similarity between metabolic networks using MDS.

::

    usage:
        padmet visu_similarity_gsmn --reaction=FILE --output=FILE [--group=FILE]

    options:
        -h --help     Show help.
        --reaction=FILE    pathname to the reaction file output of compare_padmet or compare_sbml.
        --output=FILE    pathname to the picture output file containing the MDS projection
        --group=FILE    pathname to the group file containing a column named "species" with the organism ID and a column "group" classifying species in group (you can also use a "color" column to associate group to specific color)
"""
import docopt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pa

try:
    from sklearn.manifold import MDS
except ImportError:
    raise ImportError('Requires sklearn, try:\npip install scikit-learn')



[docs]
def command_help():
    """
    Run docopt on this module's usage docstring and print what it returns.
    """
    parsed_usage = docopt.docopt(__doc__)
    print(parsed_usage)




[docs]
def visu_similarity_gsmn_cli(command_args):
    """
    Command-line entry point: parse the arguments and draw the MDS figure.

    Parameters
    ----------
    command_args: list
        command-line arguments handed to docopt as ``argv`` and matched
        against the usage string in the module docstring.
    """
    parsed = docopt.docopt(__doc__, argv=command_args)

    # Forward the three options positionally, in the order expected by
    # visu_similarity_gsmn: reaction file, output file, group file.
    visu_similarity_gsmn(
        parsed["--reaction"],
        parsed["--output"],
        parsed["--group"],
    )




[docs]
def visu_similarity_gsmn(reaction_file, output_file, group_file=None):
    """
    Project metabolic networks in 2D with MDS and save the scatter plot.

    The reaction table is read as a tab-separated file indexed by its
    'reaction' column. Columns whose name contains "(" or "_formula" are
    discarded; each remaining column is treated as one metabolic network.
    The table is transposed (one row per network) and embedded in two
    dimensions with sklearn's MDS. Every network is drawn as a labelled point.

    When a group file is given, points are coloured by group and a legend is
    added. Networks without a group in that file are left out of the
    projection.

    Parameters
    ----------
    reaction_file: str
        path to reaction file from compare_padmet/compare_sbml.
    output_file: str
        path to picture output file.
    group_file: str
        path to a tab-separated group file with a 'species' column and a
        'group' column; an optional 'color' column gives the colour of each
        group.

    Raises
    ------
    ValueError
        if a group file is given but no network of the reaction file is
        assigned to a group.
    """
    # Convert the reaction table into a matrix (one row per network).
    df = pa.read_csv(reaction_file, sep='\t')
    df = df.set_index('reaction')
    network_columns = [column for column in df.columns
                       if "(" not in column and "_formula" not in column]
    df = df[network_columns].transpose()

    df_join = None
    group_names = []
    group_colors = []
    if group_file:
        # Retrieve the group from the group file.
        df_family = pa.read_csv(group_file, sep='\t')
        df_family = df_family.set_index('species')
        df_join = df.join(df_family)

        # Select only species with a group. The join is a left join, so
        # networks missing from the group file carry NaN in 'group'.
        df_join = df_join[df_join['group'].notna()]
        if df_join.empty:
            raise ValueError(
                'No species of {0} is assigned to a group in {1}.'.format(
                    reaction_file, group_file))

        group_names = df_join['group'].unique().tolist()

        # Only the reaction columns feed the projection. The group labels are
        # matched on the 'group' column alone so that a group name can never
        # overwrite values of the reaction matrix.
        X = df_join[df.columns.tolist()]

        if "color" in df_family.columns:
            # Extract color from group file.
            color_map = dict(zip(df_family.group, df_family.color))
            group_colors = [color_map[name] for name in group_names]
        else:
            # Create a color for each group in the figure. 'hsv' is cyclic
            # (both ends are red), so one extra entry is sampled and left
            # unused to keep the first and last groups distinguishable.
            cmap = plt.get_cmap('hsv', len(group_names) + 1)
            group_colors = [cmap(index) for index in range(len(group_names))]
    else:
        X = df

    # Projection with MDS.
    embedding = MDS(n_components=2)
    coordinates = embedding.fit_transform(X)

    plt.rcParams['figure.figsize'] = [20, 20]
    plt.rc('font', size=14)

    # Draw on a dedicated figure so that repeated calls do not pile up points
    # on pyplot's current figure, and close it once saved.
    fig, ax = plt.subplots()
    try:
        if df_join is not None:
            # Use group as provided by the user.
            for name, group_color in zip(group_names, group_colors):
                in_group = (df_join['group'] == name).to_numpy()
                points = coordinates[in_group]
                ax.scatter(points[:, 0], points[:, 1], color=group_color,
                           label=name, s=15**2)
                for species, (x, y) in zip(df_join.index[in_group], points):
                    ax.annotate(species, (x, y))
            ax.legend()
        else:
            ax.scatter(coordinates[:, 0], coordinates[:, 1], c="black",
                       s=15**2)
            for genome, (x, y) in zip(df.index, coordinates):
                ax.annotate(genome, (x, y))

        fig.savefig(output_file)
    finally:
        plt.close(fig)