Source code for padmet.utils.exploration.dendrogram_reactions_distance

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Description:
    Use reactions.tsv file from compare_padmet.py to create a dendrogram using a Jaccard distance.
    
    From the matrix absence/presence of reactions in different species computes a Jaccard distance between these species.
    Apply a hierarchical clustering on these data with a complete linkage. Then create a dendrogram.
    Apply also intervene to create an upset graph on the data.

::

    usage:
        padmet dendrogram_reactions_distance --reactions=FILE --output=FOLDER [--padmetRef=STR] [--pvclust] [--upset=INT] [-v]

    option:
        -h --help    Show help.
        --reactions=FILE    pathname of the file containing reactions in each species of the comparison.
        --output=FOLDER    path to the output folder.
        --pvclust    launch pvclust dendrogram using R
        --padmetRef=STR    path to the padmet Ref file
        -u --upset=INT    number of cluster in the upset graph.
        -v    verbose mode.
"""

import csv
import docopt
import pandas as pa
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import subprocess

# Global plotting configuration, applied once when the module is imported.
# The figure size, line width and font scale below are deliberately very large.
# NOTE(review): sns.set() also accepts a style argument; confirm that calling it
# after set_style("white") does not override the style chosen on the line above.
sns.set_style("white")
sns.set('poster', rc={'figure.figsize':(150,140), 'lines.linewidth': 10}, font_scale=4)

from collections import defaultdict
from lxml import etree
from padmet.classes import PadmetRef
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage, to_tree
from scipy.spatial.distance import pdist, squareform
from supervenn import supervenn



[docs]
def command_help():
    """
    Parse the module usage string with docopt and print the parsed result.
    """
    parsed_arguments = docopt.docopt(__doc__)
    print(parsed_arguments)




[docs]
def dendrogram_reactions_distance_cli(command_args):
    """
    Command-line entry point: parse the arguments and launch the figure creation.

    Parameters
    ----------
    command_args: list
        command-line arguments, parsed by docopt against the module usage string
    """
    args = docopt.docopt(__doc__, argv=command_args)
    reaction_pathname = args['--reactions']
    # --upset is optional: None means no cluster number was requested.
    upset_cluster = int(args['--upset']) if args['--upset'] else None
    output_pathname = args['--output']
    padmet_ref_file = args['--padmetRef']
    pvclust = args['--pvclust']
    # The -v flag is declared in the usage string, so forward it instead of dropping it.
    verbose = args['-v']

    reaction_figure_creation(reaction_pathname, output_pathname, upset_cluster, padmet_ref_file, pvclust, verbose)




[docs]
def pvclust_dendrogram(reactions_dataframe, organisms, output_folder):
    """
    Run the R package pvclust (through rpy2) on the reactions matrix.
    Write a picture of the resulting dendrogram and a newick file in the output folder.

    Parameters
    ----------
    reactions_dataframe: pandas.DataFrame
        Reactions absence/presence matrix
    organisms: list
        organisms names (not used by this function)
    output_folder: str
        path to the output folder

    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr

    r_pvclust = importr("pvclust")
    r_grdevices = importr('grDevices')
    r_ape = importr('ape')

    # The conversion layer must be active before the pandas dataframe is handed to R.
    pandas2ri.activate()

    # Bootstrap clustering, run quietly and in parallel.
    bootstrap_result = r_pvclust.pvclust(
        reactions_dataframe,
        method_dist="binary",
        method_hclust="complete",
        nboot=10000,
        quiet=True,
        parallel=True,
    )

    # Picture of the dendrogram: open the png device, plot, close the device.
    picture_path = output_folder+"/"+"pvclust_reaction_dendrogram.png"
    r_grdevices.png(file=picture_path, width=2048, height=2048, pointsize=24)
    r_pvclust.plot_pvclust(bootstrap_result)
    r_grdevices.dev_off()

    # Newick export: take the hclust element of the result and convert it with ape.
    newick_path = output_folder+"/"+"dendrogram.nwk"
    as_phylo = r_ape.as_phylo(bootstrap_result.rx2("hclust"))
    r_ape.write_tree(as_phylo, file=newick_path, tree_names=True, digits=2)




[docs]
def create_pvclust_dendrogram(reaction_file, output_folder):
    """
    Read a reactions file and create the pvclust dendrogram from it.

    Parameters
    ----------
    reaction_file: str
        path to the tab-separated reactions file (must contain a 'reaction' column)
    output_folder: str
        path to the output folder, created (with its parents) if missing
    """
    os.makedirs(output_folder, exist_ok=True)

    # Read the reactions file with pandas.
    all_reactions_dataframe = pa.read_csv(reaction_file, sep='\t')
    # Keep column containing absence-presence of reactions.
    # (columns with (sep=;) are column with gene name linked to reactions)
    # (columns with _formula contain the reaction formula)
    columns = [column for column in all_reactions_dataframe.columns
               if '(sep=;)' not in column and '_formula' not in column]
    reactions_dataframe = all_reactions_dataframe[columns].copy()

    reactions_dataframe.set_index('reaction', inplace=True)

    # Once 'reaction' is the index, the remaining columns are the organisms
    # (the index holds reaction identifiers, not organism names).
    organisms = reactions_dataframe.columns.tolist()

    # Create pvclust dendrogram.
    pvclust_dendrogram(reactions_dataframe, organisms, output_folder)




[docs]
def hclust_to_xml(linkage_matrix):
    """
    Convert a scipy linkage matrix into an xml tree mirroring the hierarchical clustering.
    Each node of the clustering becomes an element named 'cluster_<id>', the id being
    zero-padded to the width of the largest id.

    Parameters
    ----------
    linkage_matrix: ndarray
        linkage matrix

    Returns
    -------
    root:
        root of the xml tree
    """
    _, nodes = to_tree(linkage_matrix, rd=True)
    id_width = len(str(max(node.id for node in nodes)))

    # For every child id seen so far, the xml element under which it must be attached.
    xml_parent_of = {}

    # Walk from the last node (the cluster holding everything) down to the first one,
    # so that a parent element always exists before its children are created.
    for position, node in enumerate(reversed(nodes)):
        tag = 'cluster_' + str(node.id).zfill(id_width)
        if position:
            element = etree.SubElement(xml_parent_of[node.id], tag)
        else:
            element = root = etree.Element(tag)
        for child in (node.get_left(), node.get_right()):
            if child:
                xml_parent_of[child.id] = element

    return root




[docs]
def create_intersection_files(root, cluster_leaf_species, reactions_dataframe, output_folder_tree_cluster, metacyc_to_ecs):
    """
    Create intersection files.

    For each cluster of the xml tree, select the reactions present in all the species of
    the cluster and not already shared by all the species of one of its ancestors, then
    write them in '<cluster>.tsv' inside a folder hierarchy mirroring the tree.
    Also write 'reaction_cluster.tsv' with, for each reaction, the cluster(s) it was assigned to.

    The given reactions_dataframe is modified in place: a 'cluster' column is added
    (plus an 'EC' column when metacyc_to_ecs is not empty) and True/False values are
    replaced by 1/0.

    Parameters
    ----------
    root: root
        root of the xml tree (elements must provide iterancestors())
    cluster_leaf_species: dictionary
        for each leaf give the organisms in it
    reactions_dataframe: pandas.DataFrame
        dataframe containing absence/presence of reactions in organism
    output_folder_tree_cluster: str
        path to the output folder (used as a prefix, expected to end with a separator)
    metacyc_to_ecs: dictionary
        mapping of metayc reaction to EC number

    Returns
    -------
    reactions_clust: dictionary
        reactions in each cluster of the tree
    """
    def ec_numbers(reaction_ids):
        # EC numbers joined by ',' for known reactions, NaN otherwise.
        return [','.join(metacyc_to_ecs[reaction]) if reaction in metacyc_to_ecs else np.nan
                for reaction in reaction_ids]

    # Reactions shared by all the species of a cluster, computed once per cluster
    # (each cluster is otherwise filtered again for every one of its descendants).
    shared_reactions = {}

    def reactions_shared_by(cluster_tag):
        if cluster_tag not in shared_reactions:
            in_all_species = reactions_dataframe[cluster_leaf_species[cluster_tag]].all(axis=1)
            shared_reactions[cluster_tag] = reactions_dataframe[in_all_species].index.tolist()
        return shared_reactions[cluster_tag]

    # Extract reactions using XML Tree.
    cluster_folder_name = {}
    reactions_clust = {}
    for element in root.iter():
        folder_name = element.tag
        ancestors = [ancestor.tag for ancestor in element.iterancestors()]
        ancestor_reactions = set()
        for ancestor_tag in ancestors:
            ancestor_reactions.update(reactions_shared_by(ancestor_tag))

        # Keep the dataframe order (and drop duplicates) so the output is reproducible.
        only_intersect_element = list(dict.fromkeys(
            reaction for reaction in reactions_shared_by(folder_name)
            if reaction not in ancestor_reactions))
        for reaction in only_intersect_element:
            cluster_folder_name.setdefault(reaction, []).append(folder_name)

        # Work on a copy: a column may be added below and must not touch the source frame.
        tmp_reactions_dataframe = reactions_dataframe.loc[only_intersect_element].copy()

        # iterancestors() goes from the parent up to the root, the folder path is root first.
        folder_path = '/'.join(reversed(ancestors))
        cluster_folder = output_folder_tree_cluster + folder_path + '/' + folder_name
        os.makedirs(cluster_folder, exist_ok=True)
        if metacyc_to_ecs:
            tmp_reactions_dataframe['EC'] = ec_numbers(tmp_reactions_dataframe.index)
        tmp_reactions_dataframe.to_csv(cluster_folder + '/' + folder_name + '.tsv', sep='\t')
        reactions_clust[folder_name] = tmp_reactions_dataframe.index.tolist()

    reactions_dataframe['cluster'] = [','.join(cluster_folder_name[reaction]) if reaction in cluster_folder_name else 'no_cluster'
                                      for reaction in reactions_dataframe.index]
    if metacyc_to_ecs:
        reactions_dataframe['EC'] = ec_numbers(reactions_dataframe.index)

    # Replace True/False by 1/0. The result is assigned back to the column: an in-place
    # replace on a selected column is not guaranteed to reach the dataframe itself.
    for column in reactions_dataframe.columns.tolist():
        column_values = reactions_dataframe[column]
        if pa.api.types.is_bool_dtype(column_values):
            reactions_dataframe[column] = column_values.astype(int)
        else:
            reactions_dataframe[column] = column_values.replace({True: 1, False: 0})
    reactions_dataframe.to_csv(output_folder_tree_cluster + 'reaction_cluster.tsv', sep='\t')

    return reactions_clust




[docs]
def create_cluster(reactions_dataframe, absence_presence_matrix, linkage_matrix):
    """
    Cut the dendrogram to create clusters.

    The linkage matrix is cut with the 'maxclust' criterion for every cluster number
    from 0 up to the number of columns of the reactions dataframe, both included,
    so that asking for as many clusters as there are species is possible.
    The entry 0 is kept for backward compatibility.

    Parameters
    ----------
    reactions_dataframe: pandas.DataFrame
        dataframe containing absence/presence of reactions in organism
    absence_presence_matrix: pandas.DataFrame
        transposition of the reactions dataframe (not used by this function)
    linkage_matrix: ndarray
        linkage matrix

    Returns
    -------
    dendrogram_fclusters: dictionary
        {number used to split the linkage matrix: ndarray with the corresponding clusters}
    """
    species_number = len(reactions_dataframe.columns)
    # Extract Dendrogram information using fcluster.
    # The upper bound is included: range(species_number) alone would leave no entry
    # for a request of exactly species_number clusters.
    dendrogram_fclusters = {
        cluster_number: fcluster(linkage_matrix, cluster_number, criterion='maxclust')
        for cluster_number in range(species_number + 1)
    }

    return dendrogram_fclusters




[docs]
def create_supervenn(absence_presence_matrix, reactions_dataframe, output_folder_upset, dendrogram_fclusters, k, verbose=False):
    """
    Draw a supervenn figure of the reactions found in each of the k clusters and
    save it as 'supervenn.png' in the output folder.
    Nothing is drawn (a message is printed) when k is lower than 2.

    Parameters
    ----------
    absence_presence_matrix: pandas.DataFrame
        transposition of the reactions dataframe
    reactions_dataframe: pandas.DataFrame
        dataframe containing absence/presence of reactions in organism
    output_folder_upset: str
        path to output folder
    dendrogram_fclusters: dictionary
        {number used to split the linkage matrix: ndarray with the corresponding clusters}
    k: int
        number of cluster to create
    verbose: bool
        not used by this function
    """
    if k < 2:
        print('supervenn needs at least 2 clusters to work.')
        return

    # Group the species names by the cluster they were assigned to.
    assignments = dendrogram_fclusters[k]
    species_names = absence_presence_matrix.index.tolist()
    members_by_cluster = defaultdict(list)
    for name, cluster_id in dict(zip(species_names, assignments)).items():
        members_by_cluster[cluster_id].append(name)

    # One set per cluster: every reaction present in at least one of its species.
    supervenn_sets = []
    supervenn_labels = []
    for cluster_id, members in members_by_cluster.items():
        cluster_reactions = set()
        for member in members:
            present = reactions_dataframe[reactions_dataframe[member] == True]
            cluster_reactions.update(present.index.tolist())
        supervenn_sets.append(cluster_reactions)
        supervenn_labels.append(cluster_id)

    supervenn(supervenn_sets, supervenn_labels, sets_ordering='minimize gaps')
    plt.savefig(output_folder_upset + '/supervenn.png', bbox_inches='tight')
    plt.clf()

    return




[docs]
def create_intervene_graph(absence_presence_matrix, reactions_dataframe, temp_data_folder, path_to_intervene, output_folder_upset, dendrogram_fclusters, k, verbose=False):
    """
    Create an upset graph. Deprecated function: supervenn is now used instead, see the create_supervenn function.

    Write one list of reactions per cluster in the temporary folder, write the
    'cluster_name.tsv' file in the output folder, then call intervene on these lists.
    Nothing is done (a message is printed) when k is lower than 2.

    Parameters
    ----------
    absence_presence_matrix: pandas.DataFrame
        transposition of the reactions dataframe
    reactions_dataframe: pandas.DataFrame
        dataframe containing absence/presence of reactions in organism
    temp_data_folder: str
        temporary data folder
    path_to_intervene: str
        path to intervene bin
    output_folder_upset: str
        path to output folder
    dendrogram_fclusters: dictionary
        {number used to split the linkage matrix: ndarray with the corresponding clusters}
    k: int
        number of cluster to create
    verbose: bool
        if False, the output of intervene is discarded

    """
    if k < 2:
        print('intervene needs at least 2 clusters to work.')
        return
    # Extract species in each cluster.
    results = dendrogram_fclusters[k]

    species = absence_presence_matrix.index.tolist()

    cluster_species = dict(zip(species, results))
    cluster_classes = defaultdict(list)

    for key, value in cluster_species.items():
        cluster_classes[value].append(key)

    # Extract reactions in each cluster.
    cluster_reactions = {}
    for cluster, members in cluster_classes.items():
        reactions_temp = []
        for member in members:
            species_reactions_dataframe = reactions_dataframe[reactions_dataframe[member] == True]
            reactions_temp.extend(species_reactions_dataframe.index.tolist())
        cluster_reactions[cluster] = set(reactions_temp)

    # Create data for creating upset graph using intervene.
    folder_names = {}
    for n, cluster in enumerate(cluster_classes):
        cluster_name = 'upset_cluster_' + str(n)
        df = pa.DataFrame({cluster_name: list(cluster_reactions[cluster])})
        df.to_csv(temp_data_folder+'/'+cluster_name+'.tsv', sep='\t', index=False, header=False)
        folder_names[cluster_name] = '_'.join(cluster_classes[cluster])
    df_cluster_name = pa.DataFrame.from_dict(folder_names, orient='index')
    df_cluster_name.reset_index(inplace=True)
    df_cluster_name.columns = ['species', 'cluster']
    df_cluster_name.to_csv(output_folder_upset+'/cluster_name.tsv', sep='\t', index=False)

    # The command goes through the shell because it relies on the '*.tsv' glob.
    # NOTE(review): the paths are inserted unquoted; do not feed untrusted paths to this function.
    cmd = '{0} upset -i  {1}/*.tsv --type list -o {2} --figtype svg'.format(path_to_intervene, temp_data_folder, output_folder_upset)
    if verbose:
        subprocess.call(cmd, shell=True)
    else:
        # subprocess.DEVNULL discards the output without leaving a file handle open.
        subprocess.call(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)

    return




[docs]
def add_dendrogram_node_label(reaction_dendrogram, node_list, reactions_clust, len_longest_cluster_id):
    """
    Using cluster nodes, add label and reactions number on each node of the dendrogram.
    The labels are drawn with matplotlib.pyplot on the current figure.
    This function comes from this answer on stackoverflow: https://stackoverflow.com/a/43519473 

    Parameters
    ----------
    reaction_dendrogram: dictionary
        dendrogram data; the keys 'icoord', 'dcoord' and 'leaves' are read
    node_list: list
        cluster nodes, indexed by node id
    reactions_clust: dictionary
        reactions in each cluster of the tree, keyed by 'cluster_<zero-padded id>'
    len_longest_cluster_id: int
        width used to zero-pad the node id when building the cluster name

    Returns
    -------
    None if every link of the dendrogram has all its y coordinates equal to 0
    (no label is added), True once the labels have been added.
    """
    # Add intersection information to dendrogram.
    # Extract coords from dendrogram.
    # Get leave coordinates, which are at y == 0
    Xcoords = [item for sublist in reaction_dendrogram['icoord'] for item in sublist]
    Ycoords = [item for sublist in reaction_dendrogram['dcoord'] for item in sublist]
    leave_coords = [(x,y) for x,y in zip(Xcoords,Ycoords) if y==0]

    # Map leave ID and coords.
    # In the dendogram data structure,
    # leave ids are listed in ascending order according to their x-coordinate
    order = np.argsort([x for x,y in leave_coords])
    id_to_coord = dict(zip(reaction_dendrogram['leaves'], [leave_coords[idx] for idx in order]))

    # Map endpoint of each link to coordinates of parent node.
    # From two childs, compute the parent coords.
    # dendrogram['icoord'] contains 4 x coordinates for U-shape segment.
    # dendrogram['dcoord'] contains 4 y coordinates for U-shape segment.
    # Each 4 couples of x[i]y[i] corresponds to a point in the U-shape segment:
    # y axis
    # |      parent
    # | x1y1___|___x2y2
    # |    |       |
    # |    |       |
    # |    |       |
    # | x0y0       x3y3
    # ------------------ x axis
    # Using x1 and x2 we can compute the center of the segment, which corresponds to x coord of the parent node.
    # Using y1 or y2, we have the y coordinates of the parent node.
    # The dictionary is keyed by the exact (left, right) coordinate pair of the two children.
    children_to_parent_coords = dict()
    for xcoords, ycoords in zip(reaction_dendrogram['icoord'], reaction_dendrogram['dcoord']):
        xparent_coord = (xcoords[1] + xcoords[2]) / 2
        yparent_coord = ycoords[1]
        parent_coord = (xparent_coord, yparent_coord)
        left_coord = (xcoords[0], ycoords[0])
        right_coord = (xcoords[3], ycoords[3])
        children_to_parent_coords[(left_coord, right_coord)] = parent_coord

    # Give up when every child and every parent has a y coordinate of 0:
    # nodes can then not be told apart by their coordinates.
    if all((coord[1]==0 for coords in list(children_to_parent_coords.keys()) for coord in coords)) and all((coords[1]==0 for coords in list(children_to_parent_coords.values()))):
        return None

    # Create a range from the latest leaves to the higher node.
    ids_left = range(len(reaction_dendrogram['leaves']), len(node_list))

    # Iterate on all the nodes.
    # Using children (leaves), retrieve the coords of parent (nodes).
    # Until all nodes have coords.
    # NOTE(review): the lookup below needs the (left, right) pair to be an exact key of
    # children_to_parent_coords (KeyError otherwise), and the loop only ends once every
    # node id has coordinates -- confirm both always hold for the dendrograms drawn here.
    while len(ids_left) > 0:
        for node_id in ids_left:
            node = node_list[node_id]
            if (node.left.id in id_to_coord) and (node.right.id in id_to_coord):
                left_coord = id_to_coord[node.left.id]
                right_coord = id_to_coord[node.right.id]
                id_to_coord[node_id] = children_to_parent_coords[(left_coord, right_coord)]

        # Recompute the ids that still have no coordinates.
        ids_left = [node_id for node_id in range(len(node_list)) if not node_id in id_to_coord]

    # For each node, add the corresponding cluster name and the number of reactions.
    # Leaves are skipped: only internal nodes get a red dot and a 'id (count)' label.
    for node_id, (x, y) in id_to_coord.items():
        if not node_list[node_id].is_leaf():
            plt.plot(x, y, 'ro')
            node_label = str(node_id) + ' (' + str(len(reactions_clust['cluster_'+str(node_id).zfill(len_longest_cluster_id)])) + ')'
            plt.annotate(node_label, (x, y), xytext=(0, -8),
                        textcoords='offset points',
                        va='top', ha='center')

    return True



[docs]
def comparison_cluster(reactions_clust, output_folder_comparison):
    """
    Compare all cluster one against another.

    For each ordered pair of clusters, write in the file '<cluster_1>_vs_<cluster_2>'
    the set of reactions of cluster_1 that are not in cluster_2.

    Parameters
    ----------
    reactions_clust: dictionary
        reactions in each cluster of the tree
    output_folder_comparison: str
        path to output folder (used as a prefix, expected to end with a separator)
    """
    import itertools

    # Build each set once instead of once per pair.
    reaction_sets = {cluster: set(reactions) for cluster, reactions in reactions_clust.items()}
    for cluster_1, cluster_2 in itertools.permutations(reaction_sets, 2):
        # The context manager closes the file even if the write fails.
        with open(output_folder_comparison + cluster_1 + '_vs_' + cluster_2, 'w') as comparison_file:
            comparison_file.write(str(reaction_sets[cluster_1] - reaction_sets[cluster_2]))




[docs]
def getNewick(node, newick, parentdist, leaf_names):
    """
    Build recursively the newick string of the tree below a node.
    This function comes from this answer on stackoverflow: https://stackoverflow.com/a/31878514.

    Parameters
    ----------
    node: scipy.cluster.hierarchy.ClusterNode
        node to convert (the root ClusterNode of the scipy tree for the first call)
    newick: str
        newick string built so far, appended after this node (empty for the first call)
    parentdist: float
        distance of the parent of the node (the distance of the root itself for the first call)
    leaf_names: list
        list of organism names, indexed by leaf id

    Returns
    -------
    str
        newick string of the node followed by the given newick string
    """
    branch_length = parentdist - node.dist

    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], branch_length, newick)

    # Closing part of this node: ');' for the root, otherwise the branch length
    # followed by what was already built.
    closing = "):%.2f%s" % (branch_length, newick) if len(newick) > 0 else ");"
    left_part = getNewick(node.get_left(), closing, node.dist, leaf_names)
    both_parts = getNewick(node.get_right(), ",%s" % (left_part), node.dist, leaf_names)
    return "(%s" % (both_parts)




[docs]
def absent_and_specific_reactions(reactions_dataframe, output_folder_tree_cluster, output_folder_specific, output_folder_absent, organisms):
    """
    Extract, for each organism, the reactions found only in it and the reactions
    missing only in it.

    For each organism, write '<organism>.tsv' in the specific folder (reactions present in
    this organism and in no other one) and in the absent folder (reactions absent from this
    organism and present in all the other ones). Also write the summary file
    'absent_specific_reactions.tsv' in the tree cluster folder.

    Parameters
    ----------
    reactions_dataframe: pandas.DataFrame
        dataframe containing absence/presence of reactions in organism
    output_folder_tree_cluster: str
        path to output tree cluster folder (used as a prefix, expected to end with a separator)
    output_folder_specific: str
        path to output folder with specific reactions for each species
    output_folder_absent: str
        path to output folder with absent reactions for each species
    organisms: list
        organisms names
    """
    specific_file = output_folder_tree_cluster + 'absent_specific_reactions.tsv'
    # Reactions in dataframe order, used to write the files in a reproducible order.
    ordered_reactions = list(dict.fromkeys(reactions_dataframe.index))

    # newline='' is the mode expected by the csv module; the context manager closes
    # the file even if an organism fails.
    with open(specific_file, 'w', newline='') as specific_output:
        specific_writer = csv.writer(specific_output, delimiter='\t')
        specific_writer.writerow(['Organism', 'NB reactions', 'Unique reactions', 'Absent reactions'])
        for species in sorted(organisms):
            other_species = [organism for organism in organisms if organism != species]
            others = reactions_dataframe[other_species]

            # Reactions present in this species and in none of the other ones.
            reactions_in_species = set(reactions_dataframe[reactions_dataframe[species] == True].index)
            absent_in_others = ~others.any(axis=1).astype(bool)
            reactions_absent_in_others = set(reactions_dataframe[absent_in_others].index)
            only_in_species = reactions_in_species & reactions_absent_in_others
            reactions_only_in_species = [reaction for reaction in ordered_reactions if reaction in only_in_species]
            reactions_dataframe.loc[reactions_only_in_species].to_csv(output_folder_specific+species+'.tsv', sep='\t')

            # Reactions absent from this species and present in all the other ones.
            reactions_not_in_species = set(reactions_dataframe[reactions_dataframe[species] == False].index)
            in_all_others = others.all(axis=1).astype(bool)
            reactions_in_others = set(reactions_dataframe[in_all_others].index)
            only_not_in_species = reactions_not_in_species & reactions_in_others
            reactions_only_not_in_species = [reaction for reaction in ordered_reactions if reaction in only_not_in_species]
            reactions_dataframe.loc[reactions_only_not_in_species].to_csv(output_folder_absent+species+'.tsv', sep='\t')

            specific_writer.writerow([species, len(reactions_in_species),
                                      len(only_in_species),
                                      len(only_not_in_species)])




[docs]
def reaction_figure_creation(reaction_file, output_folder, upset_cluster=None, padmetRef_file=None, pvclust=None, verbose=False):
    """
    Create dendrogram, supervenn figure (if upset_cluster is given) and compare reactions in species.

    Steps: read the reactions file, compute the Jaccard distance between species, apply
    a hierarchical clustering, write the newick tree, the specific/absent reactions of each
    species, the species and the reactions of each cluster, the comparison of the clusters,
    the dendrogram picture and, if asked, the pvclust dendrogram and the supervenn figure.

    Parameters
    ----------
    reaction_file: str
        path to reaction file
    output_folder: str
        path to output folder, created (with its parents) if missing
    upset_cluster: int
        the number of cluster you want in the supervenn figure (no figure if None or 0)
    padmetRef_file: str
        path to padmet ref file, used to add the EC numbers of the reactions
    pvclust: bool
        boolean to launch or not R pvclust dendrogram
    verbose: bool
        verbose mode, forwarded to the supervenn creation

    Raises
    ------
    FileNotFoundError
        if reaction_file does not exist
    """
    # Check the input before creating anything in the output folder.
    if not os.path.exists(reaction_file):
        raise FileNotFoundError("No reactions.tsv file accessible at " + reaction_file)

    # Create the output folders that do not exist yet.
    output_folder_tree_cluster = output_folder + '/tree_cluster/'
    output_folder_comparison = output_folder + '/tree_cluster/comparison_cluster/'
    output_folder_specific = output_folder_tree_cluster + 'specific_reactions/'
    output_folder_absent = output_folder_tree_cluster + 'absent_reactions/'
    folders = [output_folder, output_folder_tree_cluster, output_folder_comparison, output_folder_specific, output_folder_absent]
    if upset_cluster:
        output_folder_upset = output_folder + '/upset_graph'
        temp_data_folder = output_folder + '/upset_graph/temp_data/'
        folders.extend([output_folder_upset, temp_data_folder])

    for folder in folders:
        os.makedirs(folder, exist_ok=True)

    # Read the reactions file with pandas.
    all_reactions_dataframe = pa.read_csv(reaction_file, sep='\t')
    # Keep column containing absence-presence of reactions.
    # (columns with (sep=;) are column with gene name linked to reactions)
    # (columns with _formula contain the reaction formula)
    columns = [column for column in all_reactions_dataframe.columns
               if '(sep=;)' not in column and '_formula' not in column]
    reactions_dataframe = all_reactions_dataframe[columns].copy()

    reactions_dataframe.set_index('reaction', inplace=True)

    # Transpose the matrix to have species as index and reactions as columns.
    absence_presence_matrix = reactions_dataframe.transpose()

    # Compute a distance matrix using the Jaccard distance between species and condense it.
    condensed_distance_matrix_jaccard = pdist(absence_presence_matrix, metric='jaccard')

    # Hierarchical clustering on the condensed distance matrix.
    linkage_matrix = linkage(condensed_distance_matrix_jaccard, method='average', metric='jaccard')

    # Draw a dendrogram of the clustering (on the current matplotlib figure, saved below).
    reaction_dendrogram = dendrogram(linkage_matrix, labels=absence_presence_matrix.index, leaf_font_size=100, leaf_rotation=90)

    # Extract organisms.
    organisms = absence_presence_matrix.index.tolist()

    # Root of the clustering and all the nodes inside it.
    tree, node_list = to_tree(linkage_matrix, rd=True)

    # Create Newick tree
    newick_tree = getNewick(tree, "", tree.dist, organisms)
    newick_path = os.path.join(output_folder,'newick.txt')
    with open(newick_path, 'w') as f:
        f.write(newick_tree)

    # Specific reactions for each species.
    absent_and_specific_reactions(reactions_dataframe, output_folder_tree_cluster, output_folder_specific, output_folder_absent, organisms)

    if pvclust:
        pvclust_reactions_dataframe = all_reactions_dataframe[columns].copy()

        pvclust_reactions_dataframe.set_index('reaction', inplace=True)
        # Create pvclust dendrogram.
        pvclust_dendrogram(pvclust_reactions_dataframe, organisms, output_folder)

    if padmetRef_file:
        padmet_ref = PadmetRef(padmetRef_file)
        metacyc_to_ecs = {node.id: node.misc['EC-NUMBER'] for node in padmet_ref.dicOfNode.values() if node.type == "reaction" and 'EC-NUMBER' in node.misc}
    else:
        metacyc_to_ecs = {}

    # For each cluster, give the list of organisms in it.
    # Then write it in a file.
    len_longest_cluster_id = len(str(max([node.id for node in node_list])))
    cluster_leaf_species = {}
    for node in node_list:
        node_leafs = node.pre_order(lambda child: organisms[child.id] if child.is_leaf() else None)
        cluster_leaf_species['cluster_'+str(node.id).zfill(len_longest_cluster_id)] = node_leafs

    # One row per cluster, one boolean column per organism.
    species_clustered_df = pa.DataFrame.from_dict(
        {cluster_leaf: [organism in cluster_organisms for organism in organisms]
         for cluster_leaf, cluster_organisms in cluster_leaf_species.items()},
        orient='index', columns=organisms)
    species_clustered_df.to_csv(output_folder_tree_cluster + 'clustered_species.tsv', sep='\t')

    # Create xml structure from hierarchical clustering.
    root = hclust_to_xml(linkage_matrix)

    # Use xml structure to create intersection files.
    reactions_clust = create_intersection_files(root, cluster_leaf_species, reactions_dataframe, output_folder_tree_cluster, metacyc_to_ecs)

    comparison_cluster(reactions_clust, output_folder_comparison)

    # Add label contaning cluster name and reaction number to each node.
    check_label = add_dendrogram_node_label(reaction_dendrogram, node_list, reactions_clust, len_longest_cluster_id)

    if not check_label:
        print('Warning: no label for cluster name have been added.')

    # Create dendrogram, bbox option adjust the figure size.
    plt.savefig(output_folder+'/reaction_dendrogram.png',bbox_inches='tight')
    plt.clf()

    if upset_cluster:
        dendrogram_fclusters = create_cluster(reactions_dataframe, absence_presence_matrix, linkage_matrix)
        # The requested number of clusters is upset_cluster.
        create_supervenn(absence_presence_matrix, reactions_dataframe, output_folder_upset, dendrogram_fclusters, upset_cluster, verbose)