common_functions.py

import math
import operator

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
import matplotlib.patches as mpatches
from matplotlib.colors import LinearSegmentedColormap
import datetime
from ctypes import cdll
import ctypes

# from gmeans import GMeans

sns.set()


def get_clusters(data,
                 sample_data=None,
                 mean_methylation=None,
                 n_clusters=11,
                 recursive=False,
                 n_init=100,
                 max_iter=1000):
    clusters, cluster_named_dict = get_ordered_clusters(
        data,
        mean_methylation=mean_methylation,
        n_clusters=n_clusters,
        sample_data=sample_data,
        n_init=n_init,
        max_iter=max_iter)

    clusters_row = pd.Series(clusters, name='cluster', index=data.columns)
    if recursive:
        
        # data = data.append(clusters_row)
        grouped = data.groupby(by=clusters_row, axis=1)
        subcluster_series = pd.Series(name='subcluster')
        
        for name, group in grouped:
            # gm = GMeans(min_obs=2, max_depth=500, strictness=3)
            # group = group.drop('cluster')
            # gm.fit(group.T)
            subcluster_linkage = linkage(group.T, optimal_ordering=True, method='average', metric="correlation")
            linkage_order = dendrogram(
                subcluster_linkage, 
                distance_sort='ascending', 
                no_plot=True, 
                labels=group.columns)['ivl']
            linkage_subclusters = {}
            for i, x in enumerate(linkage_order):
                linkage_subclusters[x] = i
            
            # n_subclusters = len(np.unique(gm.labels_))
            # subclusters, subcluster_named_dict = get_ordered_clusters(
            #     group,
            #     mean_methylation=mean_methylation,
            #     n_clusters=n_subclusters,
            #     clusters=gm.labels_,
            #     n_init=n_init,
            #     max_iter=max_iter)
            subcluster_series = subcluster_series.append(
                pd.Series(linkage_subclusters, index=group.columns))

        return clusters_row, subcluster_series

    return clusters_row

# def get_correlation_clusters(data,
#                  sample_data=None,
#                  mean_methylation=None,
#                  n_clusters=11,
#                  recursive=False,
#                  n_init=100,
#                  max_iter=1000):
    
    
#     clusters = KMeans(n_clusters=n_clusters,
#                         n_init=n_init,
#                         max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.corr())

#     clusters_row = pd.Series(clusters, name='cluster', index=data.columns)

#     return clusters_row


def get_ordered_clusters(data,
                         n_clusters=None,
                         mean_methylation=None,
                         sample_data=None,
                         n_init=100,
                         max_iter=1000,
                         clusters=None,
                         return_kmeans_estimator=False):

    #get weights by frequency of cell_line in the dataset
    if clusters is None:
        if sample_data is None:
            kmeans_estimator = KMeans(n_clusters=n_clusters,
                            n_init=n_init,
                            max_iter=max_iter,n_jobs=-1,init='random').fit(data.T)
            clusters = kmeans_estimator.predict(data.T) 
                        # KMeans(n_clusters=n_clusters,
                        #     n_init=n_init,
                        #     max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.T)
        else:
            sample_data_subset = sample_data.loc[data.columns.tolist(), :]
            cell_lines = sample_data_subset['cell_line']
            counts = cell_lines.value_counts()
            cell_line_weights = 1 / cell_lines.map(counts)

            kmeans_estimator = KMeans(n_clusters=n_clusters,
                            n_init=n_init,
                            max_iter=max_iter,n_jobs=-1,init='random').fit(
                                data.T, sample_weight=cell_line_weights)

            clusters = kmeans_estimator.predict(data.T)

    cluster_names_dict = {}

    if mean_methylation is None:
        mean_methylation = pd.DataFrame(data.mean())
        mean_methylation = MinMaxScaler(
            feature_range=(0, 5)).fit_transform(mean_methylation)

    for cluster in list(set(clusters)):
        cluster_names_dict[cluster] = np.mean(
            mean_methylation[clusters == cluster])

    sorted_x = sorted(cluster_names_dict.items(), key=lambda kv: kv[1])
    sorted_x.reverse()

    cluster_map_dict = {}

    cluster_names_dict_fixed = {}
    clusters_fixed = [0 for x in clusters]

    for i, sorted_tup in enumerate(sorted_x):
        for j, clust in enumerate(clusters):
            if clust == sorted_tup[0]:
                clusters_fixed[j] = i

        if i == 0:
            cluster_names_dict_fixed[i] = 'Non-Eroded'
        elif i == len(sorted_x) - 1:
            cluster_names_dict_fixed[i] = 'Eroded'
        else:
            cluster_names_dict_fixed[i] = sorted_tup[1]
        cluster_map_dict[sorted_tup[0]] = i

    clusters = np.array(clusters_fixed)
    # cluster_names_dict = cluster_names_dict_fixed

    if return_kmeans_estimator:
        return clusters, cluster_names_dict_fixed, kmeans_estimator, cluster_map_dict
    
    return clusters, cluster_names_dict_fixed


def biplot(dat: pd.DataFrame,
           clf,
           clusters,
           cluster_names_dict,
           mean_methylation=None,
           title='LDA of dataset'):

    lda_embedding = clf.transform(dat.values)

    fig = plt.figure()

    for cluster in list(set(clusters)):
        cluster_x = lda_embedding[clusters == cluster, 0]
        if lda_embedding.shape[1] == 1:
            cluster_y = [1] * len(cluster_x)
        else:
            cluster_y = lda_embedding[clusters == cluster, 1]
        plt.scatter(cluster_x,
                    cluster_y,
                    color=sns.color_palette(n_colors=11)[cluster],
                    label=cluster_names_dict[cluster])

    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(title)
    explained = clf.explained_variance_ratio_
    plt.xlabel('LDA 1 ({:.1%})'.format(explained[0]))
    if lda_embedding.shape[1] == 1:
        plt.ylabel('No LDA 2')
    else:
        plt.ylabel('LDA 2 ({:.1%})'.format(explained[1]))
    return fig


def detect_outlier(data_1, threshold=1.645, mean=None, only_positive=False):
    mean_1 = np.mean(data_1)
    if mean is not None:
        mean_1 = mean
    std_1 = np.std(data_1)
    z_scores = (data_1 - mean_1) / std_1
    outliers = np.abs(z_scores) >= threshold
    if only_positive:
        outliers = z_scores >= threshold

    return outliers


def filter_data(data, clf, num_sites_wanted=70):

    scalings_0 = clf.scalings_[:, 0]
    scalings_1 = clf.scalings_[:, 1]
    outliers_0 = detect_outlier(scalings_0, threshold=1.645)  #top 10%
    outliers_1 = detect_outlier(scalings_1, threshold=3)  #top 1%

    combined_indicies = np.logical_or(outliers_0, outliers_1)
    # combined_indicies = outliers_0
    filtered_data = data.loc[combined_indicies, ]
    return filtered_data

def filter_data_coef(data, clf):
    coefs = clf.coef_
    all_outliers = np.abs(coefs[0]) >= 50  #detect_outlier(coefs[0], threshold=2, only_positive=False) #top 5%
    for c in coefs[1:]:
        outliers = np.abs(c) >= 50 #detect_outlier(c, threshold=2, only_positive=False) #top 5%
        if np.sum(c[outliers]>0) != np.sum(outliers):
            print('error error')
        else:
            print('all good')
        all_outliers = np.logical_or(all_outliers, outliers)
    
    filtered_data = data.loc[all_outliers, ]
    return filtered_data

def run_feature_selection(data, mean_methylation, clusters,
                          cluster_names_dict, use_coef=False):

    # column_linkage = linkage(data.T, optimal_ordering=True, method='average')

    clf = LinearDiscriminantAnalysis(n_components=2)
    clf.fit(data.T, clusters)

    biplot(dat=data.T,
           clf=clf,
           clusters=clusters,
           cluster_names_dict=cluster_names_dict,
           mean_methylation=mean_methylation,
           title='LDA of more variant sites')

    print('lda explained ratios: ', clf.explained_variance_ratio_)

    #get the significant sites
    if use_coef:
        filtered_data = filter_data_coef(data, clf)
    else:
        filtered_data = filter_data(data, clf)
    
    return filtered_data, clf


def make_cluster_map(data,
                     w=12,
                     h=36,
                     y_font_size=4,
                     x_font_size=8,
                     col_cluster=True,
                     row_cluster=True,
                     row_linkage=None,
                     col_linkage=None,
                     row_colors=None,
                     col_colors=None,
                     method="centroid",
                     metric="euclidean",
                     xticklabels="auto",
                     yticklabels="auto"):
    sns.set()
    sns.set_style({'font.sans-serif': 'Roboto'})
    #the colors stefan wanted
    cpal = sns.diverging_palette(180, 300, s=99, l=65, n=9)

    cmap = ListedColormap(cpal.as_hex())

    hm = sns.clustermap(
        data,
        cmap=cmap,
        method=method,
        metric=metric,
        # method="ward",
        # metric='cityblock',
        vmin=0, vmax=1,
        figsize=(w, h),
        col_cluster=col_cluster,
        row_cluster=row_cluster,
        row_linkage=row_linkage,
        col_linkage=col_linkage,
        row_colors=row_colors,
        col_colors=col_colors,
        xticklabels=xticklabels, 
        yticklabels=yticklabels )
    hm.ax_heatmap.tick_params(axis='y', labelsize=y_font_size)
    hm.ax_heatmap.tick_params(axis='x', labelsize=x_font_size)
    return hm


def get_special_row1_colors():
    paired_palette = sns.color_palette("Paired",n_colors=10)
    row_color1 = [
        paired_palette[0],
        paired_palette[2],
        paired_palette[4],
        paired_palette[6],
        paired_palette[8],
        paired_palette[1],
        paired_palette[3],
        paired_palette[5],
        paired_palette[7],
        paired_palette[9],
        (0,0,0),
    ]

    return row_color1

def make_kmeans_clustermap(df_to_plot,
                           sample_data,
                           mean_methylation=None,
                           n_clusters_samples=6,
                           col_clusters=None,
                           site_cluster_series=None,
                           site_ordering=None,
                           n_clusters_sites=11,
                           genes_to_mark=['XIST','LOC286467','PLS3'],
                           plot_title="",
                           w=12,
                           h=36,
                           y_font_size=4,
                           x_font_size=8,
                           use_special_row_colors=False,
                           hide_x_labels=False,
                           hide_y_labels=False,
                           gene_idx=3):
    if mean_methylation is None:
        mean_methylation = pd.DataFrame(df_to_plot.mean())
        mean_methylation = MinMaxScaler(
            feature_range=(0, 5)).fit_transform(mean_methylation)

    mean_methylation_row = pd.Series(df_to_plot.mean(),
                                     name='mean_methylation')
    # sample_linkage = linkage(df_to_plot.T, optimal_ordering=True, method='average', metric='correlation')
    # linkage_order = dendrogram(
    #     sample_linkage, 
    #     distance_sort='descending', 
    #     no_plot=True, 
    #     labels=df_to_plot.columns)['ivl']
    # sample_ordering = {}
    # for i, x in enumerate(linkage_order):
    #     sample_ordering[x] = i

    # mean_methylation_row = pd.Series(sample_ordering,
    #                                  name='mean_methylation')

    if col_clusters is None:
        clusters, cluster_names_dict = get_clusters(df_to_plot,
                                                    mean_methylation,
                                                    n_clusters_samples,
                                                    sample_data=sample_data)
    else:
        clusters = col_clusters
    clusters_row = pd.Series(clusters,
                             name='clusters',
                             index=df_to_plot.columns)

    df_copy = df_to_plot
    df_copy = df_copy.append(clusters_row).append(mean_methylation_row)
    df_copy = df_copy.sort_values(by=['clusters', 'mean_methylation'],
                                  ascending=[True, False],
                                  axis=1)
    df_copy = df_copy.drop('clusters').drop('mean_methylation')

    # if site_ordering is None:
    site_linkage = linkage(df_copy, optimal_ordering=True, method='average')
    linkage_order = dendrogram(
        site_linkage, 
        distance_sort='ascending', 
        no_plot=True, 
        labels=df_copy.index)['ivl']
    site_ordering = {}
    linkage_order.reverse()
    for i, x in enumerate(linkage_order):
        site_ordering[x] = i
    
    clusters_col = []
    if site_cluster_series is None:
        clusters_col = get_clusters(df_copy.T.corr(), n_clusters=n_clusters_sites, mean_methylation=df_copy.T.mean()) 
    elif site_cluster_series is not False:
        clusters_col = site_cluster_series

    row_colors = []
    if site_cluster_series is not False:
        df_copy['cluster'] = pd.Series(clusters_col, name='cluster')
        df_copy['cluster2'] = pd.Series(site_ordering, name='cluster2')
        df_copy = df_copy.sort_values(by=['cluster','cluster2'],ascending=[True, True], axis=0)
        df_copy = df_copy.drop(['cluster', 'cluster2'], axis=1)
        if use_special_row_colors:
            row1_pal = get_special_row1_colors()
        else:
            row1_pal = sns.color_palette("Paired",n_colors=n_clusters_sites)

        row_color1 = [ row1_pal[int(x)] for x in clusters_col.loc[df_copy.index].tolist()]
        row_colors = [row_color1]
    # site_clusters = fcluster(site_linkage, t=n_clusters_sites, criterion='maxclust')
    # site_cluster_series = pd.Series(site_clusters, index=df_to_plot.index)
    
    
    if genes_to_mark != [] and genes_to_mark is not None:
        row_color2, gene_color_legend = get_row_colors(df_copy, genes_to_mark, gene_idx=gene_idx)
        row_colors.append(row_color2)

    xticklabels="auto"
    yticklabels="auto"
    if hide_x_labels:
        xticklabels=False
    
    if hide_y_labels:
        yticklabels=False
    
    row_cluster=False
    if not isinstance(site_cluster_series, pd.Series) and site_cluster_series == False:
        row_cluster = True
    
    hm = make_cluster_map(
        df_copy,
        col_colors=[
            sns.color_palette("Set2",n_colors=len(clusters_row.unique()))[x]
            for x in clusters_row.sort_values().tolist()
        ],
        col_cluster=False,
        row_colors= row_colors,
        row_cluster=row_cluster,
        w=w,
        h=h,
        y_font_size=y_font_size,
        x_font_size=x_font_size,
        xticklabels=xticklabels,
        yticklabels=yticklabels)
    
    if genes_to_mark != [] and genes_to_mark is not None:
        l2 = hm.ax_col_dendrogram.legend(loc='upper left',
            # bbox_to_anchor=(1.01,0.85),
            handles=gene_color_legend,
            frameon=True)
        l2.set_title(title='Genes',prop={'size':10})
    hm.fig.suptitle(plot_title)

    return hm, df_copy, clusters_col

def get_row_colors(ordered_df, genes_to_mark, gene_idx=3):
    row_color = []
    gene_colors = sns.color_palette('Set1', n_colors=len(genes_to_mark))
    
    gene_color_legend = [mpatches.Patch(color=gene_colors[i], label=g) for i,g in enumerate(genes_to_mark)]

    gene_counts = [0] * len(genes_to_mark)

    for x in ordered_df.index:
        genes = str(x[gene_idx])
        found = False
        for i,g in enumerate(genes_to_mark):
            if g in genes:
                row_color.append(gene_colors[i])
                gene_counts[i] += 1
                found = True
                break
        if not found:
            row_color.append('white')

    for i,g in enumerate(genes_to_mark):
        print(g, ' - ', gene_counts[i])
    
    return row_color, gene_color_legend

def round_up_to_multiple(number, multiple):
    num = number + (multiple - 1)
    return num - (num % multiple)

'''
Get the HiC stuff from juicer

cd /mnt/c/Users/Prakhar\ Bansal/Downloads
java -jar juicer_tools_1.11.04_jcuda.0.8.jar dump observed VC GSM3576787_iPSCORE_2_1.iPSC.hic X X BP 5000 GSM3576787_iPSCORE_2_1_chrX_chrX_5kb.txt
'''

def get_hi_c_matrix(
    hi_c_df: pd.DataFrame, 
    sites_index: list,
    distance_cache_name=None, 
    use_cached=False,
    resolution=1000):

    if distance_cache_name is None and not use_cached:
        distance_cache_name = "hic_cache/hi_c_cache_{}.csv".format(datetime.datetime.now())
    elif distance_cache_name is None and use_cached:
        raise ValueError('If you want to use a cached table, please specify distance_cache_name')

    if not use_cached:
        #access go function
        lib = cdll.LoadLibrary("./libtomutil.so")
        class GoString(ctypes.Structure):
            _fields_ = [("p", ctypes.c_char_p), ("n", ctypes.c_longlong)]
        lib.FormatHiCMatrix.argtypes = [GoString, GoString, GoString, GoString]
        hi_c_filename = "hi_c.csv"
        hi_c_df.to_csv(hi_c_filename, header=False)                
        hiCMatrixFilename = GoString(hi_c_filename.encode('UTF-8'), len(hi_c_filename))

        #write the site names in a understandable format
        formatted_data = []
        for site in sites_index:
            pos = site[2]
            formatted_data.append([site, pos])
        formatted_data = pd.DataFrame(formatted_data, columns = None, index=None)
        formatted_data_filename = "data_temp.csv"
        formatted_data.to_csv(formatted_data_filename, header=None, index=False)
        dataFilename = GoString(formatted_data_filename.encode('UTF-8'), len(formatted_data_filename))

        outFilename = GoString(distance_cache_name.encode('UTF-8'), len(distance_cache_name))

        resolution_str = GoString(str(resolution).encode('UTF-8'), len(str(resolution)))
        lib.FormatHiCMatrix(hiCMatrixFilename, dataFilename, outFilename, resolution_str)

        return pd.read_csv(distance_cache_name, index_col=0)

    else:
        return pd.read_csv(distance_cache_name, index_col=0)


def make_correlation_clustermap(corr_df, cluster_series, index_order,use_special_row_colors=False):
    ordered_df = corr_df.loc[index_order, index_order]
    ordered_clusters = cluster_series.loc[index_order]

    mask = np.zeros_like(ordered_df)
    mask[np.triu_indices_from(mask)] = True

    color_pal = sns.color_palette("Paired",n_colors=ordered_clusters.max()+1)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()

    colors = [color_pal[x] for x in ordered_clusters]

    # print(colors)

    hm = sns.clustermap(
        ordered_df,
        vmin=-1, vmax=1,center=0,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        cmap=sns.diverging_palette(20, 220, n=200))

    return hm

def make_hic_clustermap(hic_df, cluster_series, index_order, use_special_row_colors=False, use_linear_distances=False):
    ordered_df = hic_df.loc[index_order, index_order]
    ordered_clusters = cluster_series.loc[index_order]

    mask = np.zeros_like(ordered_df)
    mask[np.triu_indices_from(mask)] = True

    color_pal = sns.color_palette("Paired",n_colors=ordered_clusters.max()+1)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()

    colors = [color_pal[x] for x in ordered_clusters]

    # print(colors)

    cdict = {'red':   ((0.0,  1.0, 1.0),
                   (1.0,  1.0, 1.0)),

         'green': ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0)),

         'blue':  ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0))}

    custom_cm = LinearSegmentedColormap(
            "white2red", cdict)
    vmin=0
    center=10
    vmax=20
    
    if use_linear_distances:
        
        vmin=0
        center=5000
        vmax=10000
        cdict = {'blue':   ((0.0,  1.0, 1.0),
                   (1.0,  1.0, 1.0)),

         'red': ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0)),

         'green':  ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0))}
        custom_cm = LinearSegmentedColormap(
            "white2red", cdict)
        custom_cm = custom_cm.reversed()

    hm = sns.clustermap(
        ordered_df,
        # vmin=vmin, vmax=vmax, center=center,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        # cmap=custom_cm,
        figsize=(20,20))

    return hm

def make_hic_heatmap(hic_df):
    # print(colors)
    cdict = {'red':   ((0.0,  1.0, 1.0),
                   (1.0,  1.0, 1.0)),

         'green': ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0)),

         'blue':  ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0))}

    custom_cm = LinearSegmentedColormap(
            "white2red", cdict)
    vmin=0
    center=10
    vmax=20

    hm = sns.clustermap(
        hic_df,
        # vmin=vmin, vmax=vmax, center=center,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        # cmap=custom_cm,
        figsize=(20,20))

    return hm

def make_mean_dist_clustermap(mean_dist_cluster_df, use_special_row_colors=True, use_linear_distances=False):
    # ordered_df = hic_df.loc[index_order, index_order]
    # ordered_clusters = cluster_series.loc[index_order]

    # mask = np.zeros_like(ordered_df)
    # mask[np.triu_indices_from(mask)] = True

    color_pal = sns.color_palette("Paired",n_colors=11)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()

    colors = [color_pal[x] for x in mean_dist_cluster_df.index.tolist()]

    # print(colors)

    cdict = {'red':   ((0.0,  1.0, 1.0),
                   (1.0,  1.0, 1.0)),

         'green': ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0)),

         'blue':  ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0))}
    
    if use_linear_distances:
        cdict = {'blue':   ((0.0,  1.0, 1.0),
                   (1.0,  1.0, 1.0)),

         'red': ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0)),

         'green':  ((0.0,  1.0, 1.0),
                   (1.0,  0.0, 0.0))}

    custom_cm = LinearSegmentedColormap(
            "white2red", cdict)
    
    if use_linear_distances:
        custom_cm = custom_cm.reversed()

    hm = sns.clustermap(
        mean_dist_cluster_df,
        # vmin=vmin, vmax=vmax, center=center,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        # cmap=custom_cm,
        figsize=(20,20))

    return hm

def calculate_ppv(order,
                  sample_metadata: pd.DataFrame,
                  sample_col="SampleName",
                  passage_col="Passage",
                  cell_line_col="CellLine",
                  reverse=False):

    ordered_metadata = sample_metadata.sort_values(passage_col)

    grouped_metadata = ordered_metadata.groupby(cell_line_col)
    correct_pairs = 0.0
    total_pairs = 0.0
    for cellline, group in grouped_metadata:
        num_rows = len(group)
        if num_rows > 1:
            i = 1
            for row, data in group.iterrows():
                for row2, data2 in group[i:].iterrows():

                    total_pairs += 1
                    if (not reverse) and order[data[sample_col]] <= order[
                            data2[sample_col]]:
                        correct_pairs += 1
                    elif reverse and order[data[sample_col]] >= order[
                            data2[sample_col]]:
                        correct_pairs += 1
                    else:
                        print('incorrect pair: ', data[sample_col],
                              order[data[sample_col]], data2[sample_col],
                              order[data2[sample_col]])
                i += 1
    if total_pairs == 0:
        return "total pairs = 0"
    print(total_pairs)
    return correct_pairs / total_pairs