# contiguous_x_erosion/common_functions.py
import math
import operator
import datetime
import ctypes
from ctypes import cdll

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import matplotlib.patches as mpatches
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram, cut_tree
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
# from gmeans import GMeans

sns.set()


def get_clusters(data,
                 sample_data=None,
                 mean_methylation=None,
                 n_clusters=11,
                 recursive=False,
                 n_init=100,
                 max_iter=1000):
    """Cluster the samples (columns of `data`) with k-means and return a
    labeled Series.

    If `recursive` is True, samples within each cluster are additionally
    ordered by average-linkage hierarchical clustering on correlation
    distance, and a second Series with that within-cluster ordering is
    returned as well.
    """
    clusters, cluster_named_dict = get_ordered_clusters(
        data,
        mean_methylation=mean_methylation,
        n_clusters=n_clusters,
        sample_data=sample_data,
        n_init=n_init,
        max_iter=max_iter)
    clusters_row = pd.Series(clusters, name='cluster', index=data.columns)
    if recursive:
        # Group the sample columns by their cluster label.
        grouped = data.groupby(by=clusters_row, axis=1)
        subcluster_parts = []
        for name, group in grouped:
            # (An earlier version subclustered each group with GMeans; this was
            # replaced by a hierarchical ordering of the samples in the group.)
            subcluster_linkage = linkage(group.T, optimal_ordering=True,
                                         method='average', metric='correlation')
            linkage_order = dendrogram(
                subcluster_linkage,
                distance_sort='ascending',
                no_plot=True,
                labels=group.columns)['ivl']
            linkage_subclusters = {x: i for i, x in enumerate(linkage_order)}
            subcluster_parts.append(
                pd.Series(linkage_subclusters, index=group.columns))
        # Series.append was removed in pandas 2.0; concatenate instead.
        subcluster_series = pd.concat(subcluster_parts)
        subcluster_series.name = 'subcluster'
        return clusters_row, subcluster_series
    return clusters_row
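

# Example usage (a minimal sketch; `beta_values` and `samples` are hypothetical
# inputs: a sites-x-samples DataFrame of methylation beta values and a sample
# metadata table with a 'cell_line' column):
#
#     clusters_row = get_clusters(beta_values, sample_data=samples, n_clusters=6)
#     clusters_row, subclusters = get_clusters(beta_values, sample_data=samples,
#                                              n_clusters=6, recursive=True)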


# def get_correlation_clusters(data, sample_data=None, mean_methylation=None,
#                              n_clusters=11, recursive=False, n_init=100,
#                              max_iter=1000):
#     clusters = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter,
#                       init='random').fit_predict(data.corr())
#     clusters_row = pd.Series(clusters, name='cluster', index=data.columns)
#     return clusters_row


def get_ordered_clusters(data,
                         n_clusters=None,
                         mean_methylation=None,
                         sample_data=None,
                         n_init=100,
                         max_iter=1000,
                         clusters=None,
                         return_kmeans_estimator=False):
    """K-means cluster the columns of `data`, then renumber the clusters
    0..k-1 in decreasing order of mean methylation. Cluster 0 is named
    'Non-Eroded' and cluster k-1 'Eroded'; intermediate clusters are named by
    their scaled mean methylation.
    """
    kmeans_estimator = None
    if clusters is None:
        if sample_data is None:
            # (The n_jobs argument was removed from KMeans in scikit-learn 0.25.)
            kmeans_estimator = KMeans(n_clusters=n_clusters, n_init=n_init,
                                      max_iter=max_iter, init='random').fit(data.T)
            clusters = kmeans_estimator.predict(data.T)
        else:
            # Weight each sample by the inverse frequency of its cell line so
            # heavily sampled cell lines do not dominate the clustering.
            sample_data_subset = sample_data.loc[data.columns.tolist(), :]
            cell_lines = sample_data_subset['cell_line']
            counts = cell_lines.value_counts()
            cell_line_weights = 1 / cell_lines.map(counts)
            kmeans_estimator = KMeans(n_clusters=n_clusters, n_init=n_init,
                                      max_iter=max_iter, init='random').fit(
                                          data.T, sample_weight=cell_line_weights)
            clusters = kmeans_estimator.predict(data.T)
    cluster_names_dict = {}
    if mean_methylation is None:
        mean_methylation = pd.DataFrame(data.mean())
        mean_methylation = MinMaxScaler(
            feature_range=(0, 5)).fit_transform(mean_methylation)
    for cluster in set(clusters):
        cluster_names_dict[cluster] = np.mean(
            mean_methylation[clusters == cluster])
    # Sort clusters from most to least methylated and renumber accordingly.
    sorted_x = sorted(cluster_names_dict.items(), key=lambda kv: kv[1],
                      reverse=True)
    cluster_map_dict = {}
    cluster_names_dict_fixed = {}
    clusters_fixed = [0 for _ in clusters]
    for i, (old_label, mean_value) in enumerate(sorted_x):
        for j, clust in enumerate(clusters):
            if clust == old_label:
                clusters_fixed[j] = i
        if i == 0:
            cluster_names_dict_fixed[i] = 'Non-Eroded'
        elif i == len(sorted_x) - 1:
            cluster_names_dict_fixed[i] = 'Eroded'
        else:
            cluster_names_dict_fixed[i] = mean_value
        cluster_map_dict[old_label] = i
    clusters = np.array(clusters_fixed)
    if return_kmeans_estimator:
        return clusters, cluster_names_dict_fixed, kmeans_estimator, cluster_map_dict
    return clusters, cluster_names_dict_fixed
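

# Example usage (sketch with the hypothetical `beta_values` from above; clusters
# come back renumbered 0..k-1 from most to least methylated, with 0 named
# 'Non-Eroded' and k-1 named 'Eroded'):
#
#     clusters, names = get_ordered_clusters(beta_values, n_clusters=11)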


def biplot(dat: pd.DataFrame,
           clf,
           clusters,
           cluster_names_dict,
           mean_methylation=None,
           title='LDA of dataset'):
    """Scatter the LDA embedding of `dat`, one color per cluster.

    `mean_methylation` is accepted for interface compatibility but unused.
    """
    lda_embedding = clf.transform(dat.values)
    fig = plt.figure()
    for cluster in set(clusters):
        cluster_x = lda_embedding[clusters == cluster, 0]
        if lda_embedding.shape[1] == 1:
            # Only one discriminant axis: plot everything on a dummy y = 1 line.
            cluster_y = [1] * len(cluster_x)
        else:
            cluster_y = lda_embedding[clusters == cluster, 1]
        plt.scatter(cluster_x,
                    cluster_y,
                    color=sns.color_palette(n_colors=11)[cluster],
                    label=cluster_names_dict[cluster])
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(title)
    explained = clf.explained_variance_ratio_
    plt.xlabel('LDA 1 ({:.1%})'.format(explained[0]))
    if lda_embedding.shape[1] == 1:
        plt.ylabel('No LDA 2')
    else:
        plt.ylabel('LDA 2 ({:.1%})'.format(explained[1]))
    return fig
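

# Example usage (sketch; assumes a fitted LDA `clf` as produced by
# run_feature_selection below, and cluster ids below 11 since the palette
# size is fixed):
#
#     fig = biplot(beta_values.T, clf, clusters, names)
#     fig.savefig('lda_biplot.png')  # hypothetical output path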


def detect_outlier(data_1, threshold=1.645, mean=None, only_positive=False):
    """Flag values whose z-score magnitude meets `threshold`.

    With the default threshold of 1.645 this keeps roughly the top 10% by
    |z| under normality (the top 5% one-sided when only_positive=True).
    """
    mean_1 = np.mean(data_1) if mean is None else mean
    std_1 = np.std(data_1)
    z_scores = (data_1 - mean_1) / std_1
    if only_positive:
        return z_scores >= threshold
    return np.abs(z_scores) >= threshold
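

# Example (values verified by hand; the last value sits about 1.7 standard
# deviations above the mean, so it is the only flag at the default threshold):
#
#     detect_outlier(np.array([0.1, 0.2, 0.15, 5.0]))
#     # -> array([False, False, False,  True])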


def filter_data(data, clf, num_sites_wanted=70):
    """Keep sites that are scaling outliers on either of the first two LDA axes.

    `num_sites_wanted` is currently unused; the thresholds below control the cut.
    """
    scalings_0 = clf.scalings_[:, 0]
    scalings_1 = clf.scalings_[:, 1]
    outliers_0 = detect_outlier(scalings_0, threshold=1.645)  # ~top 10% by |z|
    outliers_1 = detect_outlier(scalings_1, threshold=3)      # ~top 0.3% by |z|
    combined_indices = np.logical_or(outliers_0, outliers_1)
    filtered_data = data.loc[combined_indices, :]
    return filtered_data
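

# Example (sketch): keep sites whose axis-1 scaling is in roughly the top 10%
# or whose axis-2 scaling is in roughly the top 0.3% by |z-score|:
#
#     filtered = filter_data(beta_values, clf)  # `clf` from run_feature_selection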


def filter_data_coef(data, clf):
    """Keep sites with an LDA coefficient of magnitude >= 50 on any class axis."""
    coefs = clf.coef_
    all_outliers = np.abs(coefs[0]) >= 50
    for c in coefs[1:]:
        outliers = np.abs(c) >= 50
        # Sanity check: every flagged coefficient on this axis should be positive.
        if np.sum(c[outliers] > 0) != np.sum(outliers):
            print('warning: some flagged coefficients are negative')
        all_outliers = np.logical_or(all_outliers, outliers)
    filtered_data = data.loc[all_outliers, :]
    return filtered_data


def run_feature_selection(data, mean_methylation, clusters,
                          cluster_names_dict, use_coef=False):
    """Fit an LDA on the clustered samples, plot the embedding, and return the
    sites selected by either the scaling-based or coefficient-based filter."""
    clf = LinearDiscriminantAnalysis(n_components=2)
    clf.fit(data.T, clusters)
    biplot(dat=data.T,
           clf=clf,
           clusters=clusters,
           cluster_names_dict=cluster_names_dict,
           mean_methylation=mean_methylation,
           title='LDA of more variant sites')
    print('lda explained ratios: ', clf.explained_variance_ratio_)
    # Keep only the significant sites.
    if use_coef:
        filtered_data = filter_data_coef(data, clf)
    else:
        filtered_data = filter_data(data, clf)
    return filtered_data, clf
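

# Example pipeline (sketch; hypothetical `beta_values` as above):
#
#     clusters, names = get_ordered_clusters(beta_values, n_clusters=11)
#     filtered, clf = run_feature_selection(beta_values, None, clusters, names)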


def make_cluster_map(data,
                     w=12,
                     h=36,
                     y_font_size=4,
                     x_font_size=8,
                     col_cluster=True,
                     row_cluster=True,
                     row_linkage=None,
                     col_linkage=None,
                     row_colors=None,
                     col_colors=None,
                     method="centroid",
                     metric="euclidean",
                     xticklabels="auto",
                     yticklabels="auto"):
    """Thin wrapper around sns.clustermap with the project's palette and fonts."""
    sns.set()
    sns.set_style({'font.sans-serif': 'Roboto'})
    # Diverging teal-purple palette requested by Stefan.
    cpal = sns.diverging_palette(180, 300, s=99, l=65, n=9)
    cmap = ListedColormap(cpal.as_hex())
    hm = sns.clustermap(
        data,
        cmap=cmap,
        method=method,
        metric=metric,
        vmin=0, vmax=1,
        figsize=(w, h),
        col_cluster=col_cluster,
        row_cluster=row_cluster,
        row_linkage=row_linkage,
        col_linkage=col_linkage,
        row_colors=row_colors,
        col_colors=col_colors,
        xticklabels=xticklabels,
        yticklabels=yticklabels)
    hm.ax_heatmap.tick_params(axis='y', labelsize=y_font_size)
    hm.ax_heatmap.tick_params(axis='x', labelsize=x_font_size)
    return hm
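

# Example usage (sketch; assumes `beta_values` holds values in [0, 1], since
# vmin/vmax are fixed to that range):
#
#     hm = make_cluster_map(beta_values, w=10, h=20, method='average')
#     hm.savefig('clustermap.png')  # hypothetical output path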


def get_special_row1_colors():
    """Return the 'Paired' palette reordered as five light colors, five dark
    colors, then black, so adjacent cluster ids get visually distinct bars."""
    paired_palette = sns.color_palette("Paired", n_colors=10)
    row_color1 = [
        paired_palette[0],
        paired_palette[2],
        paired_palette[4],
        paired_palette[6],
        paired_palette[8],
        paired_palette[1],
        paired_palette[3],
        paired_palette[5],
        paired_palette[7],
        paired_palette[9],
        (0, 0, 0),
    ]
    return row_color1


def make_kmeans_clustermap(df_to_plot,
                           sample_data,
                           mean_methylation=None,
                           n_clusters_samples=6,
                           col_clusters=None,
                           site_cluster_series=None,
                           site_ordering=None,
                           n_clusters_sites=11,
                           genes_to_mark=['XIST', 'LOC286467', 'PLS3'],
                           plot_title="",
                           w=12,
                           h=36,
                           y_font_size=4,
                           x_font_size=8,
                           use_special_row_colors=False,
                           hide_x_labels=False,
                           hide_y_labels=False,
                           gene_idx=3):
    """Cluster samples with k-means, order sites hierarchically within site
    clusters, and draw the annotated clustermap.

    Returns (clustermap, reordered DataFrame, site cluster Series).
    """
    if mean_methylation is None:
        mean_methylation = pd.DataFrame(df_to_plot.mean())
        mean_methylation = MinMaxScaler(
            feature_range=(0, 5)).fit_transform(mean_methylation)
    mean_methylation_row = pd.Series(df_to_plot.mean(),
                                     name='mean_methylation')
    if col_clusters is None:
        clusters = get_clusters(df_to_plot,
                                sample_data=sample_data,
                                mean_methylation=mean_methylation,
                                n_clusters=n_clusters_samples)
    else:
        clusters = col_clusters
    clusters_row = pd.Series(clusters,
                             name='clusters',
                             index=df_to_plot.columns)
    # Order the sample columns by cluster, then by decreasing mean methylation.
    # (DataFrame.append was removed in pandas 2.0; use pd.concat.)
    df_copy = pd.concat(
        [df_to_plot, pd.DataFrame([clusters_row, mean_methylation_row])])
    df_copy = df_copy.sort_values(by=['clusters', 'mean_methylation'],
                                  ascending=[True, False],
                                  axis=1)
    df_copy = df_copy.drop(['clusters', 'mean_methylation'])
    if site_ordering is None:
        site_linkage = linkage(df_copy, optimal_ordering=True, method='average')
        linkage_order = dendrogram(
            site_linkage,
            distance_sort='ascending',
            no_plot=True,
            labels=df_copy.index)['ivl']
        linkage_order.reverse()
        site_ordering = {x: i for i, x in enumerate(linkage_order)}
    clusters_col = []
    if site_cluster_series is None:
        clusters_col = get_clusters(df_copy.T.corr(),
                                    n_clusters=n_clusters_sites,
                                    mean_methylation=df_copy.T.mean())
    elif site_cluster_series is not False:
        clusters_col = site_cluster_series
    row_colors = []
    if site_cluster_series is not False:
        # Order sites by cluster, breaking ties with the hierarchical ordering.
        df_copy['cluster'] = pd.Series(clusters_col, name='cluster')
        df_copy['cluster2'] = pd.Series(site_ordering, name='cluster2')
        df_copy = df_copy.sort_values(by=['cluster', 'cluster2'],
                                      ascending=[True, True], axis=0)
        df_copy = df_copy.drop(['cluster', 'cluster2'], axis=1)
        if use_special_row_colors:
            row1_pal = get_special_row1_colors()
        else:
            row1_pal = sns.color_palette("Paired", n_colors=n_clusters_sites)
        row_color1 = [row1_pal[int(x)]
                      for x in clusters_col.loc[df_copy.index].tolist()]
        row_colors = [row_color1]
    if genes_to_mark != [] and genes_to_mark is not None:
        row_color2, gene_color_legend = get_row_colors(df_copy, genes_to_mark,
                                                       gene_idx=gene_idx)
        row_colors.append(row_color2)
    xticklabels = False if hide_x_labels else "auto"
    yticklabels = False if hide_y_labels else "auto"
    # Fall back to hierarchical row clustering when site clusters are disabled.
    row_cluster = site_cluster_series is False
    hm = make_cluster_map(
        df_copy,
        col_colors=[
            sns.color_palette("Set2", n_colors=len(clusters_row.unique()))[x]
            for x in clusters_row.sort_values().tolist()
        ],
        col_cluster=False,
        row_colors=row_colors,
        row_cluster=row_cluster,
        w=w,
        h=h,
        y_font_size=y_font_size,
        x_font_size=x_font_size,
        xticklabels=xticklabels,
        yticklabels=yticklabels)
    if genes_to_mark != [] and genes_to_mark is not None:
        l2 = hm.ax_col_dendrogram.legend(loc='upper left',
                                         handles=gene_color_legend,
                                         frameon=True)
        l2.set_title(title='Genes', prop={'size': 10})
    hm.fig.suptitle(plot_title)
    return hm, df_copy, clusters_col
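

# Example usage (sketch; hypothetical `beta_values` and `samples` as above):
#
#     hm, ordered_df, site_clusters = make_kmeans_clustermap(
#         beta_values, samples, n_clusters_samples=6, n_clusters_sites=11,
#         plot_title='K-means ordered methylation clustermap')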


def get_row_colors(ordered_df, genes_to_mark, gene_idx=3):
    """Build a per-site color list highlighting sites annotated with any of
    `genes_to_mark` (field `gene_idx` of each index tuple holds the gene names).

    Returns the color list and matching legend patches.
    """
    row_color = []
    gene_colors = sns.color_palette('Set1', n_colors=len(genes_to_mark))
    gene_color_legend = [mpatches.Patch(color=gene_colors[i], label=g)
                         for i, g in enumerate(genes_to_mark)]
    gene_counts = [0] * len(genes_to_mark)
    for x in ordered_df.index:
        genes = str(x[gene_idx])
        found = False
        for i, g in enumerate(genes_to_mark):
            if g in genes:
                row_color.append(gene_colors[i])
                gene_counts[i] += 1
                found = True
                break
        if not found:
            row_color.append('white')
    for i, g in enumerate(genes_to_mark):
        print(g, ' - ', gene_counts[i])
    return row_color, gene_color_legend


def round_up_to_multiple(number, multiple):
    """Round `number` up to the nearest multiple of `multiple`."""
    num = number + (multiple - 1)
    return num - (num % multiple)
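

# Examples:
#     round_up_to_multiple(7, 5)   # -> 10
#     round_up_to_multiple(10, 5)  # -> 10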


'''
Generate the Hi-C input with Juicer tools, e.g.:
    cd /mnt/c/Users/Prakhar\ Bansal/Downloads
    java -jar juicer_tools_1.11.04_jcuda.0.8.jar dump observed VC GSM3576787_iPSCORE_2_1.iPSC.hic X X BP 5000 GSM3576787_iPSCORE_2_1_chrX_chrX_5kb.txt
'''


def get_hi_c_matrix(
        hi_c_df: pd.DataFrame,
        sites_index: list,
        distance_cache_name=None,
        use_cached=False,
        resolution=1000):
    """Build a site-by-site Hi-C contact matrix via the Go helper library
    libtomutil.so, caching the result as CSV.

    Each entry of `sites_index` is a tuple whose field 2 is the genomic position.
    """
    if distance_cache_name is None and not use_cached:
        distance_cache_name = "hic_cache/hi_c_cache_{}.csv".format(
            datetime.datetime.now())
    elif distance_cache_name is None and use_cached:
        raise ValueError(
            'If you want to use a cached table, please specify distance_cache_name')
    if not use_cached:
        # Call the Go function FormatHiCMatrix through ctypes. Go strings are
        # passed as (pointer, length) structs.
        lib = cdll.LoadLibrary("./libtomutil.so")

        class GoString(ctypes.Structure):
            _fields_ = [("p", ctypes.c_char_p), ("n", ctypes.c_longlong)]

        lib.FormatHiCMatrix.argtypes = [GoString, GoString, GoString, GoString]
        hi_c_filename = "hi_c.csv"
        hi_c_df.to_csv(hi_c_filename, header=False)
        hiCMatrixFilename = GoString(hi_c_filename.encode('UTF-8'),
                                     len(hi_c_filename))
        # Write the site names and positions in a format the Go code understands.
        formatted_data = []
        for site in sites_index:
            pos = site[2]
            formatted_data.append([site, pos])
        formatted_data = pd.DataFrame(formatted_data)
        formatted_data_filename = "data_temp.csv"
        formatted_data.to_csv(formatted_data_filename, header=None, index=False)
        dataFilename = GoString(formatted_data_filename.encode('UTF-8'),
                                len(formatted_data_filename))
        outFilename = GoString(distance_cache_name.encode('UTF-8'),
                               len(distance_cache_name))
        resolution_str = GoString(str(resolution).encode('UTF-8'),
                                  len(str(resolution)))
        lib.FormatHiCMatrix(hiCMatrixFilename, dataFilename, outFilename,
                            resolution_str)
    return pd.read_csv(distance_cache_name, index_col=0)
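

# Example usage (sketch; `hi_c` is a hypothetical contact table as dumped by the
# Juicer command above, and `sites` a list of site tuples whose field 2 is the
# genomic position):
#
#     hic_matrix = get_hi_c_matrix(hi_c, sites, resolution=5000)
#     # later runs can reuse the cached CSV:
#     hic_matrix = get_hi_c_matrix(hi_c, sites, use_cached=True,
#                                  distance_cache_name='hic_cache/run1.csv')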


def make_correlation_clustermap(corr_df, cluster_series, index_order,
                                use_special_row_colors=False):
    """Plot a site correlation matrix in the given fixed order, with cluster
    color bars on both axes."""
    ordered_df = corr_df.loc[index_order, index_order]
    ordered_clusters = cluster_series.loc[index_order]
    color_pal = sns.color_palette("Paired", n_colors=ordered_clusters.max() + 1)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()
    colors = [color_pal[x] for x in ordered_clusters]
    hm = sns.clustermap(
        ordered_df,
        vmin=-1, vmax=1, center=0,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        cmap=sns.diverging_palette(20, 220, n=200))
    return hm
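

# Example usage (sketch; `ordered_df` and `site_clusters` as returned by
# make_kmeans_clustermap above):
#
#     corr = beta_values.T.corr()  # site-by-site correlations
#     hm = make_correlation_clustermap(corr, site_clusters, ordered_df.index)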


def make_hic_clustermap(hic_df, cluster_series, index_order,
                        use_special_row_colors=False, use_linear_distances=False):
    """Plot a Hi-C contact (or linear-distance) matrix in a fixed order with
    cluster color bars. The custom white-to-red colormap and value limits
    below are currently disabled in the clustermap call."""
    ordered_df = hic_df.loc[index_order, index_order]
    ordered_clusters = cluster_series.loc[index_order]
    color_pal = sns.color_palette("Paired", n_colors=ordered_clusters.max() + 1)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()
    colors = [color_pal[x] for x in ordered_clusters]
    # White-to-red colormap for contact counts.
    cdict = {'red': ((0.0, 1.0, 1.0),
                     (1.0, 1.0, 1.0)),
             'green': ((0.0, 1.0, 1.0),
                       (1.0, 0.0, 0.0)),
             'blue': ((0.0, 1.0, 1.0),
                      (1.0, 0.0, 0.0))}
    custom_cm = LinearSegmentedColormap("white2red", cdict)
    vmin = 0
    center = 10
    vmax = 20
    if use_linear_distances:
        # Linear distances use the reversed blue map and a wider value range.
        vmin = 0
        center = 5000
        vmax = 10000
        cdict = {'blue': ((0.0, 1.0, 1.0),
                          (1.0, 1.0, 1.0)),
                 'red': ((0.0, 1.0, 1.0),
                         (1.0, 0.0, 0.0)),
                 'green': ((0.0, 1.0, 1.0),
                           (1.0, 0.0, 0.0))}
        custom_cm = LinearSegmentedColormap("white2red", cdict).reversed()
    hm = sns.clustermap(
        ordered_df,
        # vmin=vmin, vmax=vmax, center=center,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        # cmap=custom_cm,
        figsize=(20, 20))
    return hm


def make_hic_heatmap(hic_df):
    """Plot a Hi-C matrix without reordering or color bars. The custom colormap
    and value limits below are currently disabled in the clustermap call."""
    cdict = {'red': ((0.0, 1.0, 1.0),
                     (1.0, 1.0, 1.0)),
             'green': ((0.0, 1.0, 1.0),
                       (1.0, 0.0, 0.0)),
             'blue': ((0.0, 1.0, 1.0),
                      (1.0, 0.0, 0.0))}
    custom_cm = LinearSegmentedColormap("white2red", cdict)
    vmin = 0
    center = 10
    vmax = 20
    hm = sns.clustermap(
        hic_df,
        # vmin=vmin, vmax=vmax, center=center,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        # cmap=custom_cm,
        figsize=(20, 20))
    return hm


def make_mean_dist_clustermap(mean_dist_cluster_df, use_special_row_colors=True,
                              use_linear_distances=False):
    """Plot a cluster-by-cluster mean distance matrix with cluster color bars.
    The custom colormap below is currently disabled in the clustermap call."""
    color_pal = sns.color_palette("Paired", n_colors=11)
    if use_special_row_colors:
        color_pal = get_special_row1_colors()
    colors = [color_pal[x] for x in mean_dist_cluster_df.index.tolist()]
    cdict = {'red': ((0.0, 1.0, 1.0),
                     (1.0, 1.0, 1.0)),
             'green': ((0.0, 1.0, 1.0),
                       (1.0, 0.0, 0.0)),
             'blue': ((0.0, 1.0, 1.0),
                      (1.0, 0.0, 0.0))}
    if use_linear_distances:
        cdict = {'blue': ((0.0, 1.0, 1.0),
                          (1.0, 1.0, 1.0)),
                 'red': ((0.0, 1.0, 1.0),
                         (1.0, 0.0, 0.0)),
                 'green': ((0.0, 1.0, 1.0),
                           (1.0, 0.0, 0.0))}
    custom_cm = LinearSegmentedColormap("white2red", cdict)
    if use_linear_distances:
        custom_cm = custom_cm.reversed()
    hm = sns.clustermap(
        mean_dist_cluster_df,
        col_cluster=False,
        row_cluster=False,
        xticklabels=False, yticklabels=False,
        row_colors=colors,
        col_colors=colors,
        # cmap=custom_cm,
        figsize=(20, 20))
    return hm


def calculate_ppv(order,
                  sample_metadata: pd.DataFrame,
                  sample_col="SampleName",
                  passage_col="Passage",
                  cell_line_col="CellLine",
                  reverse=False):
    """Fraction of same-cell-line sample pairs whose positions in `order` agree
    with their passage-number ordering (set reverse=True to score the opposite
    direction)."""
    ordered_metadata = sample_metadata.sort_values(passage_col)
    grouped_metadata = ordered_metadata.groupby(cell_line_col)
    correct_pairs = 0.0
    total_pairs = 0.0
    for cellline, group in grouped_metadata:
        if len(group) > 1:
            i = 1
            for row, data in group.iterrows():
                # Compare each sample with every later-passage sample in the group.
                for row2, data2 in group[i:].iterrows():
                    total_pairs += 1
                    if (not reverse) and order[data[sample_col]] <= order[
                            data2[sample_col]]:
                        correct_pairs += 1
                    elif reverse and order[data[sample_col]] >= order[
                            data2[sample_col]]:
                        correct_pairs += 1
                    else:
                        print('incorrect pair: ', data[sample_col],
                              order[data[sample_col]], data2[sample_col],
                              order[data2[sample_col]])
                i += 1
    if total_pairs == 0:
        return "total pairs = 0"
    print(total_pairs)
    return correct_pairs / total_pairs
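

# Example usage (sketch; `order` maps each sample name to its position in the
# computed ordering, e.g. the column order of a clustermap):
#
#     order = {name: i for i, name in enumerate(ordered_df.columns)}
#     ppv = calculate_ppv(order, samples, sample_col='SampleName',
#                         passage_col='Passage', cell_line_col='CellLine')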