Skip to content
Permalink
2e6bcacfa8
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
742 lines (598 sloc) 24.4 KB
import math
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
import matplotlib.patches as mpatches
from matplotlib.colors import LinearSegmentedColormap
import datetime
from ctypes import cdll
import ctypes
# from gmeans import GMeans
sns.set()
def get_clusters(data,
sample_data=None,
mean_methylation=None,
n_clusters=11,
recursive=False,
n_init=100,
max_iter=1000):
clusters, cluster_named_dict = get_ordered_clusters(
data,
mean_methylation=mean_methylation,
n_clusters=n_clusters,
sample_data=sample_data,
n_init=n_init,
max_iter=max_iter)
clusters_row = pd.Series(clusters, name='cluster', index=data.columns)
if recursive:
# data = data.append(clusters_row)
grouped = data.groupby(by=clusters_row, axis=1)
subcluster_series = pd.Series(name='subcluster')
for name, group in grouped:
# gm = GMeans(min_obs=2, max_depth=500, strictness=3)
# group = group.drop('cluster')
# gm.fit(group.T)
subcluster_linkage = linkage(group.T, optimal_ordering=True, method='average', metric="correlation")
linkage_order = dendrogram(
subcluster_linkage,
distance_sort='ascending',
no_plot=True,
labels=group.columns)['ivl']
linkage_subclusters = {}
for i, x in enumerate(linkage_order):
linkage_subclusters[x] = i
# n_subclusters = len(np.unique(gm.labels_))
# subclusters, subcluster_named_dict = get_ordered_clusters(
# group,
# mean_methylation=mean_methylation,
# n_clusters=n_subclusters,
# clusters=gm.labels_,
# n_init=n_init,
# max_iter=max_iter)
subcluster_series = subcluster_series.append(
pd.Series(linkage_subclusters, index=group.columns))
return clusters_row, subcluster_series
return clusters_row
# def get_correlation_clusters(data,
# sample_data=None,
# mean_methylation=None,
# n_clusters=11,
# recursive=False,
# n_init=100,
# max_iter=1000):
# clusters = KMeans(n_clusters=n_clusters,
# n_init=n_init,
# max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.corr())
# clusters_row = pd.Series(clusters, name='cluster', index=data.columns)
# return clusters_row
def get_ordered_clusters(data,
n_clusters=None,
mean_methylation=None,
sample_data=None,
n_init=100,
max_iter=1000,
clusters=None,
return_kmeans_estimator=False):
#get weights by frequency of cell_line in the dataset
if clusters is None:
if sample_data is None:
kmeans_estimator = KMeans(n_clusters=n_clusters,
n_init=n_init,
max_iter=max_iter,n_jobs=-1,init='random').fit(data.T)
clusters = kmeans_estimator.predict(data.T)
# KMeans(n_clusters=n_clusters,
# n_init=n_init,
# max_iter=max_iter,n_jobs=-1,init='random').fit_predict(data.T)
else:
sample_data_subset = sample_data.loc[data.columns.tolist(), :]
cell_lines = sample_data_subset['cell_line']
counts = cell_lines.value_counts()
cell_line_weights = 1 / cell_lines.map(counts)
kmeans_estimator = KMeans(n_clusters=n_clusters,
n_init=n_init,
max_iter=max_iter,n_jobs=-1,init='random').fit(
data.T, sample_weight=cell_line_weights)
clusters = kmeans_estimator.predict(data.T)
cluster_names_dict = {}
if mean_methylation is None:
mean_methylation = pd.DataFrame(data.mean())
mean_methylation = MinMaxScaler(
feature_range=(0, 5)).fit_transform(mean_methylation)
for cluster in list(set(clusters)):
cluster_names_dict[cluster] = np.mean(
mean_methylation[clusters == cluster])
sorted_x = sorted(cluster_names_dict.items(), key=lambda kv: kv[1])
sorted_x.reverse()
cluster_map_dict = {}
cluster_names_dict_fixed = {}
clusters_fixed = [0 for x in clusters]
for i, sorted_tup in enumerate(sorted_x):
for j, clust in enumerate(clusters):
if clust == sorted_tup[0]:
clusters_fixed[j] = i
if i == 0:
cluster_names_dict_fixed[i] = 'Non-Eroded'
elif i == len(sorted_x) - 1:
cluster_names_dict_fixed[i] = 'Eroded'
else:
cluster_names_dict_fixed[i] = sorted_tup[1]
cluster_map_dict[sorted_tup[0]] = i
clusters = np.array(clusters_fixed)
# cluster_names_dict = cluster_names_dict_fixed
if return_kmeans_estimator:
return clusters, cluster_names_dict_fixed, kmeans_estimator, cluster_map_dict
return clusters, cluster_names_dict_fixed
def biplot(dat: pd.DataFrame,
clf,
clusters,
cluster_names_dict,
mean_methylation=None,
title='LDA of dataset'):
lda_embedding = clf.transform(dat.values)
fig = plt.figure()
for cluster in list(set(clusters)):
cluster_x = lda_embedding[clusters == cluster, 0]
if lda_embedding.shape[1] == 1:
cluster_y = [1] * len(cluster_x)
else:
cluster_y = lda_embedding[clusters == cluster, 1]
plt.scatter(cluster_x,
cluster_y,
color=sns.color_palette(n_colors=11)[cluster],
label=cluster_names_dict[cluster])
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title(title)
explained = clf.explained_variance_ratio_
plt.xlabel('LDA 1 ({:.1%})'.format(explained[0]))
if lda_embedding.shape[1] == 1:
plt.ylabel('No LDA 2')
else:
plt.ylabel('LDA 2 ({:.1%})'.format(explained[1]))
return fig
def detect_outlier(data_1, threshold=1.645, mean=None, only_positive=False):
mean_1 = np.mean(data_1)
if mean is not None:
mean_1 = mean
std_1 = np.std(data_1)
z_scores = (data_1 - mean_1) / std_1
outliers = np.abs(z_scores) >= threshold
if only_positive:
outliers = z_scores >= threshold
return outliers
def filter_data(data, clf, num_sites_wanted=70):
scalings_0 = clf.scalings_[:, 0]
scalings_1 = clf.scalings_[:, 1]
outliers_0 = detect_outlier(scalings_0, threshold=1.645) #top 10%
outliers_1 = detect_outlier(scalings_1, threshold=3) #top 1%
combined_indicies = np.logical_or(outliers_0, outliers_1)
# combined_indicies = outliers_0
filtered_data = data.loc[combined_indicies, ]
return filtered_data
def filter_data_coef(data, clf):
coefs = clf.coef_
all_outliers = np.abs(coefs[0]) >= 50 #detect_outlier(coefs[0], threshold=2, only_positive=False) #top 5%
for c in coefs[1:]:
outliers = np.abs(c) >= 50 #detect_outlier(c, threshold=2, only_positive=False) #top 5%
if np.sum(c[outliers]>0) != np.sum(outliers):
print('error error')
else:
print('all good')
all_outliers = np.logical_or(all_outliers, outliers)
filtered_data = data.loc[all_outliers, ]
return filtered_data
def run_feature_selection(data, mean_methylation, clusters,
cluster_names_dict, use_coef=False):
# column_linkage = linkage(data.T, optimal_ordering=True, method='average')
clf = LinearDiscriminantAnalysis(n_components=2)
clf.fit(data.T, clusters)
biplot(dat=data.T,
clf=clf,
clusters=clusters,
cluster_names_dict=cluster_names_dict,
mean_methylation=mean_methylation,
title='LDA of more variant sites')
print('lda explained ratios: ', clf.explained_variance_ratio_)
#get the significant sites
if use_coef:
filtered_data = filter_data_coef(data, clf)
else:
filtered_data = filter_data(data, clf)
return filtered_data, clf
def make_cluster_map(data,
w=12,
h=36,
y_font_size=4,
x_font_size=8,
col_cluster=True,
row_cluster=True,
row_linkage=None,
col_linkage=None,
row_colors=None,
col_colors=None,
method="centroid",
metric="euclidean",
xticklabels="auto",
yticklabels="auto"):
sns.set()
sns.set_style({'font.sans-serif': 'Roboto'})
#the colors stefan wanted
cpal = sns.diverging_palette(180, 300, s=99, l=65, n=9)
cmap = ListedColormap(cpal.as_hex())
hm = sns.clustermap(
data,
cmap=cmap,
method=method,
metric=metric,
# method="ward",
# metric='cityblock',
vmin=0, vmax=1,
figsize=(w, h),
col_cluster=col_cluster,
row_cluster=row_cluster,
row_linkage=row_linkage,
col_linkage=col_linkage,
row_colors=row_colors,
col_colors=col_colors,
xticklabels=xticklabels,
yticklabels=yticklabels )
hm.ax_heatmap.tick_params(axis='y', labelsize=y_font_size)
hm.ax_heatmap.tick_params(axis='x', labelsize=x_font_size)
return hm
def get_special_row1_colors():
paired_palette = sns.color_palette("Paired",n_colors=10)
row_color1 = [
paired_palette[0],
paired_palette[2],
paired_palette[4],
paired_palette[6],
paired_palette[8],
paired_palette[1],
paired_palette[3],
paired_palette[5],
paired_palette[7],
paired_palette[9],
(0,0,0),
]
return row_color1
def make_kmeans_clustermap(df_to_plot,
sample_data,
mean_methylation=None,
n_clusters_samples=6,
col_clusters=None,
site_cluster_series=None,
site_ordering=None,
n_clusters_sites=11,
genes_to_mark=['XIST','LOC286467','PLS3'],
plot_title="",
w=12,
h=36,
y_font_size=4,
x_font_size=8,
use_special_row_colors=False,
hide_x_labels=False,
hide_y_labels=False,
gene_idx=3):
if mean_methylation is None:
mean_methylation = pd.DataFrame(df_to_plot.mean())
mean_methylation = MinMaxScaler(
feature_range=(0, 5)).fit_transform(mean_methylation)
mean_methylation_row = pd.Series(df_to_plot.mean(),
name='mean_methylation')
# sample_linkage = linkage(df_to_plot.T, optimal_ordering=True, method='average', metric='correlation')
# linkage_order = dendrogram(
# sample_linkage,
# distance_sort='descending',
# no_plot=True,
# labels=df_to_plot.columns)['ivl']
# sample_ordering = {}
# for i, x in enumerate(linkage_order):
# sample_ordering[x] = i
# mean_methylation_row = pd.Series(sample_ordering,
# name='mean_methylation')
if col_clusters is None:
clusters, cluster_names_dict = get_clusters(df_to_plot,
mean_methylation,
n_clusters_samples,
sample_data=sample_data)
else:
clusters = col_clusters
clusters_row = pd.Series(clusters,
name='clusters',
index=df_to_plot.columns)
df_copy = df_to_plot
df_copy = df_copy.append(clusters_row).append(mean_methylation_row)
df_copy = df_copy.sort_values(by=['clusters', 'mean_methylation'],
ascending=[True, False],
axis=1)
df_copy = df_copy.drop('clusters').drop('mean_methylation')
# if site_ordering is None:
site_linkage = linkage(df_copy, optimal_ordering=True, method='average')
linkage_order = dendrogram(
site_linkage,
distance_sort='ascending',
no_plot=True,
labels=df_copy.index)['ivl']
site_ordering = {}
linkage_order.reverse()
for i, x in enumerate(linkage_order):
site_ordering[x] = i
clusters_col = []
if site_cluster_series is None:
clusters_col = get_clusters(df_copy.T.corr(), n_clusters=n_clusters_sites, mean_methylation=df_copy.T.mean())
elif site_cluster_series is not False:
clusters_col = site_cluster_series
row_colors = []
if site_cluster_series is not False:
df_copy['cluster'] = pd.Series(clusters_col, name='cluster')
df_copy['cluster2'] = pd.Series(site_ordering, name='cluster2')
df_copy = df_copy.sort_values(by=['cluster','cluster2'],ascending=[True, True], axis=0)
df_copy = df_copy.drop(['cluster', 'cluster2'], axis=1)
if use_special_row_colors:
row1_pal = get_special_row1_colors()
else:
row1_pal = sns.color_palette("Paired",n_colors=n_clusters_sites)
row_color1 = [ row1_pal[int(x)] for x in clusters_col.loc[df_copy.index].tolist()]
row_colors = [row_color1]
# site_clusters = fcluster(site_linkage, t=n_clusters_sites, criterion='maxclust')
# site_cluster_series = pd.Series(site_clusters, index=df_to_plot.index)
if genes_to_mark != [] and genes_to_mark is not None:
row_color2, gene_color_legend = get_row_colors(df_copy, genes_to_mark, gene_idx=gene_idx)
row_colors.append(row_color2)
xticklabels="auto"
yticklabels="auto"
if hide_x_labels:
xticklabels=False
if hide_y_labels:
yticklabels=False
row_cluster=False
if not isinstance(site_cluster_series, pd.Series) and site_cluster_series == False:
row_cluster = True
hm = make_cluster_map(
df_copy,
col_colors=[
sns.color_palette("Set2",n_colors=len(clusters_row.unique()))[x]
for x in clusters_row.sort_values().tolist()
],
col_cluster=False,
row_colors= row_colors,
row_cluster=row_cluster,
w=w,
h=h,
y_font_size=y_font_size,
x_font_size=x_font_size,
xticklabels=xticklabels,
yticklabels=yticklabels)
if genes_to_mark != [] and genes_to_mark is not None:
l2 = hm.ax_col_dendrogram.legend(loc='upper left',
# bbox_to_anchor=(1.01,0.85),
handles=gene_color_legend,
frameon=True)
l2.set_title(title='Genes',prop={'size':10})
hm.fig.suptitle(plot_title)
return hm, df_copy, clusters_col
def get_row_colors(ordered_df, genes_to_mark, gene_idx=3):
row_color = []
gene_colors = sns.color_palette('Set1', n_colors=len(genes_to_mark))
gene_color_legend = [mpatches.Patch(color=gene_colors[i], label=g) for i,g in enumerate(genes_to_mark)]
gene_counts = [0] * len(genes_to_mark)
for x in ordered_df.index:
genes = str(x[gene_idx])
found = False
for i,g in enumerate(genes_to_mark):
if g in genes:
row_color.append(gene_colors[i])
gene_counts[i] += 1
found = True
break
if not found:
row_color.append('white')
for i,g in enumerate(genes_to_mark):
print(g, ' - ', gene_counts[i])
return row_color, gene_color_legend
def round_up_to_multiple(number, multiple):
num = number + (multiple - 1)
return num - (num % multiple)
'''
Get the HiC stuff from juicer
cd /mnt/c/Users/Prakhar\ Bansal/Downloads
java -jar juicer_tools_1.11.04_jcuda.0.8.jar dump observed VC GSM3576787_iPSCORE_2_1.iPSC.hic X X BP 5000 GSM3576787_iPSCORE_2_1_chrX_chrX_5kb.txt
'''
def get_hi_c_matrix(
hi_c_df: pd.DataFrame,
sites_index: list,
distance_cache_name=None,
use_cached=False,
resolution=1000):
if distance_cache_name is None and not use_cached:
distance_cache_name = "hic_cache/hi_c_cache_{}.csv".format(datetime.datetime.now())
elif distance_cache_name is None and use_cached:
raise ValueError('If you want to use a cached table, please specify distance_cache_name')
if not use_cached:
#access go function
lib = cdll.LoadLibrary("./libtomutil.so")
class GoString(ctypes.Structure):
_fields_ = [("p", ctypes.c_char_p), ("n", ctypes.c_longlong)]
lib.FormatHiCMatrix.argtypes = [GoString, GoString, GoString, GoString]
hi_c_filename = "hi_c.csv"
hi_c_df.to_csv(hi_c_filename, header=False)
hiCMatrixFilename = GoString(hi_c_filename.encode('UTF-8'), len(hi_c_filename))
#write the site names in a understandable format
formatted_data = []
for site in sites_index:
pos = site[2]
formatted_data.append([site, pos])
formatted_data = pd.DataFrame(formatted_data, columns = None, index=None)
formatted_data_filename = "data_temp.csv"
formatted_data.to_csv(formatted_data_filename, header=None, index=False)
dataFilename = GoString(formatted_data_filename.encode('UTF-8'), len(formatted_data_filename))
outFilename = GoString(distance_cache_name.encode('UTF-8'), len(distance_cache_name))
resolution_str = GoString(str(resolution).encode('UTF-8'), len(str(resolution)))
lib.FormatHiCMatrix(hiCMatrixFilename, dataFilename, outFilename, resolution_str)
return pd.read_csv(distance_cache_name, index_col=0)
else:
return pd.read_csv(distance_cache_name, index_col=0)
def make_correlation_clustermap(corr_df, cluster_series, index_order,use_special_row_colors=False):
ordered_df = corr_df.loc[index_order, index_order]
ordered_clusters = cluster_series.loc[index_order]
mask = np.zeros_like(ordered_df)
mask[np.triu_indices_from(mask)] = True
color_pal = sns.color_palette("Paired",n_colors=ordered_clusters.max()+1)
if use_special_row_colors:
color_pal = get_special_row1_colors()
colors = [color_pal[x] for x in ordered_clusters]
# print(colors)
hm = sns.clustermap(
ordered_df,
vmin=-1, vmax=1,center=0,
col_cluster=False,
row_cluster=False,
xticklabels=False, yticklabels=False,
row_colors=colors,
col_colors=colors,
cmap=sns.diverging_palette(20, 220, n=200))
return hm
def make_hic_clustermap(hic_df, cluster_series, index_order, use_special_row_colors=False, use_linear_distances=False):
ordered_df = hic_df.loc[index_order, index_order]
ordered_clusters = cluster_series.loc[index_order]
mask = np.zeros_like(ordered_df)
mask[np.triu_indices_from(mask)] = True
color_pal = sns.color_palette("Paired",n_colors=ordered_clusters.max()+1)
if use_special_row_colors:
color_pal = get_special_row1_colors()
colors = [color_pal[x] for x in ordered_clusters]
# print(colors)
cdict = {'red': ((0.0, 1.0, 1.0),
(1.0, 1.0, 1.0)),
'green': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0)),
'blue': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0))}
custom_cm = LinearSegmentedColormap(
"white2red", cdict)
vmin=0
center=10
vmax=20
if use_linear_distances:
vmin=0
center=5000
vmax=10000
cdict = {'blue': ((0.0, 1.0, 1.0),
(1.0, 1.0, 1.0)),
'red': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0)),
'green': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0))}
custom_cm = LinearSegmentedColormap(
"white2red", cdict)
custom_cm = custom_cm.reversed()
hm = sns.clustermap(
ordered_df,
# vmin=vmin, vmax=vmax, center=center,
col_cluster=False,
row_cluster=False,
xticklabels=False, yticklabels=False,
row_colors=colors,
col_colors=colors,
# cmap=custom_cm,
figsize=(20,20))
return hm
def make_hic_heatmap(hic_df):
# print(colors)
cdict = {'red': ((0.0, 1.0, 1.0),
(1.0, 1.0, 1.0)),
'green': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0)),
'blue': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0))}
custom_cm = LinearSegmentedColormap(
"white2red", cdict)
vmin=0
center=10
vmax=20
hm = sns.clustermap(
hic_df,
# vmin=vmin, vmax=vmax, center=center,
col_cluster=False,
row_cluster=False,
xticklabels=False, yticklabels=False,
# cmap=custom_cm,
figsize=(20,20))
return hm
def make_mean_dist_clustermap(mean_dist_cluster_df, use_special_row_colors=True, use_linear_distances=False):
# ordered_df = hic_df.loc[index_order, index_order]
# ordered_clusters = cluster_series.loc[index_order]
# mask = np.zeros_like(ordered_df)
# mask[np.triu_indices_from(mask)] = True
color_pal = sns.color_palette("Paired",n_colors=11)
if use_special_row_colors:
color_pal = get_special_row1_colors()
colors = [color_pal[x] for x in mean_dist_cluster_df.index.tolist()]
# print(colors)
cdict = {'red': ((0.0, 1.0, 1.0),
(1.0, 1.0, 1.0)),
'green': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0)),
'blue': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0))}
if use_linear_distances:
cdict = {'blue': ((0.0, 1.0, 1.0),
(1.0, 1.0, 1.0)),
'red': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0)),
'green': ((0.0, 1.0, 1.0),
(1.0, 0.0, 0.0))}
custom_cm = LinearSegmentedColormap(
"white2red", cdict)
if use_linear_distances:
custom_cm = custom_cm.reversed()
hm = sns.clustermap(
mean_dist_cluster_df,
# vmin=vmin, vmax=vmax, center=center,
col_cluster=False,
row_cluster=False,
xticklabels=False, yticklabels=False,
row_colors=colors,
col_colors=colors,
# cmap=custom_cm,
figsize=(20,20))
return hm
def calculate_ppv(order,
sample_metadata: pd.DataFrame,
sample_col="SampleName",
passage_col="Passage",
cell_line_col="CellLine",
reverse=False):
ordered_metadata = sample_metadata.sort_values(passage_col)
grouped_metadata = ordered_metadata.groupby(cell_line_col)
correct_pairs = 0.0
total_pairs = 0.0
for cellline, group in grouped_metadata:
num_rows = len(group)
if num_rows > 1:
i = 1
for row, data in group.iterrows():
for row2, data2 in group[i:].iterrows():
total_pairs += 1
if (not reverse) and order[data[sample_col]] <= order[
data2[sample_col]]:
correct_pairs += 1
elif reverse and order[data[sample_col]] >= order[
data2[sample_col]]:
correct_pairs += 1
else:
print('incorrect pair: ', data[sample_col],
order[data[sample_col]], data2[sample_col],
order[data2[sample_col]])
i += 1
if total_pairs == 0:
return "total pairs = 0"
print(total_pairs)
return correct_pairs / total_pairs