Commit b39dbb9 (0 parents): 1 changed file with 252 additions and 0 deletions.
@@ -0,0 +1,252 @@
import numpy as np
import random
import pandas as pd

from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def get_raw_train_test_old(dataset, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g., 0.5 and 100
    # model_name is only for DeepMicro use, e.g., AE[20]

    # This function still requires method: scANVI uses raw read-count data with covariates,
    # DeepMicro uses pct data with no covariates, and DCA uses raw read counts with no covariates.

    # The X_train and X_test file names change (no latent-rep specifier).

    save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder  # split_folder already ends with '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
        X_train = pd.read_csv(scanvi_use + 'X_train.csv', header=[0], index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test.csv', header=[0], index_col=0)
        save_folder = scanvi_use + 'Raw/'  # add 'Raw/' to the final save path

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + 'X_train.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + 'X_test.csv', header=None, index_col=False)
        y_train = pd.read_csv(deep_micro_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
        y_test = pd.read_csv(deep_micro_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
        save_folder = deep_micro_use + 'Raw/'

    elif method == 'DCA':
        X_train = pd.read_csv(dca_use + 'X_train.csv', header=[0], index_col=0)
        X_test = pd.read_csv(dca_use + 'X_test.csv', header=[0], index_col=0)
        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
        save_folder = dca_use + 'Raw/'

    # For X_train and y_train only, drop the samples with missing y (NA) because we are running regression.
    # Attach y_train to the X dataframe, drop the NA rows, then separate them again:
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]
    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    y_test = y_test.reset_index()
    y_test = y_test.drop(columns=['index'])

    return X_train, X_test, y_train, y_test, save_folder
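
# A minimal usage sketch for the loader above, kept commented out in the style of this
# file. The dataset/feature values are taken from the commented-out example path further
# below and are illustrative only; they may not match a local folder layout:
#X_train, X_test, y_train, y_test, out_dir = get_raw_train_test_old(
#    dataset='pomp', method='DeepMicro', feature='postdiet_Cholesterol',
#    seed_index=1, model_name='AE[20]')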

def get_raw_train_test(dataset, regression, feature, seed_index):
    # We are using raw pct data, which lives in the DeepMicro path.
    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    raw_use = save_folder + 'Raw_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    # Read each X and y from the GenericSplits folder (the most recent changes to y_train/y_test were made there).
    #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/pomp/postdiet_Cholesterol/GenericSplits/' + split_folder
    #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
    y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

    X_train = pd.read_csv(y_path + 'X_train.csv', header=None, index_col=False)
    X_test = pd.read_csv(y_path + 'X_test.csv', header=None, index_col=False)

    y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
    y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
    save_folder = raw_use

    # For DeepMicro and DCA, the X reps were not updated to include sample ids, so ids exist only for y. ***
    # That would make NA's appear when the ids fail to match, so for now the indexes on the X's are reset. ***
    y_train = y_train.drop(columns=['index'])
    y_test = y_test.drop(columns=['index'])

    # For X_train and y_train only, drop the samples with missing y (NA) because we are running regression.
    # Attach y_train to the X dataframe, drop the NA rows, then separate them again:
    if regression == 'logistic':
        X_train['y'] = y_train[feature]
    else:
        X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    X_test['y'] = y_test[feature]
    #X_test = X_test[X_test['y'].notna()]

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test, save_folder
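
# A minimal usage sketch, assuming the backup path above exists locally. 'linear' is a
# placeholder: the function only branches on whether regression == 'logistic':
#X_train, X_test, y_train, y_test, out_dir = get_raw_train_test(
#    dataset='pomp', regression='linear', feature='postdiet_Cholesterol', seed_index=1)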

def get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g., 0.5 and 100
    # model_name is only for DeepMicro use, e.g., AE[20]

    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    #save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    save_folder = 'Master_Results/'  # cluster path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder  # split_folder already ends with '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)

        scanvi_use = scanvi_use + 'Alpha' + str(alpha_scanvi) + '_' + str(num_epochs) + 'epochs/'  # add the alpha setting and epoch count to the final save path
        X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=0)
        save_folder = scanvi_use

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + model_name + '_X_train_rep.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + model_name + '_X_test_rep.csv', header=None, index_col=False)
        # For the y's, read from the GenericSplits folder (the most recent changes to y_train/y_test were made there).
        #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/pomp/postdiet_Cholesterol/GenericSplits/' + split_folder
        #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
        y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

        y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
        y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
        save_folder = deep_micro_use

        # For DeepMicro and DCA, the X reps were not updated to include sample ids, so ids exist only for y. ***
        # That would make NA's appear since the ids won't match, so for now the indexes on the X's are reset. ***
        y_train = y_train.drop(columns=['index'])
        y_test = y_test.drop(columns=['index'])

    elif method == 'DCA':
        X_train = pd.read_csv(dca_use + 'latent.tsv', sep='\t', header=None, index_col=0).reset_index()  # sep='\t' for the TSV file
        X_test = pd.read_csv(dca_use + 'latent_X_test.tsv', sep='\t', header=None, index_col=0).reset_index()

        X_train = X_train.iloc[:, 1:]  # drop the index column that reset_index() moved into position 0
        X_test = X_test.iloc[:, 1:]

        ## Scaling previously had to be added here or a ValueError ('too large') was triggered:
        ## get the column names first, create the scaler, then fit-transform the data.
        #names = X_train.columns
        #scaler = preprocessing.StandardScaler()
        #scaled_df = scaler.fit_transform(X_train)
        #X_train = pd.DataFrame(scaled_df, columns=names)

        #names = X_test.columns
        #scaler = preprocessing.StandardScaler()
        #scaled_df = scaler.fit_transform(X_test)
        #X_test = pd.DataFrame(scaled_df, columns=names)

        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
        save_folder = dca_use

        #X_train = X_train.drop(columns=['index'])
        #X_test = X_test.drop(columns=['index'])
        y_train = y_train.drop(columns=['index'])
        y_test = y_test.drop(columns=['index'])

    # For X_train and y_train only, drop the samples with missing y (NA) because we are running regression.
    # Attach y_train to the X dataframe, drop the NA rows, then separate them again:
    if regression == 'logistic' and method == 'scanvi':  # only scANVI stores one-hot encoded y values; the other methods already store labels
        y_train = convert_labels_to_assignment(y_train, dataset, feature)
        y_test = convert_labels_to_assignment(y_test, dataset, feature)
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    # Mirror the exact handling of X_train for X_test:
    X_test['y'] = y_test
    X_test = X_test[X_test['y'].notna()]

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    ## Earlier experiments, kept for reference: min-max scaling all features (with the y
    ## column split off first so y itself is not scaled), resetting the y_test index before
    ## combining so NA's don't appear where indexes differ, and remapping the age labels, e.g.:
    #y_train = y_train.map({6: 0, 12: 1, 18: 2})
    #y_test = y_test.map({6: 0, 12: 1, 18: 2})
    return X_train, X_test, y_train, y_test, save_folder
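
# A minimal usage sketch for the latent-representation loader, assuming the cluster layout
# under 'Master_Results/' described above; the alpha/epoch values mirror the examples in
# the comments (0.5, 100), and 'doma'/'Age' come from convert_labels_to_assignment below:
#X_train, X_test, y_train, y_test, out_dir = get_latent_train_test(
#    dataset='doma', regression='logistic', method='scanvi', feature='Age',
#    seed_index=1, alpha_scanvi=0.5, num_epochs=100)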

def convert_labels_to_assignment(ypred, dataset, feature):
    true_labels = []
    for i in range(0, ypred.shape[0]):  # select the max of each row
        max_index = np.argmax(ypred.iloc[i, :])
        if dataset == 'doma' and feature == 'Age':  # ages 6, 12, 18 months
            if max_index == 0:
                max_index = 6
            elif max_index == 1:
                max_index = 12
            elif max_index == 2:
                max_index = 18
        true_labels.append(max_index)
    return true_labels
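
# A quick sanity-check sketch for the converter above; the one-hot rows below are made up
# for illustration (row 0 argmax is column 1 -> 12 months, row 1 argmax is column 0 -> 6):
#demo = pd.DataFrame([[0.1, 0.7, 0.2], [0.8, 0.1, 0.1]])
#print(convert_labels_to_assignment(demo, 'doma', 'Age'))  # expected: [12, 6]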