From b39dbb9ae56cbf72e3de293fc2ea829d1a701d01 Mon Sep 17 00:00:00 2001
From: Rigel Mahmood
Date: Mon, 19 Feb 2024 14:05:25 -0500
Subject: [PATCH] Add files via upload

---
 Dataloader_All.py | 252 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 Dataloader_All.py

diff --git a/Dataloader_All.py b/Dataloader_All.py
new file mode 100644
index 0000000..210dc04
--- /dev/null
+++ b/Dataloader_All.py
@@ -0,0 +1,252 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler  # used by the scaling sketch below
+
+
+def get_raw_train_test_old(dataset, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
+    # alpha_scanvi and num_epochs are only used for scANVI, e.g. 0.5 and 100.
+    # model_name is only used for DeepMicro, e.g. AE[20].
+
+    # This function still needs `method` because each method reads different inputs:
+    #   scanvi    = raw read-count data with covariates
+    #   DeepMicro = percentage data, no covariates
+    #   DCA       = raw read-count data, no covariates
+    # The X_train/X_test file names also differ (no latent-rep specifier here).
+
+    save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
+    split_folder = 'Split_' + str(seed_index) + '/'
+    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
+    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder
+    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
+
+    if method == 'scanvi':
+        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
+        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
+        X_train = pd.read_csv(scanvi_use + 'X_train.csv', header=[0], index_col=0)
+        X_test = pd.read_csv(scanvi_use + 'X_test.csv', header=[0], index_col=0)
+        save_folder = scanvi_use + 'Raw/'  # add 'Raw' to the final save path
+    elif method == 'DeepMicro':
+        X_train = pd.read_csv(deep_micro_use + 'X_train.csv', header=None, index_col=False)
+        X_test = pd.read_csv(deep_micro_use + 'X_test.csv', header=None, index_col=False)
+        y_train = pd.read_csv(deep_micro_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
+        y_test = pd.read_csv(deep_micro_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
+        save_folder = deep_micro_use + 'Raw/'
+    elif method == 'DCA':
+        X_train = pd.read_csv(dca_use + 'X_train.csv', header=[0], index_col=0)
+        X_test = pd.read_csv(dca_use + 'X_test.csv', header=[0], index_col=0)
+        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
+        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
+        save_folder = dca_use + 'Raw/'
+
+    # For X_train/y_train only, drop samples with a missing target (NA), since we are running regression.
+    # Join y onto the X dataframe, drop the NA rows, then split them apart again:
+    X_train['y'] = y_train
+    X_train = X_train[X_train['y'].notna()]
+    y_train = X_train['y']
+    X_train = X_train.drop(columns=['y'])
+
+    y_test = y_test.reset_index(drop=True)
+
+    return X_train, X_test, y_train, y_test, save_folder
+
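+# A minimal sketch (my addition, not part of the original pipeline): the
+# per-method folders above all share one layout, so a hypothetical helper like
+# build_split_path() could construct them. The folder names ('scANVI_Use',
+# 'DeepMicro_Use', 'DCA_Use') are the ones used in this file; the helper itself
+# is illustrative and is not called by the loaders below.
+def build_split_path(root, method_dir, dataset, feature, seed_index, model_name=None):
+    # e.g. build_split_path(root, 'DCA_Use', ds, ft, 0) -> root + 'DCA_Use/<ds>/<ft>/Split_0/'
+    parts = [root + method_dir, str(dataset), str(feature)]
+    if model_name:  # only DeepMicro nests an extra model folder, e.g. 'AE[20]'
+        parts.append(str(model_name))
+    return '/'.join(parts) + '/Split_' + str(seed_index) + '/'
+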
+def get_raw_train_test(dataset, regression, feature, seed_index):
+    # We use the raw percentage data, which lives under the DeepMicro path.
+    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
+    save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/' #for backup path $$$
+    split_folder = 'Split_' + str(seed_index) + '/'
+    raw_use = save_folder + 'Raw_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
+
+    # Read each X and y from the GenericSplits folder (the most recent changes to
+    # y_train/y_test live there). Built from save_folder so it follows whichever
+    # results root is active above:
+    split_path = save_folder + 'DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
+
+    X_train = pd.read_csv(split_path + 'X_train.csv', header=None, index_col=False)
+    X_test = pd.read_csv(split_path + 'X_test.csv', header=None, index_col=False)
+
+    # The DeepMicro and DCA X reps were never updated to carry sample ids, so ids
+    # only exist on the y's. Mismatched ids would introduce NA's when y is joined
+    # onto X below, so reset the y indexes to plain positions:
+    y_train = pd.read_csv(split_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+    y_test = pd.read_csv(split_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+    save_folder = raw_use
+
+    # For X_train/y_train only, drop samples with a missing target (NA), since we are running regression.
+    # Join y onto the X dataframe, drop the NA rows, then split them apart again:
+    if regression == 'logistic':
+        X_train['y'] = y_train[feature]
+    else:
+        X_train['y'] = y_train
+    X_train = X_train[X_train['y'].notna()]
+    y_train = X_train['y']
+    X_train = X_train.drop(columns=['y'])
+
+    X_test['y'] = y_test[feature]
+    #X_test = X_test[X_test['y'].notna()]  # NA test samples are intentionally kept for now
+    y_test = X_test['y']
+    X_test = X_test.drop(columns=['y'])
+
+    return X_train, X_test, y_train, y_test, save_folder
+
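+# A minimal sketch of the join-filter-split idiom used above: attach y to X as a
+# column, drop rows whose target is missing, then separate the two again. This
+# hypothetical helper is my addition for illustration; the loaders in this file
+# inline the same steps rather than calling it.
+def drop_missing_targets(X, y):
+    X = X.copy()
+    X['y'] = np.asarray(y).ravel()  # positional assignment, mirroring the inline code
+    X = X[X['y'].notna()]
+    return X.drop(columns=['y']), X['y']
+
+# Usage would look like: X_train, y_train = drop_missing_targets(X_train, y_train)
+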
+def get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
+    # alpha_scanvi and num_epochs are only used for scANVI, e.g. 0.5 and 100.
+    # model_name is only used for DeepMicro, e.g. AE[20].
+
+    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
+    #save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/' #for backup path $$$
+    save_folder = 'Master_Results/' #for cluster path $$$
+    split_folder = 'Split_' + str(seed_index) + '/'
+    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
+    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder
+    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
+
+    if method == 'scanvi':
+        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
+        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
+
+        # Add the alpha setting and epoch count to the final save path:
+        scanvi_use = scanvi_use + 'Alpha' + str(alpha_scanvi) + '_' + str(num_epochs) + 'epochs/'
+        X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=0)
+        X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=0)
+        save_folder = scanvi_use
+
+    if method == 'DeepMicro':
+        X_train = pd.read_csv(deep_micro_use + model_name + '_X_train_rep.csv', header=None, index_col=False)
+        X_test = pd.read_csv(deep_micro_use + model_name + '_X_test_rep.csv', header=None, index_col=False)
+
+        # Read the y's from the GenericSplits folder (the most recent changes to
+        # y_train/y_test live there). Built from save_folder so it follows
+        # whichever results root is active above:
+        split_path = save_folder + 'DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
+
+        # The DeepMicro X reps were never updated to carry sample ids, so ids only
+        # exist on the y's. Mismatched ids would introduce NA's when y is joined
+        # onto X below, so reset the y indexes to plain positions:
+        y_train = pd.read_csv(split_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+        y_test = pd.read_csv(split_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+        save_folder = deep_micro_use
+
+    if method == 'DCA':
+        # The DCA latents are TSV files; index_col=0 plus reset_index(drop=True)
+        # discards the id column and leaves a plain positional index:
+        X_train = pd.read_csv(dca_use + 'latent.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)
+        X_test = pd.read_csv(dca_use + 'latent_X_test.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)
+
+        # Note: at one point the unscaled latents triggered a ValueError ('too large')
+        # downstream unless they were standard-scaled first; that scaling is currently
+        # disabled (see the leakage-free sketch after this function).
+
+        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index(drop=True)
+        save_folder = dca_use
+
+    # Drop samples with a missing target (NA), since we are running regression:
+    # join y onto the X dataframe, drop the NA rows, then split them apart again.
+    if regression == 'logistic' and method == 'scanvi':  # only scanvi stores one-hot y's; the others already store labels
+        y_train = convert_labels_to_assignment(y_train, dataset, feature)
+        y_test = convert_labels_to_assignment(y_test, dataset, feature)
+
+    X_train['y'] = y_train
+    X_train = X_train[X_train['y'].notna()]
+    y_train = X_train['y']
+    X_train = X_train.drop(columns=['y'])
+
+    # Mirror the exact X_train handling for X_test:
+    X_test['y'] = y_test
+    X_test = X_test[X_test['y'].notna()]
+    y_test = X_test['y']
+    X_test = X_test.drop(columns=['y'])
+
+    return X_train, X_test, y_train, y_test, save_folder
+
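+# A minimal sketch (my addition) of the standard-scaling that the DCA branch
+# above used as a workaround for the ValueError. Unlike the old commented-out
+# code, which fit a separate scaler on the test set, this fits on the training
+# set only and reuses that fit for the test set, avoiding train/test leakage.
+# It is not currently called anywhere in this file.
+def scale_latents(X_train, X_test):
+    scaler = StandardScaler()
+    # Fit on the training latents only, then apply the same transform to both splits.
+    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
+    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
+    return X_train_scaled, X_test_scaled
+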
+def convert_labels_to_assignment(ypred, dataset, feature):
+    # Collapse one-hot (or probability) rows to the argmax class index,
+    # remapping class indices back to the original label values where needed.
+    age_map = {0: 6, 1: 12, 2: 18}  # doma ages: 6, 12, 18 months
+    true_labels = []
+    for i in range(ypred.shape[0]):  # select the max of each row
+        max_index = int(np.argmax(ypred.iloc[i, :]))
+        if dataset == 'doma' and feature == 'Age':
+            max_index = age_map.get(max_index, max_index)
+        true_labels.append(max_index)
+    return true_labels
+
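+# A hypothetical smoke test (my addition): the dataset/feature names come from
+# convert_labels_to_assignment() above and the alpha/epoch values from the
+# docstring comments, but whether these splits exist on disk is an assumption.
+if __name__ == '__main__':
+    X_train, X_test, y_train, y_test, save_folder = get_latent_train_test(
+        dataset='doma', regression='logistic', method='scanvi',
+        feature='Age', seed_index=0, alpha_scanvi=0.5, num_epochs=100)
+    # Each loader returns matched X/y splits plus the folder where results should be saved.
+    print(X_train.shape, X_test.shape, len(y_train), len(y_test), save_folder)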