diff --git a/Dataloader_All.py b/Dataloader_All.py
deleted file mode 100644
index 210dc04..0000000
--- a/Dataloader_All.py
+++ /dev/null
@@ -1,252 +0,0 @@
-import numpy as np
-import random
-import pandas as pd
-
-from sklearn import preprocessing
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
-
-
-def get_raw_train_test_old(dataset, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
-    # alpha_scanvi and num_epochs are only for scANVI use, e.g., 0.5, 100
-    # model_name is only for DeepMicro use, e.g., AE[20]
-
-    # This function still requires method, since scanvi = raw read-count data with covariates,
-    # DeepMicro = pct data with no covariates, and DCA = raw read-count data with no covariates.
-
-    # The X_train and X_test file names change (no latent-rep specifier).
-
-    save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
-    split_folder = 'Split_' + str(seed_index) + '/'
-    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
-    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
-    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
-
-    if method == 'scanvi':
-        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
-        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
-        X_train = pd.read_csv(scanvi_use + 'X_train.csv', header=[0], index_col=0)
-        X_test = pd.read_csv(scanvi_use + 'X_test.csv', header=[0], index_col=0)
-        save_folder = scanvi_use + 'Raw/'  # add 'Raw' to the final save path
-
-    if method == 'DeepMicro':
-        X_train = pd.read_csv(deep_micro_use + 'X_train.csv', header=None, index_col=False)
-        X_test = pd.read_csv(deep_micro_use + 'X_test.csv', header=None, index_col=False)
-        y_train = pd.read_csv(deep_micro_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
-        y_test = pd.read_csv(deep_micro_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
-        save_folder = deep_micro_use + 'Raw/'
-
-    if method == 'DCA':
-        X_train = pd.read_csv(dca_use + 'X_train.csv', header=[0], index_col=0)
-        X_test = pd.read_csv(dca_use + 'X_test.csv', header=[0], index_col=0)
-        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
-        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
-        save_folder = dca_use + 'Raw/'
-
-    # For X_train and y_train only, we need to remove the samples with missing y (NA) because we are running regression,
-    # so we add y_train to the X dataframe, remove the NA sample rows, then separate them again:
-    X_train['y'] = y_train
-    X_train = X_train[X_train['y'].notna()]
-    y_train = X_train['y']
-    X_train = X_train.drop(columns=['y'])
-
-    y_test = y_test.reset_index()
-    y_test = y_test.drop(columns=['index'])
-
-    return X_train, X_test, y_train, y_test, save_folder
-
-def get_raw_train_test(dataset, regression, feature, seed_index):
-    # We are using raw pct data, which is in the DeepMicro path.
-    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
-    save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
-    split_folder = 'Split_' + str(seed_index) + '/'
-    raw_use = save_folder + 'Raw_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
-
-    # For each X and y, read from the GenericSplits folder (this is where the most recent changes to y_test/y_train were made).
-    #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/pomp/postdiet_Cholesterol/GenericSplits/' + split_folder
-
-    #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
-    y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
-
-    X_train = pd.read_csv(y_path + 'X_train.csv', header=None, index_col=False)
-    X_test = pd.read_csv(y_path + 'X_test.csv', header=None, index_col=False)
-
-    y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
-    y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
-    save_folder = raw_use
-
-    # For DeepMicro and DCA, I did not update the X representations to include sample IDs, so those exist only for y. ***
-    # This would cause NAs to appear since the IDs won't match up, so for now I am resetting the indexes on the y's: ***
-    y_train = y_train.drop(columns=['index'])
-    y_test = y_test.drop(columns=['index'])
-    #print(y_train)
-    #print(y_test)
-
-    # For X_train and y_train only, we need to remove the samples with missing y (NA) because we are running regression,
-    # so we add y_train to the X dataframe, remove the NA sample rows, then separate them again:
-    if regression == 'logistic':
-        X_train['y'] = y_train[feature]
-    else:
-        X_train['y'] = y_train
-    X_train = X_train[X_train['y'].notna()]
-    #X_train.to_csv(scanvi_use + 'X_Train_check.csv')
-
-    y_train = X_train['y']
-    X_train = X_train.drop(columns=['y'])
-
-    X_test['y'] = y_test[feature]
-    #X_test = X_test[X_test['y'].notna()]
-
-    y_test = X_test['y']
-    #X_test.to_csv(scanvi_use + 'X_Test_check.csv')
-    X_test = X_test.drop(columns=['y'])
-
-    return X_train, X_test, y_train, y_test, save_folder
-
-def get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
-    # alpha_scanvi and num_epochs are only for scANVI use, e.g., 0.5, 100
-    # model_name is only for DeepMicro use, e.g., AE[20]
-
-    #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
-    #save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
-    save_folder = 'Master_Results/'  # cluster path $$$
-    split_folder = 'Split_' + str(seed_index) + '/'
-    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
-    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
-    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
-
-    if method == 'scanvi':
-        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=[0], index_col=0)
-        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=[0], index_col=0)
-
-        scanvi_use = scanvi_use + 'Alpha' + str(alpha_scanvi) + '_' + str(num_epochs) + 'epochs/'  # add the alpha setting and epoch count to the final save path
-        #X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=False)
-        #X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=False)
-        X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=0)
-        #print(X_train)
-        X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=0)
-        #print(X_test)
-        save_folder = scanvi_use
-
-    if method == 'DeepMicro':
-        X_train = pd.read_csv(deep_micro_use + model_name + '_X_train_rep.csv', header=None, index_col=False)
-        X_test = pd.read_csv(deep_micro_use + model_name + '_X_test_rep.csv', header=None, index_col=False)
-        # For the y's, read from the GenericSplits folder (this is where the most recent changes to y_test/y_train were made).
-        #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/pomp/postdiet_Cholesterol/GenericSplits/' + split_folder
-
-        #y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
-        y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
-
-        y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
-        y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
-        save_folder = deep_micro_use
-
-        # For DeepMicro and DCA, I did not update the X representations to include sample IDs, so those exist only for y. ***
-        # This would cause NAs to appear since the IDs won't match up, so for now I am resetting the indexes on the y's: ***
-        y_train = y_train.drop(columns=['index'])
-        y_test = y_test.drop(columns=['index'])
-        #print(y_train)
-        #print(y_test)
-
-    if method == 'DCA':
-        X_train = pd.read_csv(dca_use + 'latent.tsv', sep='\t', header=None, index_col=0).reset_index()  # sep='\t' for the TSV file
-
-        X_test = pd.read_csv(dca_use + 'latent_X_test.tsv', sep='\t', header=None, index_col=0).reset_index()  # sep='\t' for the TSV file
-
-        X_train = X_train.iloc[:, 1:]
-        X_test = X_test.iloc[:, 1:]
-        #print(X_train)
-
-        ## Had to add the scaling below, or else a ValueError ('too large') was triggered:
-        ## Get column names first
-        #names = X_train.columns
-        ## Create the Scaler object
-        #scaler = preprocessing.StandardScaler()
-        ## Fit your data on the scaler object
-        #scaled_df = scaler.fit_transform(X_train)
-        #X_train = pd.DataFrame(scaled_df, columns=names)
-
-        ## Get column names first
-        #names = X_test.columns
-        ## Create the Scaler object
-        #scaler = preprocessing.StandardScaler()
-        ## Fit your data on the scaler object
-        #scaled_df = scaler.fit_transform(X_test)
-        #X_test = pd.DataFrame(scaled_df, columns=names)
-
-        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=[0], index_col=0).reset_index()
-        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=[0], index_col=0).reset_index()
-        save_folder = dca_use
-
-        #X_train = X_train.drop(columns=['index'])
-        #X_test = X_test.drop(columns=['index'])
-        y_train = y_train.drop(columns=['index'])
-        y_test = y_test.drop(columns=['index'])
-
-    # For X_train and y_train only, we need to remove the samples with missing y (NA) because we are running regression,
-    # so we add y_train to the X dataframe, remove the NA sample rows, then separate them again:
-    if regression == 'logistic' and method == 'scanvi':  # only scanvi has one-hot encoded y values; the others are stored as labels already
-        #print(y_train)
-        y_train = convert_labels_to_assignment(y_train, dataset, feature)
-        y_test = convert_labels_to_assignment(y_test, dataset, feature)
-        #X_train['y'] = y_train[feature]
-    #else:
-    X_train['y'] = y_train
-    X_train = X_train[X_train['y'].notna()]
-    #X_train.to_csv(scanvi_use + 'X_Train_check.csv')
-
-    y_train = X_train['y']
-    X_train = X_train.drop(columns=['y'])
-
-    # Testing: mirror the exact handling of X_train:
-    X_test['y'] = y_test
-    X_test = X_test[X_test['y'].notna()]
-    #X_train.to_csv(scanvi_use + 'X_Train_check.csv')
-
-    y_test = X_test['y']
-    X_test = X_test.drop(columns=['y'])
-
-    ## Testing: min-max scaling all features:
-    #y_test = y_test.reset_index()  # We need to reset the index, or else NAs show up where the indexes differ when we combine
-    ##print(y_test)
-
-    #if regression == 'logistic':
-    #    #y_test = convert_labels_to_assignment(y_test, dataset, feature)
-    #    X_test['y'] = y_test[feature]
-    #    print(X_test)
-    #else:
-    #    X_test['y'] = y_test[feature]
-    #    print(X_test)
-    #X_test = X_test[X_test['y'].notna()]
-    #print(X_test)
-    ##print(X_train)
-
-
-    ### Moved this up here so y is not scaled:
-    #y_test = X_test['y']
-    #X_test = X_test.drop(columns=['y'])
-    ##print(save_folder)
-    ##print(X_test)
-    ##print(y_test)
-    #print(X_train)
-    #print(y_train)
-    #y_train = y_train.map({6: 0, 12: 1, 18: 2})
-    #y_test = y_test.map({6: 0, 12: 1, 18: 2})
-    #print(y_train)
-    return X_train, X_test, y_train, y_test, save_folder
-
-def convert_labels_to_assignment(ypred, dataset, feature):
-    true_labels = []
-    #print(ypred)
-    for i in range(0, ypred.shape[0]):  # select the max of each row
-        max_index = np.argmax(ypred.iloc[i, :])
-        if dataset == 'doma' and feature == 'Age':  # ages 6, 12, 18 months
-            if max_index == 0:
-                max_index = 6
-            elif max_index == 1:
-                max_index = 12
-            elif max_index == 2:
-                max_index = 18
-        true_labels.append(max_index)
-    return true_labels
\ No newline at end of file
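
A note on the recurring pattern in the deleted loaders: each function appends y onto X as a temporary 'y' column, drops the rows where y is NA, then splits the two apart again. The same result can be had without the temporary column by forcing positional alignment first; below is a minimal sketch, where the drop_missing_targets helper is illustrative and not part of the original module:

    import pandas as pd

    def drop_missing_targets(X: pd.DataFrame, y: pd.Series):
        """Drop samples whose target is NA, keeping X and y row-aligned.

        Assumes X and y are in the same positional order, as the loaders
        above arrange via their reset_index() calls.
        """
        y = pd.Series(y.to_numpy().ravel(), index=X.index)  # force positional alignment
        mask = y.notna()
        return X.loc[mask], y.loc[mask]

This sidesteps the index-mismatch NAs that the ***-marked comments warn about, since the alignment is made explicit instead of depending on both frames happening to share a RangeIndex.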
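Separately, the machine-specific base paths (the G:/ Google Drive copy, the C:/ backup, the cluster's Master_Results/) are switched by commenting lines in and out, which the $$$ markers track. One hedged alternative is to build paths from a single base directory with pathlib; all names in this sketch are illustrative rather than taken from the module:

    from pathlib import Path

    # Choose the base once (e.g., from an environment variable or CLI flag)
    # instead of editing the source per machine.
    MASTER_RESULTS = Path('Master_Results')  # cluster default

    def split_dir(base: Path, method_dir: str, dataset: str, feature: str, seed_index: int) -> Path:
        """Build base/<method_dir>/<dataset>/<feature>/Split_<seed_index>/ as a Path."""
        return base / method_dir / dataset / feature / f'Split_{seed_index}'

    # Example: the scANVI split folder for seed 3
    scanvi_use = split_dir(MASTER_RESULTS, 'scANVI_Use', 'pomp', 'postdiet_Cholesterol', 3)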
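Finally, the per-row loop in convert_labels_to_assignment can be vectorized; a sketch under the same assumptions as the original (one-hot prediction columns, with the doma 'Age' class indexes remapped to 6/12/18 months):

    import numpy as np
    import pandas as pd

    AGE_LABELS = {0: 6, 1: 12, 2: 18}  # doma 'Age' class index -> months, per the loop above

    def convert_labels_vectorized(ypred: pd.DataFrame, dataset: str, feature: str) -> pd.Series:
        """Row-wise argmax over one-hot columns, optionally remapped to month labels."""
        labels = pd.Series(np.argmax(ypred.to_numpy(), axis=1), index=ypred.index)
        if dataset == 'doma' and feature == 'Age':
            labels = labels.map(AGE_LABELS)
        return labels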