Commit b39dbb9 — "Add files via upload" (rim17004, Feb 19, 2024)
Showing 1 changed file with 252 additions and 0 deletions: Dataloader_All.py
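# A sketch of a module docstring, summarizing the loaders defined below based
# on the in-file comments (orientation only; no behavior is assumed):
"""Data loaders for the scANVI / DeepMicro / DCA comparison experiments.

Each loader returns (X_train, X_test, y_train, y_test, save_folder) for a
given dataset, target feature, and train/test split index, reading either raw
splits or per-method latent representations from the Master_Results folders.
"""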
import numpy as np
import pandas as pd


def get_raw_train_test_old(dataset, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g. 0.5, 100.
    # model_name is only for DeepMicro use, e.g. AE[20].

    # This function still requires `method`, since the inputs differ:
    #   scanvi    = raw read-count data with covariates
    #   DeepMicro = pct data, no covariates
    #   DCA       = raw read counts, no covariates

    # The X_train and X_test file names also change (no latent-rep specifier).
    save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        X_train = pd.read_csv(scanvi_use + 'X_train.csv', header=0, index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test.csv', header=0, index_col=0)
        save_folder = scanvi_use + 'Raw/'  # add Raw/ to the final save path

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + 'X_train.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + 'X_test.csv', header=None, index_col=False)
        y_train = pd.read_csv(deep_micro_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(deep_micro_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        save_folder = deep_micro_use + 'Raw/'

    elif method == 'DCA':
        X_train = pd.read_csv(dca_use + 'X_train.csv', header=0, index_col=0)
        X_test = pd.read_csv(dca_use + 'X_test.csv', header=0, index_col=0)
        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        save_folder = dca_use + 'Raw/'

    else:
        raise ValueError('Unknown method: ' + str(method))

    # For X_train/y_train only, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]
    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    y_test = y_test.reset_index(drop=True)

    return X_train, X_test, y_train, y_test, save_folder

def get_raw_train_test(dataset, regression, feature, seed_index):
    # We are using raw pct data, which lives in the DeepMicro path.
    # save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    raw_use = save_folder + 'Raw_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    # For each X and y, read from the GenericSplits folder (this is where the
    # most recent changes to y_train/y_test have been made).
    # y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
    y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

    X_train = pd.read_csv(y_path + 'X_train.csv', header=None, index_col=False)
    X_test = pd.read_csv(y_path + 'X_test.csv', header=None, index_col=False)

    # For DeepMicro and DCA, the X reps were not updated to include sample ids,
    # so ids exist only for the y's. ***  Mismatched ids would introduce NAs, so
    # for now the indexes are reset. ***
    y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
    y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
    save_folder = raw_use

    # For X_train/y_train only, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    if regression == 'logistic':
        X_train['y'] = y_train[feature]
    else:
        X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    X_test['y'] = y_test[feature]
    # X_test = X_test[X_test['y'].notna()]  # (disabled: test rows with NA y are kept here)

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test, save_folder
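
# Example call (illustrative only; 'doma' and 'Age' are example dataset/feature
# names taken from convert_labels_to_assignment below, and any regression value
# other than 'logistic' takes the linear path):
#   X_tr, X_te, y_tr, y_te, out_dir = get_raw_train_test('doma', 'logistic', 'Age', 0)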

def get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g. 0.5, 100.
    # model_name is only for DeepMicro use, e.g. AE[20].

    # save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    # save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    save_folder = 'Master_Results/'  # cluster path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=0, index_col=0)

        # Add the alpha setting and epoch count to the final save path:
        scanvi_use = scanvi_use + 'Alpha' + str(alpha_scanvi) + '_' + str(num_epochs) + 'epochs/'
        X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=0)
        save_folder = scanvi_use

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + model_name + '_X_train_rep.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + model_name + '_X_test_rep.csv', header=None, index_col=False)

        # For the y's, read from the GenericSplits folder (this is where the
        # most recent changes to y_train/y_test have been made).
        # y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
        y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

        # For DeepMicro and DCA, the X reps were not updated to include sample
        # ids, so ids exist only for the y's. ***  Mismatched ids would introduce
        # NAs, so for now the indexes are reset. ***
        y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        save_folder = deep_micro_use

    elif method == 'DCA':
        # The latent representations are TSV files, hence sep='\t'. Resetting the
        # index drops the sample-id column (ids were not kept on the X reps):
        X_train = pd.read_csv(dca_use + 'latent.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)
        X_test = pd.read_csv(dca_use + 'latent_X_test.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)

        # If a 'too large' ValueError is triggered downstream, standard-scaling
        # X_train and X_test here (sklearn StandardScaler, fit separately on
        # each) has been used as a workaround; that block is currently disabled.

        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        save_folder = dca_use

    else:
        raise ValueError('Unknown method: ' + str(method))

    # For X_train/y_train, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    if regression == 'logistic' and method == 'scanvi':
        # Only scANVI stores one-hot encoded y values; the other methods store
        # labels already.
        y_train = convert_labels_to_assignment(y_train, dataset, feature)
        y_test = convert_labels_to_assignment(y_test, dataset, feature)
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    # Mirror the exact handling of X_train for X_test:
    X_test['y'] = y_test
    X_test = X_test[X_test['y'].notna()]

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test, save_folder

def convert_labels_to_assignment(ypred, dataset, feature):
    # Convert one-hot (probability) rows back to class labels by taking the
    # argmax of each row.
    true_labels = []
    for i in range(ypred.shape[0]):
        max_index = np.argmax(ypred.iloc[i, :])
        if dataset == 'doma' and feature == 'Age':
            # Map class indices back to ages in months: 6, 12, 18.
            max_index = {0: 6, 1: 12, 2: 18}[max_index]
        true_labels.append(max_index)
    return true_labels
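
# A minimal smoke test, assuming the Master_Results folder layout described
# above exists locally. The argument values are illustrative examples drawn
# from comments in this file (alpha 0.5, 100 epochs, dataset 'doma', feature
# 'Age'), not fixed choices.
if __name__ == '__main__':
    X_train, X_test, y_train, y_test, out_dir = get_latent_train_test(
        dataset='doma', regression='logistic', method='scanvi', feature='Age',
        seed_index=0, alpha_scanvi=0.5, num_epochs=100)
    print('X_train:', X_train.shape, 'X_test:', X_test.shape)
    print('y_train size:', len(y_train), 'y_test size:', len(y_test))
    print('Results will be saved under:', out_dir)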
