Commit b39dbb9 — "Add files via upload" (rim17004, Feb 19, 2024)
Showing 1 changed file with 252 additions and 0 deletions: Dataloader_All.py
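# A sketch of a module docstring, summarizing the loaders defined below based
# on the in-file comments (orientation only; no behavior is assumed):
"""Data loaders for the scANVI / DeepMicro / DCA comparison experiments.

Each loader returns (X_train, X_test, y_train, y_test, save_folder) for a
given dataset, target feature, and train/test split index, reading either raw
splits or per-method latent representations from the Master_Results folders.
"""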
import numpy as np
import pandas as pd


def get_raw_train_test_old(dataset, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g. 0.5, 100.
    # model_name is only for DeepMicro use, e.g. AE[20].

    # This function still requires `method`, since the inputs differ:
    #   scanvi    = raw read-count data with covariates
    #   DeepMicro = pct data, no covariates
    #   DCA       = raw read counts, no covariates

    # The X_train and X_test file names also change (no latent-rep specifier).
    save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        X_train = pd.read_csv(scanvi_use + 'X_train.csv', header=0, index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test.csv', header=0, index_col=0)
        save_folder = scanvi_use + 'Raw/'  # add Raw/ to the final save path

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + 'X_train.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + 'X_test.csv', header=None, index_col=False)
        y_train = pd.read_csv(deep_micro_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(deep_micro_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        save_folder = deep_micro_use + 'Raw/'

    elif method == 'DCA':
        X_train = pd.read_csv(dca_use + 'X_train.csv', header=0, index_col=0)
        X_test = pd.read_csv(dca_use + 'X_test.csv', header=0, index_col=0)
        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=0, index_col=0)
        save_folder = dca_use + 'Raw/'

    else:
        raise ValueError('Unknown method: ' + str(method))

    # For X_train/y_train only, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]
    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    y_test = y_test.reset_index(drop=True)

    return X_train, X_test, y_train, y_test, save_folder

def get_raw_train_test(dataset, regression, feature, seed_index):
    # We are using raw pct data, which lives in the DeepMicro path.
    # save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    raw_use = save_folder + 'Raw_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    # For each X and y, read from the GenericSplits folder (this is where the
    # most recent changes to y_train/y_test have been made).
    # y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
    y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

    X_train = pd.read_csv(y_path + 'X_train.csv', header=None, index_col=False)
    X_test = pd.read_csv(y_path + 'X_test.csv', header=None, index_col=False)

    # For DeepMicro and DCA, the X reps were not updated to include sample ids,
    # so ids exist only for the y's. ***  Mismatched ids would introduce NAs, so
    # for now the indexes are reset. ***
    y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
    y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
    save_folder = raw_use

    # For X_train/y_train only, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    if regression == 'logistic':
        X_train['y'] = y_train[feature]
    else:
        X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    X_test['y'] = y_test[feature]
    # X_test = X_test[X_test['y'].notna()]  # (disabled: test rows with NA y are kept here)

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test, save_folder
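
# Example call (illustrative only; 'doma' and 'Age' are example dataset/feature
# names taken from convert_labels_to_assignment below, and any regression value
# other than 'logistic' takes the linear path):
#   X_tr, X_te, y_tr, y_te, out_dir = get_raw_train_test('doma', 'logistic', 'Age', 0)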

def get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=0, num_epochs=0, model_name=""):
    # alpha_scanvi and num_epochs are only for scANVI use, e.g. 0.5, 100.
    # model_name is only for DeepMicro use, e.g. AE[20].

    # save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
    # save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'  # backup path $$$
    save_folder = 'Master_Results/'  # cluster path $$$
    split_folder = 'Split_' + str(seed_index) + '/'
    scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder
    deep_micro_use = save_folder + 'DeepMicro_Use/' + str(dataset) + '/' + str(feature) + '/' + str(model_name) + '/' + split_folder + '/'
    dca_use = save_folder + 'DCA_Use/' + str(dataset) + '/' + str(feature) + '/' + split_folder

    if method == 'scanvi':
        y_train = pd.read_csv(scanvi_use + 'y_train_' + feature + '.csv', header=0, index_col=0)
        y_test = pd.read_csv(scanvi_use + 'y_test_' + feature + '.csv', header=0, index_col=0)

        # Add the alpha setting and epoch count to the final save path:
        scanvi_use = scanvi_use + 'Alpha' + str(alpha_scanvi) + '_' + str(num_epochs) + 'epochs/'
        X_train = pd.read_csv(scanvi_use + 'X_train_latent_rep_scanvi.csv', header=None, index_col=0)
        X_test = pd.read_csv(scanvi_use + 'X_test_latent_rep_scanvi.csv', header=None, index_col=0)
        save_folder = scanvi_use

    elif method == 'DeepMicro':
        X_train = pd.read_csv(deep_micro_use + model_name + '_X_train_rep.csv', header=None, index_col=False)
        X_test = pd.read_csv(deep_micro_use + model_name + '_X_test_rep.csv', header=None, index_col=False)

        # For the y's, read from the GenericSplits folder (this is where the
        # most recent changes to y_train/y_test have been made).
        # y_path = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder
        y_path = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/DeepMicro_Use/' + dataset + '/' + feature + '/GenericSplits/' + split_folder

        # For DeepMicro and DCA, the X reps were not updated to include sample
        # ids, so ids exist only for the y's. ***  Mismatched ids would introduce
        # NAs, so for now the indexes are reset. ***
        y_train = pd.read_csv(y_path + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        y_test = pd.read_csv(y_path + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        save_folder = deep_micro_use

    elif method == 'DCA':
        # The latent representations are TSV files, hence sep='\t'. Resetting the
        # index drops the sample-id column (ids were not kept on the X reps):
        X_train = pd.read_csv(dca_use + 'latent.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)
        X_test = pd.read_csv(dca_use + 'latent_X_test.tsv', sep='\t', header=None, index_col=0).reset_index(drop=True)

        # If a 'too large' ValueError is triggered downstream, standard-scaling
        # X_train and X_test here (sklearn StandardScaler, fit separately on
        # each) has been used as a workaround; that block is currently disabled.

        y_train = pd.read_csv(dca_use + 'y_train_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        y_test = pd.read_csv(dca_use + 'y_test_' + feature + '.csv', header=0, index_col=0).reset_index(drop=True)
        save_folder = dca_use

    else:
        raise ValueError('Unknown method: ' + str(method))

    # For X_train/y_train, drop samples with missing y (NA), since we are
    # running regression: attach y_train to the X dataframe, drop the NA rows,
    # then separate them again.
    if regression == 'logistic' and method == 'scanvi':
        # Only scANVI stores one-hot encoded y values; the other methods store
        # labels already.
        y_train = convert_labels_to_assignment(y_train, dataset, feature)
        y_test = convert_labels_to_assignment(y_test, dataset, feature)
    X_train['y'] = y_train
    X_train = X_train[X_train['y'].notna()]

    y_train = X_train['y']
    X_train = X_train.drop(columns=['y'])

    # Mirror the exact handling of X_train for X_test:
    X_test['y'] = y_test
    X_test = X_test[X_test['y'].notna()]

    y_test = X_test['y']
    X_test = X_test.drop(columns=['y'])

    return X_train, X_test, y_train, y_test, save_folder

def convert_labels_to_assignment(ypred, dataset, feature):
    # Convert one-hot (probability) rows back to class labels by taking the
    # argmax of each row.
    true_labels = []
    for i in range(ypred.shape[0]):
        max_index = np.argmax(ypred.iloc[i, :])
        if dataset == 'doma' and feature == 'Age':
            # Map class indices back to ages in months: 6, 12, 18.
            max_index = {0: 6, 1: 12, 2: 18}[max_index]
        true_labels.append(max_index)
    return true_labels
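
# A minimal smoke test, assuming the Master_Results folder layout described
# above exists locally. The argument values are illustrative examples drawn
# from comments in this file (alpha 0.5, 100 epochs, dataset 'doma', feature
# 'Age'), not fixed choices.
if __name__ == '__main__':
    X_train, X_test, y_train, y_test, out_dir = get_latent_train_test(
        dataset='doma', regression='logistic', method='scanvi', feature='Age',
        seed_index=0, alpha_scanvi=0.5, num_epochs=100)
    print('X_train:', X_train.shape, 'X_test:', X_test.shape)
    print('y_train size:', len(y_train), 'y_test size:', len(y_test))
    print('Results will be saved under:', out_dir)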
