Skip to content

Commit

Permalink
Update README.md
Browse files Browse the repository at this point in the history
  • Loading branch information
rim17004 authored Apr 11, 2024
1 parent 089e9fe commit 1e78133
Showing 1 changed file with 0 additions and 151 deletions.
151 changes: 0 additions & 151 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,152 +1 @@
# MicroVI
This project consists of MicroVI-main, which runs our regular semi-supervised Micro-VI, and MicroVI-retraining, which has the added functionality of retraining on an artificially generated training set, as sampled from the learned latent space.

The sample script myjob.sh calls the run.py file, which is copied below. The regression and classification dataset settings are listed first, followed by the percent-supervision and l-parameter initialization settings. The run number allows the training-and-evaluation cycle to be repeated as many times as desired.

Under the commented 'Regression' setting, the final-stage classification or regression model should be chosen to match the selected dataset.

def main():
    """Configure and launch the MicroVI train / retrain / evaluate pipeline.

    For each run number and cross-validation split, optionally runs the full
    SCANVI retraining sequence (train from scratch, sample from the latent
    space, retrain on the samples, reload for UMAP generation), then fits a
    downstream regression model on the learned latent space and appends the
    resulting score to a per-configuration CSV progress file.
    """
    # ---- Regression setting (dataset 'pomp') -------------------------------
    dataset = 'pomp'
    regression = True  # task-type flag passed to run(): True => regression
    num_labels = 1
    feature = 'postdiet_Cholesterol'
    covariate_list = ['Body_weight', 'Diet', 'prediet_Cholesterol']
    latent_dim = 150
    alpha = 0.5  # other values tried: 1.0, 0.1, 0.25
    num_epochs = 200
    covariate_ablation = False  # True: drop covariates from scanvi

    latent_visualization_x_all = False

    # Runs the complete train -> sample -> retrain -> evaluate sequence with
    # a single invocation instead of separate manually-toggled stages.
    full_retrain_process = True

    # ---- Classification setting (dataset 'doma'), kept for reference ------
    # dataset = 'doma'
    # regression = False
    # num_labels = 3
    # feature = 'Age'
    # covariate_list = ['Body_weight', 'Gender']
    # latent_dim = 100
    # alpha = 1.0  # 0.1 / 0.25 / 0.5
    # num_epochs = 200
    # covariate_ablation = False  # if True: drop covariates from scanvi

    load_saved_model = False
    # True only the first time through for a dataset/feature, to generate the
    # data splits; otherwise False to simply get the scanvi portion.  If True,
    # covariate_ablation must be False, otherwise it tries to find covariates
    # that don't exist.
    generate_splits = False
    raw_no_normalization = False
    batch_size = 42

    # Defaults for the non-full-retrain path.  Previously these were assigned
    # only inside the full_retrain_process branch, so the else branch below
    # raised a NameError.
    retrain_with_sampling = False  # generates the latent sampling
    retrain = False  # after sampling has run, conducts the actual retraining

    # ------------RUN SCANVI--------------------------------------------------
    l_loc = 8.35
    l_scale = 1.5

    pct_supervised = 0

    for run_number in range(46, 100):  # repeat 5-fold cross-validation

        # -1 loops through all splits; otherwise set seed_index_value to a
        # particular split value, e.g., 2.
        seed_index_value = -1
        if seed_index_value == -1:
            seed_index_list = list(range(5))
        else:
            seed_index_list = [seed_index_value]

        for seed_index in seed_index_list:
            print('------------------------' + 'STARTING SPLIT ' + str(seed_index) + '------------------------')
            if full_retrain_process:
                latent_visualization_x_all = False
                # First, train the model from scratch with covariate ablation on:
                covariate_ablation = True
                retrain_with_sampling = False
                load_saved_model = False
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Then, load the trained model and sample from the latent space:
                retrain_with_sampling = True
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Next, load the trained model again and retrain on the samples:
                retrain_with_sampling = True
                load_saved_model = True
                retrain = True
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Finally, load this retrained model and proceed with UMAP
                # generation, etc.
                retrain_with_sampling = False
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            else:
                # Run the particular settings as specified above.
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            # ----------REGRESSION--------------------------------------------
            setting = 'latent'
            method = 'scanvi'

            # Downstream model fit on the latent space.  A distinct name is
            # used so the task flag `regression` (bool) passed to run() is not
            # clobbered on subsequent splits.
            regression_type = 'linear'  # or 'logistic'

            regression_model = 'ridge'  # or 'mlp' / 'logistic'

            X_train, X_test, y_train, y_test, save_folder = dl.get_latent_train_test(dataset, regression_type, method, feature, seed_index, alpha_scanvi=alpha, num_epochs=num_epochs)
            r2 = reg.do_regression(setting, dataset, feature, regression_type, X_train, X_test, y_train, y_test, save_folder, regression_model=regression_model)

            # Cluster path; previously-used local paths removed.
            save_folder = 'Master_Results/'
            scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/'

            if regression_type == 'linear':
                score_name = 'All_R2_'
            else:  # logistic: accuracy or F1 scoring instead of R^2
                # score_name = 'All_ACC_'
                score_name = 'All_F1_'

            retrain_str = '_retrained' if full_retrain_process else ''

            # True covariate ablation means covariates were excluded.
            cov_tag = '' if covariate_ablation else '_cov'

            file_name = score_name + regression_model + '_' + 'Alpha' + str(alpha) + '_' + str(num_epochs) + 'epochs' + '_' + setting + '_' + str(pct_supervised) + 'supervised' + retrain_str + cov_tag + '.csv'
            file_path = scanvi_use + file_name
            print('FILE PATH= ', file_path)

            # Append this run's score to the per-configuration progress CSV,
            # creating it on first use.
            if os.path.isfile(file_path):
                progress_df = pd.read_csv(file_path)
            else:
                progress_df = pd.DataFrame(columns=['Run Number'])
                progress_df['r2'] = 0

            progress_df.loc[len(progress_df)] = {'Run Number': run_number, 'r2': r2}
            progress_df.to_csv(file_path, index=False)  # saves



if __name__ == "__main__":
    # Propagate main()'s return value (or 0 when it returns None) as the
    # process exit status.
    exit_code = main() or 0
    sys.exit(int(exit_code))

0 comments on commit 1e78133

Please sign in to comment.