From 1e78133586698b8ad0e4cc8f7bde04d1f75b6e71 Mon Sep 17 00:00:00 2001 From: Rigel Mahmood Date: Thu, 11 Apr 2024 13:10:04 -0400 Subject: [PATCH] Update README.md --- README.md | 151 ------------------------------------------------------ 1 file changed, 151 deletions(-) diff --git a/README.md b/README.md index 5a14ecc..8b13789 100644 --- a/README.md +++ b/README.md @@ -1,152 +1 @@ -# MicroVI -This project consists of MicroVI-main, which runs our regular semi-supervised Micro-VI, and MicroVI-retraining, which has the added functionality of retraining on an artificially generated training set, as sampled from the learned latent space. - -The sample script myjob.sh calls the run.py file, which is copied below. The corresponding regression and classification dataset settings are listed first, below which the percent supervision and l_parameter initialization settings are specified. Further, the run number allows the set of training followed by prediction evaluation to be repeated as many times as desired. - -Under the commented 'Regression' setting, the final-stage classification or regression models should be set as corresponding to the appropriate dataset. - -def main(): - ###Regression setting: - dataset = 'pomp' - regression = True - num_labels = 1 - feature='postdiet_Cholesterol' - covariate_list = ['Body_weight', 'Diet', 'prediet_Cholesterol'] - latent_dim = 150 - alpha = 0.5 #1.0 #0.1 #0.25 # 0.5 #1.0 - num_epochs = 200 - covariate_ablation = False #True #drop covariates from scanvi - ##retrain_with_sampling = False #generates the latent sampling - ##retrain = False #after the above has been run, this will conduct the actual retraining - - latent_visualization_x_all = False #True - #full_retrain_process = False - - full_retrain_process = True #doing away with above, runs with single click - - #dataset = 'doma' - #regression = False - #num_labels = 3 - #feature='Age' - #covariate_list = ['Body_weight', 'Gender'] - #latent_dim = 100 - #alpha = 1.0 #0.1 #0.25 # 0.5 #1.0 - #num_epochs = 200 - #covariate_ablation = False #if True: drop covariates from scanvi - - load_saved_model = False #False - generate_splits = False #true if need to generate data splits (first time through for dataset/feature); false to simply get scanvi portion #if true, make sure covariate_ablation is set to false, otherwise it tries to find covariates that don't exist - raw_no_normalization = False - batch_size = 42 - - ##------------RUN SCANVI---------------------------------------------------------- - l_loc = 8.35 - l_scale = 1.5 - - pct_supervised = 0 - - for run_number in range(46, 100): #Do 5-fold cross-validation 100x 0,100 - - seed_index_value = -1 # -1 will loop through all 10 splits; otherwise, set seed_index to particular split value, e.g., 2 - seed_index_list = [] - if seed_index_value == -1: - for i in range(0, 5): #temporarily changed - seed_index_list.append(i) - else: - seed_index_list.append(seed_index_value) - #seed_index_list.append(0) #re-do first split, for testing purposes - - r2_list = [] # compile r2 over all 10 splits (or single split) - - for seed_index in seed_index_list: - print('------------------------' + 'STARTING SPLIT ' + str(seed_index) + '------------------------') - if full_retrain_process == True: - latent_visualization_x_all = False - #First, train the model from scratch with covariate ablation on or off: - covariate_ablation = True - #covariate_ablation = False #NOW RUNNING WITH COVARIATES - retrain_with_sampling = False - load_saved_model = False - retrain = False - run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number) - - #Then, load the trained model and sample from the latent space: - retrain_with_sampling = True - load_saved_model = True - retrain = False - run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number) - - #Next, load the trained model again and retrain using the generated samples: - retrain_with_sampling = True - load_saved_model = True - retrain = True - run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number) - - #Finally, load this retrained model and proceed with UMAP generation, etc. - retrain_with_sampling = False - load_saved_model = True - retrain = False - run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number) - - else: #run particular settings as specified: - run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number) - - #----------REGRESSION------------------------------- - setting = 'latent' - method = 'scanvi' - - regression = 'linear' - #regression = 'logistic' - - regression_model = 'ridge' - #regression_model = 'mlp' - #regression_model = 'logistic' - - X_train, X_test, y_train, y_test, save_folder = dl.get_latent_train_test(dataset, regression, method, feature, seed_index, alpha_scanvi=alpha, num_epochs=num_epochs) - - r2 = reg.do_regression(setting, dataset, feature, regression, X_train, X_test, y_train, y_test, save_folder, regression_model=regression_model) - - #save_folder = 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/' - #save_folder = 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/' #for backup path $$$ - save_folder = 'Master_Results/' #for cluster path $$$ 'home/rim17004/micro-vi/MicroVI-retraining/Master_Results/' - scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/' - - if regression == 'linear': - score_name = 'All_R2_' - else: #logistic, accuracy or f1 scoring instead of r2 - #score_name = 'All_ACC_' - score_name = 'All_F1_' - - save_path = scanvi_use - if full_retrain_process == True: - retrain_str = '_retrained' - else: - retrain_str = '' - - cov_tag = '' - if covariate_ablation == False: #True cov ablation will mean covariates excluded - cov_tag = '_cov' - file_name = score_name + regression_model + '_' + 'Alpha' + str(alpha) + '_' + str(num_epochs) + 'epochs' + '_' + setting + '_' + str(pct_supervised) + 'supervised' + retrain_str + cov_tag + '.csv' - - file_path = scanvi_use + file_name - print('FILE PATH= ', file_path) - file_exists = os.path.isfile(file_path) - if file_exists == True: - progress_df = pd.read_csv(file_path) - else: - progress_df = pd.DataFrame(columns =['Run Number']) - progress_df['r2'] = 0 - #progress_df.to_csv(file_path) #saves - #print(progress_df) - #print(file_name + ' created. Continuing...') - - add_new = {'Run Number': run_number, 'r2': r2} - progress_df.loc[len(progress_df)] = add_new - progress_df.to_csv(file_path, index=False) #saves - - - - -if __name__ == "__main__": - sys.exit(int(main() or 0))