-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
0 additions
and
151 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,152 +1 @@ | ||
# MicroVI

This project consists of MicroVI-main, which runs our regular semi-supervised MicroVI, and MicroVI-retraining, which adds the ability to retrain on an artificially generated training set sampled from the learned latent space.

The sample script myjob.sh calls the run.py file, which is copied below. The corresponding regression and classification dataset settings are listed first, below which the percent-supervision and l-parameter initialization settings are specified. The run number allows the cycle of training followed by prediction evaluation to be repeated as many times as desired.

Under the commented 'Regression' setting, the final-stage classification or regression models should be set to match the appropriate dataset.
def main():
    """Drive the full MicroVI pipeline for the configured dataset.

    For each run number and each cross-validation split this either
    (a) executes the four-stage retraining pipeline -- train from scratch,
    sample from the learned latent space, retrain on the generated samples,
    then reload for UMAP generation -- or (b) performs a single ``run`` with
    the settings specified below.  After each split a final-stage
    regression/classification model is fit on the learned latent
    representation and its score is appended to a per-configuration CSV.

    Returns:
        None.  Results are persisted to ``Master_Results/scANVI_Use/...``.
    """
    # ----- Regression setting (pomp dataset) -----
    dataset = 'pomp'
    regression = True                 # task flag passed to run(): True = regression
    num_labels = 1
    feature = 'postdiet_Cholesterol'
    covariate_list = ['Body_weight', 'Diet', 'prediet_Cholesterol']
    latent_dim = 150
    alpha = 0.5                       # tried: 0.1, 0.25, 0.5, 1.0
    num_epochs = 200
    covariate_ablation = False        # True drops covariates from scanvi

    latent_visualization_x_all = False

    # Runs the whole 4-stage retrain pipeline with a single invocation.
    full_retrain_process = True

    # ----- Classification setting (doma dataset) -----
    # dataset = 'doma'
    # regression = False
    # num_labels = 3
    # feature = 'Age'
    # covariate_list = ['Body_weight', 'Gender']
    # latent_dim = 100
    # alpha = 1.0
    # num_epochs = 200
    # covariate_ablation = False

    load_saved_model = False
    # True only the first time through for a dataset/feature pair; if True,
    # covariate_ablation must be False or split generation looks for
    # covariates that don't exist.
    generate_splits = False
    raw_no_normalization = False
    batch_size = 42

    # Defaults so the non-retrain branch below never hits a NameError
    # (the original code only defined these inside the retrain branch).
    retrain_with_sampling = False
    retrain = False

    # ---------------- RUN SCANVI ----------------
    l_loc = 8.35
    l_scale = 1.5
    pct_supervised = 0

    for run_number in range(46, 100):  # repeat 5-fold cross-validation
        # -1 loops over all splits; otherwise a single split index (e.g. 2).
        seed_index_value = -1
        if seed_index_value == -1:
            seed_index_list = list(range(0, 5))
        else:
            seed_index_list = [seed_index_value]

        for seed_index in seed_index_list:
            print('------------------------' + 'STARTING SPLIT '
                  + str(seed_index) + '------------------------')
            if full_retrain_process:
                latent_visualization_x_all = False
                # Stage 1: train the model from scratch (covariates ablated).
                covariate_ablation = True
                retrain_with_sampling = False
                load_saved_model = False
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 2: load the trained model and sample the latent space.
                retrain_with_sampling = True
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 3: reload and retrain on the generated samples.
                retrain_with_sampling = True
                load_saved_model = True
                retrain = True
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 4: load the retrained model for UMAP generation, etc.
                retrain_with_sampling = False
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            else:  # run the particular settings as specified above
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            # ---------- FINAL-STAGE REGRESSION ----------
            setting = 'latent'
            method = 'scanvi'

            # BUG FIX: the original rebound ``regression`` (the boolean task
            # flag passed positionally to ``run``) to the string 'linear'
            # here, so every split after the first called ``run`` with
            # 'linear' instead of the task flag.  A distinct name keeps the
            # downstream calls identical while leaving the flag intact.
            regression_kind = 'linear'
            # regression_kind = 'logistic'

            regression_model = 'ridge'
            # regression_model = 'mlp'
            # regression_model = 'logistic'

            X_train, X_test, y_train, y_test, save_folder = dl.get_latent_train_test(dataset, regression_kind, method, feature, seed_index, alpha_scanvi=alpha, num_epochs=num_epochs)
            r2 = reg.do_regression(setting, dataset, feature, regression_kind, X_train, X_test, y_train, y_test, save_folder, regression_model=regression_model)

            # Cluster path; local/backup paths kept for reference:
            # 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
            # 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
            save_folder = 'Master_Results/'
            scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/'

            if regression_kind == 'linear':
                score_name = 'All_R2_'
            else:  # logistic: accuracy or F1 scoring instead of R2
                # score_name = 'All_ACC_'
                score_name = 'All_F1_'

            retrain_str = '_retrained' if full_retrain_process else ''

            # covariate_ablation == True means covariates were excluded.
            cov_tag = '' if covariate_ablation else '_cov'
            file_name = score_name + regression_model + '_' + 'Alpha' + str(alpha) + '_' + str(num_epochs) + 'epochs' + '_' + setting + '_' + str(pct_supervised) + 'supervised' + retrain_str + cov_tag + '.csv'

            file_path = scanvi_use + file_name
            print('FILE PATH= ', file_path)
            if os.path.isfile(file_path):
                progress_df = pd.read_csv(file_path)
            else:
                progress_df = pd.DataFrame(columns=['Run Number', 'r2'])

            # Append this run's score and persist immediately so progress
            # survives interruption of the long outer loop.
            progress_df.loc[len(progress_df)] = {'Run Number': run_number, 'r2': r2}
            progress_df.to_csv(file_path, index=False)
||
if __name__ == "__main__": | ||
sys.exit(int(main() or 0)) | ||
|