"""MicroVI.

This project consists of MicroVI-main, which runs the regular semi-supervised
Micro-VI, and MicroVI-retraining, which adds the ability to retrain on an
artificially generated training set sampled from the learned latent space.

The sample script myjob.sh calls run.py, whose ``main`` is reproduced below.
The regression and classification dataset settings are listed first, followed
by the percent-supervision and l-parameter initialization settings. The run
number allows the cycle of training followed by prediction evaluation to be
repeated as many times as desired.

Under the "Final-stage model settings" heading, the final-stage classification
or regression model must be set to match the dataset in use.
"""

# NOTE(review): `run`, `dl`, `reg`, `os`, `pd` and `sys` are used below but are
# not imported anywhere in this excerpt; presumably run.py imports them at the
# top of the file -- confirm before running this standalone.


def main():
    """Train (and optionally retrain) the model, then score it, for many runs.

    For every run number and every data split this function:

    1. calls ``run`` -- once with the configured flags, or four times in a row
       (train, sample the latent space, retrain, reload) when
       ``full_retrain_process`` is on;
    2. loads the latent train/test features with ``dl.get_latent_train_test``
       and scores the final-stage model with ``reg.do_regression``;
    3. appends ``(run_number, score)`` as a new row to a CSV file under
       ``Master_Results/scANVI_Use/<dataset>/<feature>/``.

    All settings are edited in place in the sections below; the function takes
    no arguments.

    Returns:
        None, so the ``sys.exit`` call in the script guard exits with status 0.
    """
    # ---- Dataset settings (regression example) -----------------------------
    dataset = 'pomp'
    # Task flag handed to run(). It is deliberately kept separate from
    # `regression_type` below: the earlier version reused the name
    # `regression` for the string 'linear', so every run() call after the
    # first split received 'linear' instead of this boolean.
    regression = True
    num_labels = 1
    feature = 'postdiet_Cholesterol'
    covariate_list = ['Body_weight', 'Diet', 'prediet_Cholesterol']
    latent_dim = 150
    alpha = 0.5  # other values tried: 0.1, 0.25, 1.0
    num_epochs = 200
    covariate_ablation = False  # True drops the covariates from scanvi

    # Alternative (classification) dataset settings:
    # dataset = 'doma'
    # regression = False
    # num_labels = 3
    # feature = 'Age'
    # covariate_list = ['Body_weight', 'Gender']
    # latent_dim = 100
    # alpha = 1.0
    # num_epochs = 200
    # covariate_ablation = False

    # ---- Pipeline mode -----------------------------------------------------
    # True: run the whole train -> sample -> retrain -> reload sequence for
    # each split "with a single click". False: call run() once per split with
    # the three flags below.
    full_retrain_process = True

    # Only read when full_retrain_process is False. The last two used to be
    # commented out, which made the single-call branch fail with
    # UnboundLocalError.
    load_saved_model = False
    retrain_with_sampling = False  # generates the latent sampling
    retrain = False  # after sampling has been run, conducts the retraining

    # True if the data splits still need to be generated (first time through
    # for a dataset/feature); False to simply get the scanvi portion. If True,
    # make sure covariate_ablation is False, otherwise it tries to find
    # covariates that don't exist.
    generate_splits = False
    raw_no_normalization = False
    batch_size = 42

    # ---- scANVI settings ---------------------------------------------------
    l_loc = 8.35
    l_scale = 1.5
    pct_supervised = 0

    # ---- Final-stage model settings ----------------------------------------
    setting = 'latent'
    method = 'scanvi'
    regression_type = 'linear'  # or 'logistic'
    regression_model = 'ridge'  # or 'mlp', 'logistic'

    # ---- Repetition settings -----------------------------------------------
    # NOTE(review): the original comment said "100x 0,100" but the range starts
    # at 46, presumably to resume an interrupted sweep -- confirm.
    run_numbers = range(46, 100)
    # -1 loops through every split; any other value runs only that split
    # index (e.g. 2).
    seed_index_value = -1
    # NOTE(review): older comments mention 10 splits; the loop was marked
    # "temporarily changed" to 5 -- confirm the intended count.
    num_splits = 5

    if seed_index_value == -1:
        seed_indices = list(range(num_splits))
    else:
        seed_indices = [seed_index_value]

    # Each stage is (load_saved_model, retrain_with_sampling, retrain).
    if full_retrain_process:
        # The full pipeline always trains with covariate ablation switched on
        # (set this to False to run the pipeline with covariates instead).
        covariate_ablation = True
        stages = (
            (False, False, False),  # train the model from scratch
            (True, True, False),    # load it and sample from the latent space
            (True, True, True),     # load it again, retrain on the samples
            (True, False, False),   # load the retrained model (UMAPs, etc.)
        )
    else:
        # Run the particular settings as specified above.
        stages = ((load_saved_model, retrain_with_sampling, retrain),)

    # ---- Result file (identical for every run and split) -------------------
    # Cluster path; e.g.
    # 'home/rim17004/micro-vi/MicroVI-retraining/Master_Results/'.
    results_root = 'Master_Results/'
    scanvi_use = results_root + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/'

    # Logistic models are scored with F1 ('All_ACC_' is the accuracy
    # alternative) instead of r2.
    score_name = 'All_R2_' if regression_type == 'linear' else 'All_F1_'
    retrain_str = '_retrained' if full_retrain_process else ''
    # Covariate ablation means the covariates were excluded, so only the
    # non-ablated runs get the '_cov' tag.
    cov_tag = '' if covariate_ablation else '_cov'
    file_name = (
        f'{score_name}{regression_model}_Alpha{alpha}_{num_epochs}epochs_'
        f'{setting}_{pct_supervised}supervised{retrain_str}{cov_tag}.csv'
    )
    file_path = scanvi_use + file_name

    # to_csv does not create missing directories; make sure the target exists.
    os.makedirs(scanvi_use, exist_ok=True)

    for run_number in run_numbers:
        for seed_index in seed_indices:
            print(f'------------------------STARTING SPLIT {seed_index}------------------------')

            for load_model, sample_latent, do_retrain in stages:
                run(dataset, regression, num_labels, feature, alpha,
                    num_epochs, latent_dim, covariate_list, batch_size,
                    raw_no_normalization, seed_index, generate_splits,
                    load_model, covariate_ablation, l_loc, l_scale,
                    pct_supervised, sample_latent, do_retrain, run_number)

            # ---- Final-stage regression on the latent features -------------
            X_train, X_test, y_train, y_test, latent_folder = dl.get_latent_train_test(
                dataset, regression_type, method, feature, seed_index,
                alpha_scanvi=alpha, num_epochs=num_epochs)
            r2 = reg.do_regression(
                setting, dataset, feature, regression_type,
                X_train, X_test, y_train, y_test, latent_folder,
                regression_model=regression_model)

            # ---- Append this split's score to the running results file -----
            print('FILE PATH= ', file_path)
            if os.path.isfile(file_path):
                progress_df = pd.read_csv(file_path)
            else:
                progress_df = pd.DataFrame(columns=['Run Number', 'r2'])
            # The column is named 'r2' even when the score is F1/accuracy.
            progress_df.loc[len(progress_df)] = {'Run Number': run_number, 'r2': r2}
            progress_df.to_csv(file_path, index=False)


if __name__ == "__main__":
    sys.exit(int(main() or 0))