-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
0 additions
and
151 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,152 +1 @@ | ||
# MicroVI

This project consists of MicroVI-main, which runs our regular semi-supervised MicroVI, and MicroVI-retraining, which adds the ability to retrain on an artificially generated training set sampled from the learned latent space.

The sample script myjob.sh calls the run.py file, which is copied below. The corresponding regression and classification dataset settings are listed first, below which the percent-supervision and l-parameter initialization settings are specified. The run number allows the cycle of training followed by prediction evaluation to be repeated as many times as desired.

Under the commented 'Regression' setting, the final-stage classification or regression models should be set to match the appropriate dataset.
def main():
    """Drive the full MicroVI pipeline for the configured dataset.

    For each run number and each cross-validation split this either
    (a) executes the four-stage retraining pipeline -- train from scratch,
    sample from the learned latent space, retrain on the generated samples,
    then reload for UMAP generation -- or (b) performs a single ``run`` with
    the settings specified below.  After each split a final-stage
    regression/classification model is fit on the learned latent
    representation and its score is appended to a per-configuration CSV.

    Returns:
        None.  Results are persisted to ``Master_Results/scANVI_Use/...``.
    """
    # ----- Regression setting (pomp dataset) -----
    dataset = 'pomp'
    regression = True                 # task flag passed to run(): True = regression
    num_labels = 1
    feature = 'postdiet_Cholesterol'
    covariate_list = ['Body_weight', 'Diet', 'prediet_Cholesterol']
    latent_dim = 150
    alpha = 0.5                       # tried: 0.1, 0.25, 0.5, 1.0
    num_epochs = 200
    covariate_ablation = False        # True drops covariates from scanvi

    latent_visualization_x_all = False

    # Runs the whole 4-stage retrain pipeline with a single invocation.
    full_retrain_process = True

    # ----- Classification setting (doma dataset) -----
    # dataset = 'doma'
    # regression = False
    # num_labels = 3
    # feature = 'Age'
    # covariate_list = ['Body_weight', 'Gender']
    # latent_dim = 100
    # alpha = 1.0
    # num_epochs = 200
    # covariate_ablation = False

    load_saved_model = False
    # True only the first time through for a dataset/feature pair; if True,
    # covariate_ablation must be False or split generation looks for
    # covariates that don't exist.
    generate_splits = False
    raw_no_normalization = False
    batch_size = 42

    # Defaults so the non-retrain branch below never hits a NameError
    # (the original code only defined these inside the retrain branch).
    retrain_with_sampling = False
    retrain = False

    # ---------------- RUN SCANVI ----------------
    l_loc = 8.35
    l_scale = 1.5
    pct_supervised = 0

    for run_number in range(46, 100):  # repeat 5-fold cross-validation
        # -1 loops over all splits; otherwise a single split index (e.g. 2).
        seed_index_value = -1
        if seed_index_value == -1:
            seed_index_list = list(range(0, 5))
        else:
            seed_index_list = [seed_index_value]

        for seed_index in seed_index_list:
            print('------------------------' + 'STARTING SPLIT '
                  + str(seed_index) + '------------------------')
            if full_retrain_process:
                latent_visualization_x_all = False
                # Stage 1: train the model from scratch (covariates ablated).
                covariate_ablation = True
                retrain_with_sampling = False
                load_saved_model = False
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 2: load the trained model and sample the latent space.
                retrain_with_sampling = True
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 3: reload and retrain on the generated samples.
                retrain_with_sampling = True
                load_saved_model = True
                retrain = True
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

                # Stage 4: load the retrained model for UMAP generation, etc.
                retrain_with_sampling = False
                load_saved_model = True
                retrain = False
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            else:  # run the particular settings as specified above
                run(dataset, regression, num_labels, feature, alpha, num_epochs, latent_dim, covariate_list, batch_size, raw_no_normalization, seed_index, generate_splits, load_saved_model, covariate_ablation, l_loc, l_scale, pct_supervised, retrain_with_sampling, retrain, run_number)

            # ---------- FINAL-STAGE REGRESSION ----------
            setting = 'latent'
            method = 'scanvi'

            # BUG FIX: the original rebound ``regression`` (the boolean task
            # flag passed positionally to ``run``) to the string 'linear'
            # here, so every split after the first called ``run`` with
            # 'linear' instead of the task flag.  A distinct name keeps the
            # downstream calls identical while leaving the flag intact.
            regression_kind = 'linear'
            # regression_kind = 'logistic'

            regression_model = 'ridge'
            # regression_model = 'mlp'
            # regression_model = 'logistic'

            X_train, X_test, y_train, y_test, save_folder = dl.get_latent_train_test(dataset, regression_kind, method, feature, seed_index, alpha_scanvi=alpha, num_epochs=num_epochs)
            r2 = reg.do_regression(setting, dataset, feature, regression_kind, X_train, X_test, y_train, y_test, save_folder, regression_model=regression_model)

            # Cluster path; local/backup paths kept for reference:
            # 'G:/My Drive/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
            # 'C:/Users/rigel/Desktop/Backup/Work_Local/Jinbo_Mouse_D/SCANVI/New_Dimension/Comparison_Experiments/Master_Results/'
            save_folder = 'Master_Results/'
            scanvi_use = save_folder + 'scANVI_Use/' + str(dataset) + '/' + str(feature) + '/'

            if regression_kind == 'linear':
                score_name = 'All_R2_'
            else:  # logistic: accuracy or F1 scoring instead of R2
                # score_name = 'All_ACC_'
                score_name = 'All_F1_'

            retrain_str = '_retrained' if full_retrain_process else ''

            # covariate_ablation == True means covariates were excluded.
            cov_tag = '' if covariate_ablation else '_cov'
            file_name = score_name + regression_model + '_' + 'Alpha' + str(alpha) + '_' + str(num_epochs) + 'epochs' + '_' + setting + '_' + str(pct_supervised) + 'supervised' + retrain_str + cov_tag + '.csv'

            file_path = scanvi_use + file_name
            print('FILE PATH= ', file_path)
            if os.path.isfile(file_path):
                progress_df = pd.read_csv(file_path)
            else:
                progress_df = pd.DataFrame(columns=['Run Number', 'r2'])

            # Append this run's score and persist immediately so progress
            # survives interruption of the long outer loop.
            progress_df.loc[len(progress_df)] = {'Run Number': run_number, 'r2': r2}
            progress_df.to_csv(file_path, index=False)
||
if __name__ == "__main__": | ||
sys.exit(int(main() or 0)) | ||
|