CAPSULE LOADING

The following files are the core code that is considered relevant in this repository.
lrm22005 · May 6, 2024 · ae973fe · ae973fe
1 parent 9941660
commit ae973fe
Show file tree

Hide file tree

Showing 973 changed files with 245,872 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+
+data/gene_name_info/query_full_name.txt
+data/gene_name_info/query_ids.txt
+data/gene_name_info/query_snps.txt
+data/gene_name_info/query_symbol.txt
+results/baseline_doc/pubmed.zinc.0.15.txt
+results/baseline_doc/pubmed.zinc.1.15.txt
diff --git a/REPRODUCING.md b/REPRODUCING.md
@@ -0,0 +1,36 @@
+This [Code Ocean](https://codeocean.com) Compute Capsule will allow you to reproduce the results published by the author on your local machine<sup>1</sup>. Follow the instructions below, or consult [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information. Don't hesitate to reach out to [Support](mailto:support@codeocean.com) if you have any questions.
+
+<sup>1</sup> You may need access to additional hardware and/or software licenses.
+
+# Prerequisites
+
+- [Docker Community Edition (CE)](https://www.docker.com/community-edition)
+- [nvidia-container-runtime](https://docs.docker.com/config/containers/resource_constraints/#gpu) for code that leverages the GPU
+- MATLAB/MOSEK/Stata licenses where applicable
+
+# Instructions
+
+## The computational environment (Docker image)
+
+This capsule is private and its environment cannot be downloaded at this time. You will need to rebuild the environment locally.
+
+> If there's any software requiring a license that needs to be run during the build stage, you'll need to make your license available. See [our knowledge base](https://help.codeocean.com/user-manual/sharing-and-finding-published-capsules/exporting-capsules-and-reproducing-results-on-your-local-machine) for more information.
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command:
+```shell
+cd environment && docker build . --tag 6ef700ed-ff07-4a42-bf13-65d4165511b6; cd ..
+```
+
+> This step will recreate the environment (i.e., the Docker image) locally, fetching and installing any required dependencies in the process. If any external resources have become unavailable for any reason, the environment will fail to build.
+
+## Running the capsule to reproduce the results
+
+In your terminal, navigate to the folder where you've extracted the capsule and execute the following command, adjusting parameters as needed:
+```shell
+docker run --platform linux/amd64 --rm --gpus all \
+  --workdir /code \
+  --volume "$PWD/data":/data \
+  --volume "$PWD/code":/code \
+  --volume "$PWD/results":/results \
+  6ef700ed-ff07-4a42-bf13-65d4165511b6 bash run
+```
diff --git a/code/Extrinsic_application_CVD_prediction.py b/code/Extrinsic_application_CVD_prediction.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug 30 21:59:06 2022
+
+@author: Jihye Moon
+"""
+import sys
+import os
+import pathlib
+
+import pandas as pd
+import numpy as np
+
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import StratifiedShuffleSplit as strata
+
+import lib.ML_models as ml
+sys.path.append('lib')   
+import loading_literature_embedding as emb
+
+def data_split(X_train_index, X_test_index, X, y):
+    valid_data = int(len(X_test_index)/2) 
+    test_data = int(len(X_test_index))-valid_data 
+
+    test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data] 
+
+    X_train = X[X_train_index]; X_test = X[test]; X_valid = X[valid]
+
+    y_train = y[X_train_index]
+    y_test = y[test]
+    y_valid = y[valid]
+
+    X_train = np.reshape(X_train, (X_train.shape[0], -1)); X_test = np.reshape(X_test, (X_test.shape[0], -1))
+    X_valid = np.reshape(X_valid, (X_valid.shape[0], -1)) 
+    y_train = np.squeeze(y_train); y_test = np.squeeze(y_test); y_valid = np.squeeze(y_valid) 
+
+    scaler = StandardScaler()  
+    scaler.fit(X_train)
+    X_train = scaler.transform(X_train); X_test = scaler.transform(X_test); X_valid = scaler.transform(X_valid) 
+    return X_train, X_test, X_valid, y_train, y_test, y_valid
+
+def loading_variable_embedding(data_path):
+    var_symbol = list(pd.read_csv(data_path+'/variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+    var_name = list(pd.read_csv(data_path+'/variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+    tar_symbol = list(pd.read_csv(data_path+'/target_variables_symbol.csv').drop(columns='Unnamed: 0')['0'])
+    tar_name = list(pd.read_csv(data_path+'/target_variables_preprocessed_names.csv').drop(columns='Unnamed: 0')['0'])
+
+    variables_indexing={}; disease_variables_indexing={}
+
+    for i in range(len(var_name)):
+        variables_indexing[var_symbol[i]] = var_name[i]
+
+    for i in range(len(tar_name)):
+        disease_variables_indexing[tar_symbol[i]] = tar_name[i]
+
+    additional_dictionary = {'uricosurics':'uricosuric'} 
+    # If some variable names are very unique that can't find in embedding vocabulary, 
+    # add the unique variable names here to avoid error for feature selection tasks
+
+    embedding_list, index2variables, embedding, removal, removed_words = emb2simi.variable2embed(words_list, syn0norm, variables_indexing, additional_dictionary)
+
+    if removal==[]:
+        print(" === NO problem for your variables") 
+        target_embedding_list, index2target, target_embedding, _, _ = emb2simi.variable2embed(words_list, syn0norm, disease_variables_indexing, additional_dictionary)
+
+        return embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, \
+            target_embedding_list, index2target, index2variables, target_embedding, embedding
+    else:
+        print(" === Check if there are errors for your variable names")
+        return 0, 0, 0, 0, 0, 0, 0, 0, 0
+
+def CVD_Prediction_with_FS_DR(data_path, Xt, y):
+    feature_size = 128; i=0
+    split_info = strata(n_splits=5, test_size=0.2, random_state=12)
+    total_FS_Pre=[]; total_FS_prob=[]
+    total_DR_pre=[]; total_DR_prob=[]
+    embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, target_embedding_list, index2target, index2variables, target_embedding, embedding = loading_variable_embedding(data_path)
+    for X_train_index, X_test_index in split_info.split(Xt.values, y): 
+        result_dir = os.path.join(output_path +str(i)) 
+        pathlib.Path(result_dir).mkdir(parents=True, exist_ok=True)
+        X_train, X_test, X_valid, y_train, y_test, y_valid = data_split(X_train_index, X_test_index, Xt.values, y)
+        pr.save_label(y_test, 'CVD_label', result_dir) # y_test labels to evaludate CVD prediction performance for each fold
+        print("=== run Our feature selector --- our FS selected features via feature name , our FS uses same feature set for 5-fold cross validation. ")
+        embed_name = fs.Our_FS(emb2simi, str(i)+'rf_embedding_features', embedding_list, variables_indexing, disease_variables_indexing, additional_dictionary, embedding, target_embedding_list, index2target, index2variables, target_embedding, feature_size, result_dir)
+
+        print("=== run Our dimensionality reductor ")
+        A1, A2, A3 = dr.Our_DR(embedding, X_train, X_test, X_valid, feature_size)
+
+        print("=== Running with MLs with Feature Selection (Our FS)")
+        X2 = Xt[embed_name].values ### selecting only 128 variables based on our 128 features
+        valid_data = int(len(X_test_index)/2); test_data = int(len(X_test_index))-valid_data 
+        test = X_test_index[0:test_data]; valid = X_test_index[test_data:test_data+valid_data] # split test data 
+        X_train2 = X2[X_train_index]; X_test2 = X2[test]; X_valid2 = X2[valid] 
+
+        X_train2 = np.reshape(X_train2, (X_train2.shape[0], -1)) 
+        X_test2 = np.reshape(X_test2, (X_test2.shape[0], -1))
+        X_valid2 = np.reshape(X_valid2, (X_valid2.shape[0], -1))
+
+        scaler = StandardScaler()  
+        scaler.fit(X_train2)
+        X_train2 = scaler.transform(X_train2); X_test2 = scaler.transform(X_test2); X_valid2 = scaler.transform(X_valid2) 
+
+        Our_FS_total_prediction, Our_FS_total_prob = pr.run_save(X_train2, y_train, X_test2, y_test, X_valid2, y_valid, 'FS.embedding', 'SMOTE', feature_size, result_dir)
+        total_FS_Pre.append(Our_FS_total_prediction); total_FS_prob.append(Our_FS_total_prob)
+        print("=== Running MLs with Dimensionality Reduction (Our DR)")
+        Our_DR_total_prediction, Our_DR_total_prob = pr.run_save(A1, y_train, A2, y_test, A3, y_valid, 'DR.embedding', 'SMOTE', feature_size, result_dir)
+        total_DR_pre.append(Our_FS_total_prediction); total_DR_prob.append(Our_FS_total_prob)
+        i+=1
+    print('all results are saved in ', output_path)
+    return total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob
+
+data_path = '../data/Example'
+model_path = '../data/old_model'
+output_path = '../results/prediction/'
+
+fs = ml.feature_selectors()
+dr = ml.dimension_reducers()
+pr = ml.predictors()
+
+gene_name = '../data/gene_name_info/query_full_name'; gene_symb='../data/gene_name_info/query_symbol' 
+emb2simi=emb.embedding_vector()  
+
+words_list, index2word, syn0norm, _ = emb2simi.setting(model_path, gene_symb) 
+
+Xt = pd.read_csv(data_path+'/Example_X.csv').drop(columns='Unnamed: 0')
+y = pd.read_csv(data_path+'/Example_y.csv').drop(columns='Unnamed: 0').values
+
+total_FS_Pre, total_FS_prob, total_DR_pre, total_DR_prob = CVD_Prediction_with_FS_DR(data_path, Xt, y)
diff --git a/code/LICENSE b/code/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Jihye Moon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.