# Classifier_with_biotyping_knowledge/dataset_preprocessing.py
import os | |
#import pathos | |
import pickle | |
import numpy as np | |
import pandas as pd | |
import multiprocessing | |
import math | |
import sys | |
#import seaborn as sns | |
#!/bin/python3 | |
import os | |
import numpy as np | |
import pandas as pd | |
from scipy.io import loadmat | |
from queue import Queue | |
from threading import Thread | |
from multiprocessing import Pool | |
from multiprocessing.dummy import Pool as ThreadPool | |
from mpl_toolkits.mplot3d import Axes3D | |
import numpy as np | |
import pandas as pd | |
from sklearn import metrics | |
import random | |
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis | |
from sklearn.model_selection import cross_val_score | |
from sklearn.model_selection import cross_validate | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.neural_network import MLPClassifier | |
from sklearn import svm | |
from sklearn.model_selection import KFold | |
from scipy.stats import norm | |
from sklearn.linear_model import LogisticRegressionCV | |
#import statsmodels.api as sm | |
from sklearn.cross_decomposition import CCA | |
#select training data | |
from sklearn.utils import shuffle | |
from scipy.stats import spearmanr | |
from scipy.stats import ranksums | |
from sklearn.linear_model import LinearRegression | |
import os | |
import os.path | |
import math | |
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    # NOTE(review): unpickling executes arbitrary code; only point this at
    # files written by this pipeline.
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def _covariate_table(subjects, gender, age, head_motion):
    """Join the nuisance covariates onto *subjects* (a one-column "Subject" frame).

    The result has the columns Subject, Gender, Age_in_Yrs, totaln and
    "interactive" (Gender * Age_in_Yrs). Inner merges are used, so a subject
    missing from any covariate source is dropped from the result.
    """
    table = subjects.merge(gender, on="Subject")
    table = table.merge(age, on="Subject")
    table = table.merge(head_motion, on="Subject")
    table["interactive"] = table["Gender"] * table["Age_in_Yrs"]
    return table


def generate_AUD_without_comorbidity_dataset(train_ratio=0.8,folder="",anothersession=False):
    """Build the alcohol-label dataset restricted to subjects without tobacco/marijuana labels.

    Loads the connectivity table for one session, keeps subjects that also
    appear in the psychiatric and head-motion tables, keeps only rows whose
    tobacco and marijuana labels are both False, and regresses gender, age,
    gender*age and the missing-frame count out of every connectivity column.
    Corrected tables are cached as pickles in *folder* and reused on later calls.

    Parameters
    ----------
    train_ratio : float
        Fraction of subjects assigned to the training split (main session
        only). The split is shuffled with a fixed seed (random_state=0).
    folder : str
        Directory holding the cache files and ``regress_models.p``. An empty
        string means the current working directory.
    anothersession : bool
        False: use the "r1" dataset, fit the per-column regressions on the
        training split and store them. True: use the "r2" dataset, treat every
        subject as "train" and apply the regressions stored by a previous
        main-session run (``regress_models.p`` must already exist in *folder*).

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)`` when ``anothersession`` is
        false, ``(train_x, train_y)`` otherwise.
    """
    subject_col = "Subject"
    raw_dir = os.path.join("..", "raw_data")
    session_tag = "r2" if anothersession else "r1"
    covariates = ["Gender", "Age_in_Yrs", "interactive", "totaln"]

    # Connectivity features, limited to subjects present in the psychiatric table.
    func_conn = _load_pickle(
        os.path.join("..", "fmri_data_processing", "AAL_dataset_" + session_tag + ".pickle")
    )
    psychiatric = pd.read_csv(os.path.join(raw_dir, "psychiatric_data_HCP.csv"))
    conn_columns = func_conn.columns
    func_conn = func_conn.merge(psychiatric[subject_col], on=subject_col)[conn_columns]

    # Head-motion table: per-run missing-frame counts and their sum.
    # NOTE(review): the Rest1_* columns are used for both sessions, including
    # "r2" -- confirm this is intended.
    head_motion = pd.read_csv(os.path.join(raw_dir, "missing_frame_number.csv"))
    head_motion = head_motion[["ID", "Rest1_LR", "Rest1_RL", "Rest1_LR _num", "Rest1_RL _num"]].copy()
    head_motion.columns = [subject_col, "s1r", "s2r", "s1n", "s2n"]
    head_motion["totaln"] = head_motion["s1n"] + head_motion["s2n"]
    func_conn = func_conn.merge(head_motion[subject_col], on=subject_col)

    # Substance-use labels.
    labels_alc = pd.read_csv(os.path.join(raw_dir, "Full_alc_label.csv"))
    labels_tob = pd.read_csv(os.path.join(raw_dir, "full_tob_label.csv"))
    labels_mar = pd.read_csv(os.path.join(raw_dir, "Full_Marijuana_label.csv"))

    # NOTE(review): the three label files are compared row by row (by index),
    # which presumably relies on them listing subjects in the same order -- verify.
    no_tobacco = labels_tob["label"].eq(False)
    no_marijuana = labels_mar["label"].eq(False)
    alcohol_only = labels_alc["label"].eq(True) & no_tobacco & no_marijuana
    no_substance = labels_alc["label"].eq(False) & no_tobacco & no_marijuana
    selected_labels = labels_alc[alcohol_only | no_substance]

    func_conn_i = func_conn.merge(selected_labels[subject_col], on=subject_col)
    # NOTE(review): features keep the connectivity table's row order while the
    # labels keep the label file's order; the positional split below presumably
    # assumes both orders agree -- verify.
    labels_i = selected_labels.merge(func_conn_i[subject_col], on=subject_col)

    # Train/test flags: a fixed-seed shuffle of n_train True values followed by
    # False values; every subject is "train" for the other session.
    n_subjects = func_conn_i.shape[0]
    n_train = n_subjects if anothersession else int(n_subjects * train_ratio)
    split_flags = [True] * n_train + [False] * (n_subjects - n_train)
    train_mask = np.asarray(shuffle(split_flags, random_state=0), dtype=bool)
    test_mask = ~train_mask

    # Explicit copies: the feature frames are modified in place below.
    train_x = func_conn_i[train_mask].copy()
    train_y = labels_i[train_mask]
    test_x = func_conn_i[test_mask].copy()
    test_y = labels_i[test_mask]

    # Nuisance covariates, built the same way for both splits.
    hcp_summary = pd.read_csv(os.path.join(raw_dir, "HCP_summary_S1206.csv"))
    gender = hcp_summary[["Subject", "Gender"]].replace(["M", "F"], [0, 1])
    age = psychiatric[["Subject", "Age_in_Yrs"]]
    motion = head_motion[[subject_col, "totaln"]]
    # Hoisted out of the per-column loops: the design matrix never changes.
    train_cov = _covariate_table(train_x[["Subject"]], gender, age, motion)[covariates].to_numpy()

    # The first column is skipped; presumably it is the subject id.
    feature_columns = func_conn_i.columns[1:]

    if not anothersession:
        train_cache = os.path.join(folder, "exclude_gender_conn_train_comorbidity.p")
        test_cache = os.path.join(folder, "exclude_gender_conn_train_comorbidity_test.p")
        if os.path.exists(train_cache) and os.path.exists(test_cache):
            train_x = _load_pickle(train_cache)
            test_x = _load_pickle(test_cache)
        else:
            test_cov = _covariate_table(test_x[["Subject"]], gender, age, motion)[covariates].to_numpy()
            # An empty test split (e.g. train_ratio=1.0) has nothing to correct,
            # and predict() rejects zero-row input.
            has_test_rows = len(test_x) > 0
            regression_models = {}
            for position, column in enumerate(feature_columns):
                print("model_correction", position)
                model = LinearRegression(fit_intercept=True)
                model.fit(train_cov, train_x[column])
                # Zeroing the intercept makes predict() return only the
                # covariate terms, so the fitted offset stays in the residuals.
                model.intercept_ = 0
                train_x.loc[:, [column]] = train_x[column] - model.predict(train_cov)
                if has_test_rows:
                    test_x.loc[:, [column]] = test_x[column] - model.predict(test_cov)
                regression_models[column] = model
            _dump_pickle(regression_models, os.path.join(folder, "regress_models.p"))
            _dump_pickle(train_x, train_cache)
            _dump_pickle(test_x, test_cache)
        return train_x, train_y, test_x, test_y

    other_cache = os.path.join(folder, "exclude_gender_conn_train_comorbidity_other.p")
    if os.path.exists(other_cache):
        train_x = _load_pickle(other_cache)
    else:
        # Reuse the models fitted on the main session instead of refitting.
        regression_models = _load_pickle(os.path.join(folder, "regress_models.p"))
        # Casts every column, including the first (id) column, to float.
        train_x = train_x.astype(float)
        for position, column in enumerate(feature_columns):
            print("model_correction", position)
            model = regression_models[column]
            train_x.loc[:, column] = train_x[column] - model.predict(train_cov)
        _dump_pickle(train_x, other_cache)
    return train_x, train_y