Classifier_with_biotyping_knowledge/classficiation.py
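# Overview (inferred from the code below): this script trains a multi-task
# feed-forward network on pairwise functional-connectivity (FC) features. One
# output head predicts the AUD subtype produced by the upstream clustering
# ("biotyping") step, the other predicts AUD vs. healthy control.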
import pickle
import pandas as pd
import os
import numpy as np
from sklearn import metrics
import sys
from dataset_preprocessing import generate_AUD_without_comorbidity_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    def __init__(self, column_training, inputsize, clusternumber, structure=[20, 5]):
        super(Net, self).__init__()
        # Map each AAL ROI to its (0-based) region group id.
        region_roi = pd.read_csv('..' + os.sep + "raw_data" + os.sep + "aal_roi_list_Regions.csv")
        ROIlist = (region_roi["Region"] - 1).tolist()
        # For every FC feature (a pair of ROIs), compute a symmetric
        # (order-independent) region-pair id; the factor 8 assumes region ids in 0..7.
        mark = []
        candicate_hm_region = set()
        for c in range(len(column_training)):
            i = column_training[c][0]
            j = column_training[c][1]
            pair_id = min(ROIlist[i] * 8 + ROIlist[j], ROIlist[j] * 8 + ROIlist[i])
            candicate_hm_region.add(pair_id)
            mark.append(pair_id)
        candicate_hm_region = list(candicate_hm_region)
        self.ROImark = torch.IntTensor(mark)
        Num_pROI = len(candicate_hm_region)  # number of distinct region pairs
        # One small linear layer per region pair, applied only to the features
        # that belong to that pair.
        self.pairwise_ROI = nn.ModuleList()
        for idx in range(len(candicate_hm_region)):
            self.pairwise_ROI.append(nn.Linear(int((self.ROImark == candicate_hm_region[idx]).sum()), structure[0]))
        self.candicate_hm_region = candicate_hm_region
        # Shared fully connected layers on the concatenated per-pair embeddings.
        self.fclist = nn.ModuleList()
        cur_dim = structure[0] * len(self.candicate_hm_region)
        self.structure = structure
        for i in range(len(structure) - 1):
            self.fclist.append(nn.Linear(cur_dim, structure[i + 1]))
            cur_dim = structure[i + 1]
        cur_dim = structure[-1]
        # Two output heads: subtype logits and a single AUD-vs-control logit.
        self.final = nn.Linear(cur_dim, clusternumber)
        self.final2 = nn.Linear(cur_dim, 1)
        self.dropout1 = nn.Dropout(0.5)
    def forward(self, x):
        # Apply each region-pair specific linear layer to its own slice of the
        # input columns and concatenate the resulting embeddings.
        region_x = self.pairwise_ROI[-1](x[:, self.ROImark == self.candicate_hm_region[-1]])
        for i in range(len(self.pairwise_ROI) - 1):
            region_x = torch.cat([region_x, self.pairwise_ROI[i](x[:, self.ROImark == self.candicate_hm_region[i]])], dim=1)
        region_x = F.relu(region_x)
        region_x = self.dropout1(region_x)
        # Shared fully connected layers.
        for i in range(len(self.structure) - 1):
            region_x = self.fclist[i](region_x)
            region_x = F.relu(region_x)
        x_final = region_x
        # Subtype logits; as written, the binary head is applied on top of the
        # subtype logits, which assumes clusternumber equals the last hidden width.
        x_final = self.final(x_final)
        output = x_final
        output2 = self.final2(x_final)
        return output, output2
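
# Minimal standalone sketch (not part of the original pipeline) of the grouping
# idea used by Net: input features are partitioned by a per-feature group id,
# each group goes through its own nn.Linear, and the group outputs are
# concatenated. All names, sizes, and group ids below are made up for illustration.
def _pairwise_group_demo():
    demo_group_ids = torch.IntTensor([0, 0, 1, 1, 1, 2])  # one group id per input feature
    demo_groups = [0, 1, 2]
    demo_layers = nn.ModuleList(
        [nn.Linear(int((demo_group_ids == g).sum()), 2) for g in demo_groups]
    )
    demo_x = torch.randn(4, 6)  # batch of 4 samples, 6 features
    demo_out = torch.cat(
        [layer(demo_x[:, demo_group_ids == g]) for layer, g in zip(demo_layers, demo_groups)],
        dim=1,
    )
    print(demo_out.shape)  # torch.Size([4, 6]): 3 groups x 2 hidden units each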
def conduct_classification(alpha, dim):
    # Load the clustering (biotyping) result for the given alpha and dimension,
    # plus the train/test FC feature tables.
    folder = "selected_result" + os.sep + "selected_fc_features"
    cluster_data_folder = folder + os.sep + str(alpha)
    cluster_labels = pickle.load(open(cluster_data_folder + os.sep + str(dim) + "_clustering_result.p", "rb"))
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = generate_AUD_without_comorbidity_dataset(folder=folder)
    x_raw_other, y_raw_other = generate_AUD_without_comorbidity_dataset(folder=folder, anothersession=True)
    # AUD subjects keep their cluster label as subtype; healthy controls get an
    # extra class (max subtype label + 1).
    subtype_classifier_train_x_am = train_x_raw[train_y_raw["label"] == True]
    subtype_classifier_train_x_hc = train_x_raw[train_y_raw["label"] == False]
    cluster_result_withsubject = pd.DataFrame({"Subject": subtype_classifier_train_x_am["Subject"], "Subtype": cluster_labels})
    cluster_result_withsubject_hc = pd.DataFrame({"Subject": subtype_classifier_train_x_hc["Subject"], "Subtype": cluster_labels.max() + 1})
    cluster_result_withsubject = pd.concat([cluster_result_withsubject, cluster_result_withsubject_hc], axis=0)
    subtype_classifier_train_x_am = subtype_classifier_train_x_am.drop("Subject", axis=1)
    subtype_classifier_train_x_hc = subtype_classifier_train_x_hc.drop("Subject", axis=1)
    test_x_raw = test_x_raw.drop("Subject", axis=1)
    column_training = subtype_classifier_train_x_am.columns.tolist()
    trainlabel = np.concatenate([cluster_labels, [(cluster_labels.max() + 1) for i in range(subtype_classifier_train_x_hc.shape[0])]], axis=0)
    traindata = np.concatenate([subtype_classifier_train_x_am, subtype_classifier_train_x_hc], axis=0)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Training hyper-parameters; ratio weights the subtype loss against the
    # binary AUD-vs-control loss.
    batch_size = 32
    epoches = 500
    learning_rate = 0.001
    ratio = 0.75
    seed = 0
    train_kwargs = {'batch_size': batch_size}
    test_kwargs = {}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)
    from torch.utils.data import TensorDataset, DataLoader
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    # Shuffle the training set and wrap it in tensors.
    traindata, trainlabel = shuffle(traindata, trainlabel, random_state=0)
    tensor_x = torch.Tensor(traindata)
    tensor_y = torch.Tensor(trainlabel).type(torch.long)
    test_y_raw = test_y_raw["label"]
    # Split the held-out subjects 50/50 into a validation set (model selection)
    # and a test set, stratified by the AUD/control label.
    validx, testx, validy, testy = train_test_split(test_x_raw, test_y_raw, test_size=0.5, random_state=0, stratify=test_y_raw)
    validx = validx.to_numpy()
    testx = testx.to_numpy()
    validy = validy.to_numpy()
    testy = testy.to_numpy()
    test_tensor_x = torch.Tensor(testx)
    test_tensor_y = torch.Tensor(testy).type(torch.long)
    test_kwargs['batch_size'] = testx.shape[0]
    test_loader = DataLoader(TensorDataset(test_tensor_x, test_tensor_y), **test_kwargs)
    valid_tensor_x = torch.Tensor(validx)
    valid_tensor_y = torch.Tensor(validy).type(torch.long)
    test_kwargs['batch_size'] = validx.shape[0]
    valid_loader = DataLoader(TensorDataset(valid_tensor_x, valid_tensor_y), **test_kwargs)
    # Output folder for the trained models.
    cluster_data_folder = "classification"
    try:
        os.mkdir(cluster_data_folder)
    except OSError as error:
        print(error)
    torch.manual_seed(seed)
    train_loader = DataLoader(TensorDataset(tensor_x, tensor_y), **train_kwargs)
    model = Net(column_training, traindata.shape[1], trainlabel.max() + 1, structure=[2, 5]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()
    maxvalidauc = -1
    minvalidloss = 1000
    # Multi-task loss: multi-class margin loss for the subtype head and binary
    # cross-entropy (with logits) for the AUD-vs-control head.
    bce = torch.nn.BCEWithLogitsLoss()
    mlm = torch.nn.MultiMarginLoss()
    for epoch in range(1, epoches + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output, output2 = model(data)
            # Weighted multi-task loss; labels below 4 (the hard-coded
            # healthy-control class id) are treated as AUD subtypes.
            l1 = ratio * mlm(output, target)
            l2 = (1 - ratio) * bce(output2, torch.unsqueeze((target < 4), dim=1).type_as(output2))
            loss = l1 + l2
            loss.backward()
            optimizer.step()
        model.eval()
        print(l1.item(), l2.item())
        with torch.no_grad():
            selected = False
            # Validation pass: keep the model with the best AUD-vs-control AUC.
            for data, target in valid_loader:
                data, target = data.to(device), target.to(device)
                output, o2 = model(data)
                valid_loss = bce(o2, target.unsqueeze(1).type_as(o2)).item()
                o2 = torch.sigmoid(o2)
                fpr, tpr, thresholds = metrics.roc_curve(target.cpu(), o2.cpu(), pos_label=1, drop_intermediate=False)
                valid_auc = metrics.auc(fpr, tpr)
                if maxvalidauc <= valid_auc:
                    maxvalidauc = valid_auc
                    selected = True
            # Test pass: report the AUC on the held-out test split.
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output, o2 = model(data)
                o2 = torch.sigmoid(o2)
                fpr, tpr, thresholds = metrics.roc_curve(target.cpu(), o2.cpu(), pos_label=1, drop_intermediate=False)
                auc = metrics.auc(fpr, tpr)
            if selected == True:
                maxtestedauc = auc
                print("save model in", epoch)
                torch.save(model.state_dict(), cluster_data_folder + os.sep + "best_dnn_model_" + str(seed) + "_" + str(learning_rate) + "_" + str(ratio) + ".model")
            print(epoch, "auc is ", valid_auc, auc, maxvalidauc, maxtestedauc, loss.item())
if __name__ == "__main__":
    conduct_classification(alpha=0.0005, dim=3)
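
# Usage note: running this script directly (python classficiation.py) assumes the
# upstream steps have already produced "../raw_data/aal_roi_list_Regions.csv", the
# pickled clustering result under
# "selected_result/selected_fc_features/<alpha>/<dim>_clustering_result.p", and the
# datasets returned by generate_AUD_without_comorbidity_dataset; the model with the
# best validation AUC is saved into the "classification" folder.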