diff --git a/main.py b/main.py index 54a295e..d6b5a05 100644 --- a/main.py +++ b/main.py @@ -4,22 +4,29 @@ from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score +K = 5 +random_seed = 0 +test_ratio = 0.2 +def read_data(): + with open('./breast-cancer-wisconsin.data', 'r') as data_fid: + lines = data_fid.readlines() + records = [] + for line in lines: + if '?' in line: + line = line.replace('?', '11') + line = line.split(',') + line = [int(item) for item in line][1:] + records.append(line) + records = np.array(records) + X = records[:, :-1] + y = records[:, -1] + return X, y -with open('./breast-cancer-wisconsin.data', 'r') as data_fid: - lines = data_fid.readlines() - records = [] - for line in lines: - if '?' in line: - line = line.replace('?', '11') - line = line.split(',') - line = [int(item) for item in line][1:] - records.append(line) -records = np.array(records) -X = records[:,:-1] -y = records[:,-1] -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) +X, y = read_data() +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_seed) +# Naive Bayes clf = CategoricalNB() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) @@ -29,3 +36,5 @@ # X_2 = X[Y==2] #benign # X_4 = X[Y==4] #cancer + +