Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
from sklearn import neighbors
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# sklearn.cross_validation was removed in scikit-learn 0.20; its contents now
# live in sklearn.model_selection. Keep the old path as a fallback so the
# script still imports on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Load the dataset from the working directory.
df = pd.read_csv("./metpath.csv")

# Print the shape (rows, columns) of the dataset.
print(df.shape)

# Print the first row among the entries with fewer than 5 drinks.
# The .iloc method on dataframes allows us to index by position; it raises
# IndexError if no row matches the condition.
print(df[df["drinks"] < 5].iloc[0])

# Print the first row among the entries with more than 5 drinks.
print(df[df["drinks"] > 5].iloc[0])

# Show a histogram of the recorded drinks values (blocks until the window
# is closed when an interactive backend is in use).
plt.hist(df["drinks"])
plt.show()

# Report the mean with two decimals. "%d" would truncate the fractional part
# and raise ValueError when the mean is NaN.
print("MEAN of DRINKS: %.2f" % df["drinks"].mean())

# Keep only the rows with more than 5 drinks (rows with exactly 5 are
# excluded), then drop every row that still contains a missing value.
df = df[df["drinks"] > 5]
df = df.dropna(axis=0)
# Cluster the rows with k-means (5 clusters) and draw them on a 2-component
# PCA projection, colouring each point by its cluster label.
kmeans_model = KMeans(n_clusters=5, random_state=1)

# Both k-means and PCA need purely numeric input. select_dtypes is the public
# replacement for the private DataFrame._get_numeric_data(); "bool" is listed
# because the private helper also kept boolean columns.
good_columns = df.select_dtypes(include=["number", "bool"])

kmeans_model.fit(good_columns)
labels = kmeans_model.labels_

# Project onto the first two principal components purely for plotting.
pca_2 = PCA(2)
plot_columns = pca_2.fit_transform(good_columns)
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
plt.show()
# Show how strongly every column correlates with drinks, to get a first idea
# of the relationship between drinks and the other measurements.
print(df.corr()["drinks"])

# The target is "drinks"; every other column except "f" is used as a feature.
target = "drinks"
columns = [name for name in df.columns.tolist() if name not in ("drinks", "f")]

# 80/20 train/test split: sample 80% of the rows reproducibly for training and
# hold out the remaining rows for evaluating the fitted models.
train = df.sample(frac=0.8, random_state=1)
in_train = df.index.isin(train.index)
test = df.loc[~in_train]

print(train.shape)
print(test)
print(test.shape)
# Baseline: linear regression of drinks on the feature columns.
# (Author's note: this model did not give a very nice prediction.)
model = LinearRegression()
model.fit(train[columns], train[target])
print("$$$$$$$")
predictions = model.predict(test[columns])
print(predictions)
# mean_squared_error is documented as (y_true, y_pred): pass the observed
# values first and the predictions second.
print(mean_squared_error(test[target], predictions))
print(test[columns])

# Scatter each feature against the target on one figure. The label/legend and
# axis names make the five overlaid series distinguishable.
feature_colors = [
    ("mcv", "black"),
    ("alkphos", "blue"),
    ("sgpt", "red"),
    ("sgot", "green"),
    ("gammagt", "yellow"),
]
for feature, color in feature_colors:
    plt.scatter(train[target], train[feature], color=color, label=feature)
plt.xlabel(target)
plt.ylabel("feature value")
plt.legend()
plt.show()
# Random forest regressor: 100 trees, at least 10 samples per leaf, with a
# fixed seed so the run is reproducible.
forr = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
forr.fit(train[columns], train[target])
fpredictions = forr.predict(test[columns])
print(fpredictions)
# mean_squared_error is documented as (y_true, y_pred).
print(mean_squared_error(test[target], fpredictions))
# Decision tree regressor limited to depth 2. The seed is fixed, like for the
# other seeded models in this script, so tie-breaking between equally good
# splits is reproducible.
dtree = DecisionTreeRegressor(max_depth=2, random_state=1)
dtree.fit(train[columns], train[target])
dpredictions = dtree.predict(test[columns])
print(dpredictions)
# mean_squared_error is documented as (y_true, y_pred).
print(mean_squared_error(test[target], dpredictions))

# Build a Graphviz description of the fitted tree. Passing feature_names makes
# the nodes show column names instead of positional X[i] indices.
dot_data = tree.export_graphviz(dtree, out_file=None, feature_names=columns)
graph = pydotplus.graph_from_dot_data(dot_data)
# Uncomment to render the tree to a PDF:
# graph.write_pdf("./outputs/DecisionTreeRegression.pdf")
# Support vector regression. Only the RBF kernel is fitted here.
svrr = SVR(kernel='rbf', C=1e3, gamma=0.1)
svrr.fit(train[columns], train[target])
rp = svrr.predict(test[columns])
print(rp)
# mean_squared_error is documented as (y_true, y_pred).
print(mean_squared_error(test[target], rp))

# Stop here. exit() is a helper injected by the site module for interactive
# use (absent under `python -S`); raising SystemExit is the portable form.
raise SystemExit