# REFERENCES:
# https://pandas.pydata.org/pandas-docs/version/1.5/index.html
# https://realpython.com/pandas-dataframe/
# https://github.com/afrozchakure/Internity-Summer-Internship-Work/blob/master/Blogs/Random_Forest_Classification/Random%20Forest%20Classifcation.ipynb
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

# DEPENDENCIES:
# Pandas, Numpy, Matplotlib, Scikit-learn, PyQt5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sys import exit
EXPLANATORY_VARIABLES = ["EP_POV150", "EP_UNEMP", "EP_HBURD", "EP_NOHSDP", "EP_UNINSUR", "EP_AGE65",
                         "EP_AGE17", "EP_DISABL", "EP_SNGPNT", "EP_LIMENG", "EP_MINRTY", "EP_MUNIT",
                         "EP_MOBILE", "EP_CROWD", "EP_NOVEH", "EP_GROUPQ"]
# Replace this with the directory that holds the files "PLACES.csv" and "SVIUSCounty.csv"
# Note: Those two files must have those exact names for the program to run properly
DIRECTORY = "C:\\Users\\tjb19003\\Documents\\CSE 3504"
def extract_lat(point):
    """Extract the first coordinate from a point string of the form "POINT (x y)"."""
    if type(point) is not str:
        raise TypeError("point must be a string")
    start_index = point.find('(')             # Index of the first '(' character
    end_index = point.find(' ', start_index)  # Index of the first space after '('
    if end_index == -1:  # If no space follows '(', take everything from '(' to the end
        return float(point[start_index+1:])
    return float(point[start_index+1:end_index])  # Substring from '(' to the first space
def extract_long(point):
    """Extract the second coordinate from a point string of the form "POINT (x y)"."""
    if type(point) is not str:
        raise TypeError("point must be a string")
    # Find the index of the first space character
    first_space_index = point.find(' ')
    # Find the index of the second space character
    second_space_index = point.find(' ', first_space_index + 1)
    # Find the index of the first ')' character after the second space
    parenthesis_index = point.find(')', second_space_index)
    # Return the substring between the second space and the first ')'
    return float(point[second_space_index+1:parenthesis_index])
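# Usage sketch (hypothetical coordinates; assumes the PLACES "Geolocation" column
# stores WKT-style points of the form "POINT (x y)"):
#   extract_lat("POINT (-72.65 41.56)")   # -> -72.65 (first coordinate)
#   extract_long("POINT (-72.65 41.56)")  # ->  41.56 (second coordinate)
# Caveat: WKT conventionally lists longitude (x) before latitude (y), so these
# names may be swapped relative to that convention; both helpers are currently
# unused because the join below matches on county names rather than coordinates.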
def county_state_name(county_name, state_name):
    return county_name + " County, " + state_name
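# Example: county_state_name("Hartford", "Connecticut") -> "Hartford County, Connecticut",
# which matches the "LOCATION" format used in the SVI dataset.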
##################################################################################
if __name__ == "__main__": | |
# ---------- PT. 1: PLACES ---------- # | |
print("Setting up PLACES...") | |
# Reading PLACES dataset | |
places_filepath = os.path.join(DIRECTORY, "PLACES.csv") | |
try: | |
places = pd.read_csv(places_filepath) | |
except Exception: | |
print("!!! ERROR: PLACES.csv could not be found !!!") | |
print("(Check that the directory of the file matches the one specified above)\n") | |
exit() | |
    # Narrowing PLACES data to focus on depression outcomes
    places_condition = ((places["Category"] == "Health Outcomes")
                        & (places["Measure"] == "Depression among adults aged >=18 years")
                        & (places["Data_Value_Type"] == "Age-adjusted prevalence"))
    depression_data = places.loc[places_condition]
    # Parsing geolocation data (latitude / longitude) from PLACES
    # (left disabled: the join below matches on county names, not coordinates)
    # geolocation = depression_data["Geolocation"]
    # geolocation_lat = pd.Series([extract_lat(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_long = pd.Series([extract_long(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_value = pd.concat([geolocation_lat, geolocation_long], axis=1)
    # Extracting depression data and reindexing from 0
    depression_value = pd.DataFrame(depression_data["Data_Value"]).reset_index(drop=True)
    #print(geolocation_value)
    #print(depression_value)
    # ---------- PT. 2: SVI ---------- #
    print("Setting up SVI...")
    # Reading Social Vulnerability Index (SVI) dataset
    social_vulnerability_filepath = os.path.join(DIRECTORY, "SVIUSCounty.csv")
    try:
        svi = pd.read_csv(social_vulnerability_filepath)
    except FileNotFoundError:
        print("!!! ERROR: SVIUSCounty.csv could not be found !!!")
        print("(Check that the directory of the file matches the one specified above)\n")
        exit()
    svi_values = svi[EXPLANATORY_VARIABLES]
    #print(svi_values)
    # ---------- PT. 3: Joining Data ---------- #
    print("Joining data...")
    # Pairing SVI results with associated counties (as specified in "LOCATION" col)
    svi_with_counties = pd.concat([svi["LOCATION"], svi_values], axis=1)
    # Reshaping depression location data to the format "{COUNTY_NAME} County, {STATE_NAME}",
    # the same format used in the SVI "LOCATION" column; the new column is called "CountyState"
    depression_county_state = pd.Series([county_state_name(str(c), str(s)) for c, s in
                                         zip(depression_data["LocationName"], depression_data["StateDesc"])],
                                        name="CountyState", copy=False)
    # Pairing depression results with associated counties (via "CountyState" column)
    depression_with_counties = pd.concat([depression_county_state, depression_value], axis=1)
    # Joining SVI and depression data by comparing the "LOCATION" and new "CountyState" columns
    # Note: The keyword "inner" ensures that only rows in which the counties match are merged
    svi_depression_merged = pd.merge(svi_with_counties, depression_with_counties, how="inner",
                                     left_on="LOCATION", right_on="CountyState").drop("CountyState", axis=1)
    #print(svi_depression_merged)
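    # Optional sanity check (sketch; uncomment to see how many counties survived
    # the inner join -- rows unmatched on either side are silently dropped):
    # print(f"Merged {len(svi_depression_merged)} of {len(svi_with_counties)} SVI counties")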
    # ---------- PT. 4: Building Random Forest ---------- #
    print("Building Random Forest...")
    predictors = svi_depression_merged.drop(["LOCATION", "Data_Value"], axis=1)
    outcome = svi_depression_merged["Data_Value"]
    X = predictors
    y = outcome
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
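    # train_test_split holds out 30% of the counties for testing (test_size=0.3);
    # fixing random_state=0 makes the split, and hence the scores below, reproducible.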
    from sklearn.ensemble import RandomForestRegressor
    # Using a regressor (rather than a classifier) because the outcome,
    # depression prevalence, is continuous
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    # Predict continuous values
    y_pred = model.predict(X_test)
    # ---------- PT. 5: Assessing Random Forest ---------- #
    print("Assessing random forest...\n")
    # Getting the score for our model (for a regressor, .score() returns R^2 on the test set)
    print("Model Score: ", model.score(X_test, y_test))
    # Assess model performance via Mean Squared Error (MSE)
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse, "\n")
    # Calculating importance scores for each of the explanatory variables (aka "features")
    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=EXPLANATORY_VARIABLES)
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
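    # feature_importances_ is the impurity-based (MDI) importance averaged over the
    # forest's 100 trees; the per-tree standard deviation computed above becomes the
    # error bars on the bar chart below.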
    # Printing importance scores
    for feature, score in forest_importances.items():
        print(f"Feature : {feature}, Score : {round(score*100, 3)}%")
print("\nWould you like a graph of the above data?") | |
ans = "" | |
while not str(ans).isdigit(): | |
ans = input("Enter 0 for No and 1 for Yes: ") | |
if int(ans) == 1: | |
# Setting up plot via Matplotlib | |
fig, ax = plt.subplots() | |
forest_importances.plot.bar(yerr=std, ax=ax) | |
ax.set_title("Feature importances using MDI") | |
ax.set_ylabel("Mean decrease in impurity") | |
fig.tight_layout() | |
# Show graph | |
plt.show() |