Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# REFERENCES:
# https://pandas.pydata.org/pandas-docs/version/1.5/index.html
# https://realpython.com/pandas-dataframe/
# https://github.com/afrozchakure/Internity-Summer-Internship-Work/blob/master/Blogs/Random_Forest_Classification/Random%20Forest%20Classifcation.ipynb
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# DEPENDENCIES:
# Pandas, Numpy, Matplotlib, Scikit-learn, PyQt5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sys import exit
# SVI columns used as predictors ("features") for the random-forest model below.
# NOTE(review): the "EP_" prefix presumably denotes estimated percentages —
# confirm against the CDC/ATSDR SVI data dictionary.
EXPLANATORY_VARIABLES = ["EP_POV150", "EP_UNEMP", "EP_HBURD", "EP_NOHSDP", "EP_UNINSUR", "EP_AGE65",
"EP_AGE17", "EP_DISABL", "EP_SNGPNT", "EP_LIMENG", "EP_MINRTY", "EP_MUNIT",
"EP_MOBILE", "EP_CROWD", "EP_NOVEH", "EP_GROUPQ"]
# Replace this with the directory that holds the files "PLACES.csv" and "SVIUSCounty.csv"
# Note: Those two files must have those exact names for the program to run properly
DIRECTORY = "C:\\Users\\tjb19003\\Documents\\CSE 3504"
def extract_lat(point):
    """Return the first coordinate of a WKT-style point string as a float.

    Expects input shaped like "POINT (-72.68 41.76)" and returns the number
    immediately after the opening parenthesis.

    NOTE(review): in WKT "POINT (x y)" the first coordinate is conventionally
    the longitude, not the latitude — the function name may be misleading;
    confirm against the format of the PLACES "Geolocation" column.

    Raises:
        TypeError: if ``point`` is not a string.
        ValueError: if ``point`` has no '(' or the token is not a number.
    """
    if not isinstance(point, str):
        raise TypeError("point must be a string, got %s" % type(point).__name__)
    start_index = point.find('(')  # index of the first '(' character
    if start_index == -1:
        # Previously this fell through and float() failed on garbage;
        # fail explicitly with a useful message instead.
        raise ValueError("no '(' found in point string: %r" % (point,))
    end_index = point.find(' ', start_index)  # first space after '('
    if end_index == -1:
        # No space after '(': take everything from after '(' to the end.
        return float(point[start_index + 1:])
    return float(point[start_index + 1:end_index])
def extract_long(point):
    """Return the second coordinate of a WKT-style point string as a float.

    Expects input shaped like "POINT (-72.68 41.76)" and returns the number
    between the second space and the closing parenthesis.

    NOTE(review): in WKT "POINT (x y)" the second coordinate is conventionally
    the latitude — the function name may be misleading; confirm against the
    format of the PLACES "Geolocation" column.

    Raises:
        TypeError: if ``point`` is not a string.
        ValueError: if the extracted token is not a valid number.
    """
    if not isinstance(point, str):
        raise TypeError("point must be a string, got %s" % type(point).__name__)
    # Index of the first space character (after the "POINT" keyword)
    first_space_index = point.find(' ')
    # Index of the second space character (between the two coordinates)
    second_space_index = point.find(' ', first_space_index + 1)
    # Index of the first ')' after the second space
    parenthesis_index = point.find(')', second_space_index)
    if parenthesis_index == -1:
        # BUG FIX: with no ')', find() returns -1 and the old slice
        # point[...:-1] silently dropped the number's last character.
        return float(point[second_space_index + 1:])
    # Substring between the second space and the first ')'
    return float(point[second_space_index + 1:parenthesis_index])
def county_state_name(county_name, state_name):
    """Format a county/state pair as "{COUNTY_NAME} County, {STATE_NAME}".

    This matches the format of the SVI dataset's "LOCATION" column so the
    two datasets can be joined on it.
    """
    return f"{county_name} County, {state_name}"
##################################################################################
if __name__ == "__main__":
    # ---------- PT. 1: PLACES ---------- #
    print("Setting up PLACES...")
    # Reading PLACES dataset
    places_filepath = os.path.join(DIRECTORY, "PLACES.csv")
    try:
        places = pd.read_csv(places_filepath)
    except FileNotFoundError:
        # Narrowed from bare Exception: the message only describes the
        # missing-file case; other read errors should surface normally.
        print("!!! ERROR: PLACES.csv could not be found !!!")
        print("(Check that the directory of the file matches the one specified above)\n")
        exit(1)  # non-zero status so callers can detect the failure
    # Narrowing PLACES data to focus on depression outcomes:
    # age-adjusted prevalence of depression among adults (>=18).
    places_condition = ((places["Category"] == "Health Outcomes")
                        & (places["Measure"] == "Depression among adults aged >=18 years")
                        & (places["Data_Value_Type"] == "Age-adjusted prevalence"))
    depression_data = places.loc[places_condition]
    # Parsing geolocation data (latitude / longitude) from PLACES — unused for now.
    # geolocation = depression_data["Geolocation"]
    # geolocation_lat = pd.Series([extract_lat(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_long = pd.Series([extract_long(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_value = pd.concat([geolocation_lat, geolocation_long], axis=1)
    # Extracting depression values and reindexing from 0 so the later concat
    # with the freshly-indexed "CountyState" Series aligns row-by-row.
    depression_value = pd.DataFrame(depression_data["Data_Value"]).reset_index(drop=True)

    # ---------- PT. 2: SVI ---------- #
    print("Setting up SVI...")
    # Reading Social Vulnerability Index (SVI) dataset
    social_vulnerability_filepath = os.path.join(DIRECTORY, "SVIUSCounty.csv")
    try:
        svi = pd.read_csv(social_vulnerability_filepath)
    except FileNotFoundError:
        print("!!! ERROR: SVIUSCounty.csv could not be found !!!")
        print("(Check that the directory of the file matches the one specified above)\n")
        exit(1)
    # Keep only the explanatory-variable columns used as model features.
    svi_values = svi[EXPLANATORY_VARIABLES]

    # ---------- PT. 3: Joining Data ---------- #
    print("Joining data...")
    # Pairing SVI results with associated counties (as specified in "LOCATION" col)
    svi_with_counties = pd.concat([svi["LOCATION"], svi_values], axis=1)
    # Manipulating depression location data to fit format "{COUNTY_NAME} County, {STATE_NAME}"
    # This is necessary because this is the same format that is used in "LOCATION"
    # The new column is called "CountyState"
    depression_county_state = pd.Series([county_state_name(str(c), str(s)) for c, s in
                                         zip(depression_data["LocationName"], depression_data["StateDesc"])],
                                        name="CountyState", copy=False)
    # Pairing depression results with associated counties (via "CountyState" column)
    depression_with_counties = pd.concat([depression_county_state, depression_value], axis=1)
    # Joining SVI and depression data by comparing "LOCATION" to the new "CountyState" column.
    # Note: how="inner" ensures that only rows in which the counties match are merged.
    svi_depression_merged = pd.merge(svi_with_counties, depression_with_counties, how="inner",
                                     left_on="LOCATION", right_on="CountyState").drop("CountyState", axis=1)

    # ---------- PT. 4: Building Random Forest ---------- #
    print("Building Random Forest...")
    # Predictors (X) are the SVI features; outcome (y) is depression prevalence.
    predictors = svi_depression_merged.drop(["LOCATION", "Data_Value"], axis=1)
    outcome = svi_depression_merged["Data_Value"]
    X = predictors
    y = outcome
    from sklearn.model_selection import train_test_split
    # 70/30 train/test split; fixed random_state for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    from sklearn.ensemble import RandomForestRegressor
    # Regressor (not classifier) because depression prevalence is continuous.
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    # Predict continuous values on the held-out test set.
    y_pred = model.predict(X_test)

    # ---------- PT. 5: Assessing Random Forest ---------- #
    print("Assessing random forest...\n")
    # model.score() on a regressor reports R^2 on the test set.
    print("Model Score: ", model.score(X_test, y_test))
    # Assess model performance via Mean Squared Error (MSE)
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse, "\n")
    # Importance scores (mean decrease in impurity) per explanatory variable,
    # with inter-tree standard deviation for the plot's error bars.
    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=EXPLANATORY_VARIABLES)
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    # Printing importance scores
    for feature, score in forest_importances.items():
        print(f"Feature : {feature}, Score : {round(score*100, 3)}%")
    print("\nWould you like a graph of the above data?")
    ans = ""
    # Only accept an explicit 0/1 answer. (The previous isdigit() check
    # accepted any digit string, e.g. "7", and silently skipped the graph.)
    while ans not in ("0", "1"):
        ans = input("Enter 0 for No and 1 for Yes: ")
    if int(ans) == 1:
        # Setting up plot via Matplotlib
        fig, ax = plt.subplots()
        forest_importances.plot.bar(yerr=std, ax=ax)
        ax.set_title("Feature importances using MDI")
        ax.set_ylabel("Mean decrease in impurity")
        fig.tight_layout()
        # Show graph
        plt.show()