Skip to content
Permalink
main
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# REFERENCES:
# https://pandas.pydata.org/pandas-docs/version/1.5/index.html
# https://realpython.com/pandas-dataframe/
# https://github.com/afrozchakure/Internity-Summer-Internship-Work/blob/master/Blogs/Random_Forest_Classification/Random%20Forest%20Classifcation.ipynb
# https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
# DEPENDENCIES:
# Pandas, Numpy, Matplotlib, Scikit-learn, PyQt5
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sys import exit
# SVI columns used as predictors ("features") for the random-forest model below.
# NOTE(review): the "EP_" prefix presumably denotes estimated percentages —
# confirm against the CDC/ATSDR SVI data dictionary.
EXPLANATORY_VARIABLES = ["EP_POV150", "EP_UNEMP", "EP_HBURD", "EP_NOHSDP", "EP_UNINSUR", "EP_AGE65",
"EP_AGE17", "EP_DISABL", "EP_SNGPNT", "EP_LIMENG", "EP_MINRTY", "EP_MUNIT",
"EP_MOBILE", "EP_CROWD", "EP_NOVEH", "EP_GROUPQ"]
# Replace this with the directory that holds the files "PLACES.csv" and "SVIUSCounty.csv"
# Note: Those two files must have those exact names for the program to run properly
DIRECTORY = "C:\\Users\\tjb19003\\Documents\\CSE 3504"
def extract_lat(point):
    """Return the first coordinate of a WKT-style point string as a float.

    Expects input shaped like "POINT (-72.68 41.76)" and returns the number
    immediately after the opening parenthesis.

    NOTE(review): in WKT "POINT (x y)" the first coordinate is conventionally
    the longitude, not the latitude — the function name may be misleading;
    confirm against the format of the PLACES "Geolocation" column.

    Raises:
        TypeError: if ``point`` is not a string.
        ValueError: if ``point`` has no '(' or the token is not a number.
    """
    if not isinstance(point, str):
        raise TypeError("point must be a string, got %s" % type(point).__name__)
    start_index = point.find('(')  # index of the first '(' character
    if start_index == -1:
        # Previously this fell through and float() failed on garbage;
        # fail explicitly with a useful message instead.
        raise ValueError("no '(' found in point string: %r" % (point,))
    end_index = point.find(' ', start_index)  # first space after '('
    if end_index == -1:
        # No space after '(': take everything from after '(' to the end.
        return float(point[start_index + 1:])
    return float(point[start_index + 1:end_index])
def extract_long(point):
    """Return the second coordinate of a WKT-style point string as a float.

    Expects input shaped like "POINT (-72.68 41.76)" and returns the number
    between the second space and the closing parenthesis.

    NOTE(review): in WKT "POINT (x y)" the second coordinate is conventionally
    the latitude — the function name may be misleading; confirm against the
    format of the PLACES "Geolocation" column.

    Raises:
        TypeError: if ``point`` is not a string.
        ValueError: if the extracted token is not a valid number.
    """
    if not isinstance(point, str):
        raise TypeError("point must be a string, got %s" % type(point).__name__)
    # Index of the first space character (after the "POINT" keyword)
    first_space_index = point.find(' ')
    # Index of the second space character (between the two coordinates)
    second_space_index = point.find(' ', first_space_index + 1)
    # Index of the first ')' after the second space
    parenthesis_index = point.find(')', second_space_index)
    if parenthesis_index == -1:
        # BUG FIX: with no ')', find() returns -1 and the old slice
        # point[...:-1] silently dropped the number's last character.
        return float(point[second_space_index + 1:])
    # Substring between the second space and the first ')'
    return float(point[second_space_index + 1:parenthesis_index])
def county_state_name(county_name, state_name):
    """Format a county/state pair as "{COUNTY_NAME} County, {STATE_NAME}".

    This matches the format of the SVI dataset's "LOCATION" column so the
    two datasets can be joined on it.
    """
    return f"{county_name} County, {state_name}"
##################################################################################
if __name__ == "__main__":
    # ---------- PT. 1: PLACES ---------- #
    print("Setting up PLACES...")
    # Reading PLACES dataset
    places_filepath = os.path.join(DIRECTORY, "PLACES.csv")
    try:
        places = pd.read_csv(places_filepath)
    except FileNotFoundError:
        # Narrowed from bare Exception: the message only describes the
        # missing-file case; other read errors should surface normally.
        print("!!! ERROR: PLACES.csv could not be found !!!")
        print("(Check that the directory of the file matches the one specified above)\n")
        exit(1)  # non-zero status so callers can detect the failure
    # Narrowing PLACES data to focus on depression outcomes:
    # age-adjusted prevalence of depression among adults (>=18).
    places_condition = ((places["Category"] == "Health Outcomes")
                        & (places["Measure"] == "Depression among adults aged >=18 years")
                        & (places["Data_Value_Type"] == "Age-adjusted prevalence"))
    depression_data = places.loc[places_condition]
    # Parsing geolocation data (latitude / longitude) from PLACES — unused for now.
    # geolocation = depression_data["Geolocation"]
    # geolocation_lat = pd.Series([extract_lat(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_long = pd.Series([extract_long(point) if isinstance(point, str) else None for point in geolocation], copy=False)
    # geolocation_value = pd.concat([geolocation_lat, geolocation_long], axis=1)
    # Extracting depression values and reindexing from 0 so the later concat
    # with the freshly-indexed "CountyState" Series aligns row-by-row.
    depression_value = pd.DataFrame(depression_data["Data_Value"]).reset_index(drop=True)

    # ---------- PT. 2: SVI ---------- #
    print("Setting up SVI...")
    # Reading Social Vulnerability Index (SVI) dataset
    social_vulnerability_filepath = os.path.join(DIRECTORY, "SVIUSCounty.csv")
    try:
        svi = pd.read_csv(social_vulnerability_filepath)
    except FileNotFoundError:
        print("!!! ERROR: SVIUSCounty.csv could not be found !!!")
        print("(Check that the directory of the file matches the one specified above)\n")
        exit(1)
    # Keep only the explanatory-variable columns used as model features.
    svi_values = svi[EXPLANATORY_VARIABLES]

    # ---------- PT. 3: Joining Data ---------- #
    print("Joining data...")
    # Pairing SVI results with associated counties (as specified in "LOCATION" col)
    svi_with_counties = pd.concat([svi["LOCATION"], svi_values], axis=1)
    # Manipulating depression location data to fit format "{COUNTY_NAME} County, {STATE_NAME}"
    # This is necessary because this is the same format that is used in "LOCATION"
    # The new column is called "CountyState"
    depression_county_state = pd.Series([county_state_name(str(c), str(s)) for c, s in
                                         zip(depression_data["LocationName"], depression_data["StateDesc"])],
                                        name="CountyState", copy=False)
    # Pairing depression results with associated counties (via "CountyState" column)
    depression_with_counties = pd.concat([depression_county_state, depression_value], axis=1)
    # Joining SVI and depression data by comparing "LOCATION" to the new "CountyState" column.
    # Note: how="inner" ensures that only rows in which the counties match are merged.
    svi_depression_merged = pd.merge(svi_with_counties, depression_with_counties, how="inner",
                                     left_on="LOCATION", right_on="CountyState").drop("CountyState", axis=1)

    # ---------- PT. 4: Building Random Forest ---------- #
    print("Building Random Forest...")
    # Predictors (X) are the SVI features; outcome (y) is depression prevalence.
    predictors = svi_depression_merged.drop(["LOCATION", "Data_Value"], axis=1)
    outcome = svi_depression_merged["Data_Value"]
    X = predictors
    y = outcome
    from sklearn.model_selection import train_test_split
    # 70/30 train/test split; fixed random_state for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    from sklearn.ensemble import RandomForestRegressor
    # Regressor (not classifier) because depression prevalence is continuous.
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    # Predict continuous values on the held-out test set.
    y_pred = model.predict(X_test)

    # ---------- PT. 5: Assessing Random Forest ---------- #
    print("Assessing random forest...\n")
    # model.score() on a regressor reports R^2 on the test set.
    print("Model Score: ", model.score(X_test, y_test))
    # Assess model performance via Mean Squared Error (MSE)
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse, "\n")
    # Importance scores (mean decrease in impurity) per explanatory variable,
    # with inter-tree standard deviation for the plot's error bars.
    importances = model.feature_importances_
    forest_importances = pd.Series(importances, index=EXPLANATORY_VARIABLES)
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
    # Printing importance scores
    for feature, score in forest_importances.items():
        print(f"Feature : {feature}, Score : {round(score*100, 3)}%")
    print("\nWould you like a graph of the above data?")
    ans = ""
    # Only accept an explicit 0/1 answer. (The previous isdigit() check
    # accepted any digit string, e.g. "7", and silently skipped the graph.)
    while ans not in ("0", "1"):
        ans = input("Enter 0 for No and 1 for Yes: ")
    if int(ans) == 1:
        # Setting up plot via Matplotlib
        fig, ax = plt.subplots()
        forest_importances.plot.bar(yerr=std, ax=ax)
        ax.set_title("Feature importances using MDI")
        ax.set_ylabel("Mean decrease in impurity")
        fig.tight_layout()
        # Show graph
        plt.show()