From c843d7005f0253d23669101ae6bc5551f3eb0298 Mon Sep 17 00:00:00 2001 From: Shaheer Siddique Date: Mon, 19 Jul 2021 21:07:07 -0400 Subject: [PATCH] Add files via upload --- COVID-19 data analytics project.py | 511 +++++++++++++++++++++++++++++ 1 file changed, 511 insertions(+) create mode 100644 COVID-19 data analytics project.py diff --git a/COVID-19 data analytics project.py b/COVID-19 data analytics project.py new file mode 100644 index 0000000..92ce7f6 --- /dev/null +++ b/COVID-19 data analytics project.py @@ -0,0 +1,511 @@ +# -*- coding: utf-8 -*- +"""Copy of cse1010_covid19_assignment.ipynb + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/10ehuisByyfzhoXMEwep0i5LIcd4__C5r + +# Tracking the COVID-19 Epidemic +## CSE-1010 Data Science Homework + +### Introduction + +[Coronavirus disease 2019 (or COVID-19)](https://www.cdc.gov/coronavirus/2019-ncov/downloads/2019-ncov-factsheet.pdf) is a highly contagious respiratory disease caused by a novel coronavirus. It is rapidly spreading across the globe and is being tracked by a variety of sources. In this assignment, your task is to use your programming and data science knowledge to explore COVID-19 data and evaluate several hypotheses. + +### Data + +The data we will be using is the [Novel Corona Virus 2019 Dataset](https://github.com/beoutbreakprepared/nCoV2019/tree/master/latest_data). + +### Hypotheses + +* Incidence of COVID-19 differs by sex. + +* Are mortality rates correlated with age? + +* Are reported cases of COVID-19 more prevalent in colder climates? + + +First, we install packages into the Google Colaboratory virtual machine. +""" + +# Commented out IPython magic to ensure Python compatibility. 
# install packages
# NOTE: the notebook shell magics below are not valid Python in a plain .py
# file, so they are commented out like the %matplotlib magic. Run them in a
# shell (or a Colab cell) before executing this script.
# !apt-get install libgeos-dev
# !pip install https://github.com/matplotlib/basemap/archive/master.zip
# !pip install pyproj==1.9.6
# %matplotlib inline

"""Next, we import the libraries that we will need for this analysis."""

# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import StringIO

# For improved table display in the notebook
from IPython.display import display

# for plotting on maps
import mpl_toolkits
from mpl_toolkits.basemap import Basemap

# set printing options for pandas
# (the fully qualified option name avoids an ambiguous-pattern error)
pd.set_option('display.max_rows', 2000)

"""The first step in our data science model in the week 9 lectures, is to collect or identify data. I have provided for you a direct link to the COVID-19 data on Google drive. We can access it by treating the file as a StringIO object and passing that into the pandas ```read_csv()``` function."""

data_url = requests.get('https://drive.google.com/uc?export=download&id=1pRnqYs1nuBEbwUJAuQApmyWETSYEVWDy', timeout=60)
# fail fast on an HTTP error instead of trying to parse an error page as CSV
data_url.raise_for_status()
csv_raw = StringIO(data_url.text)
covid19_data = pd.read_csv(csv_raw, low_memory=False)

"""We first describe how to extract elements from a pandas data frame."""

# You can extract values from a data frame in many different ways.
# To retrieve a column we can use df[colname]
print(covid19_data['country'].head(n=3))  # here we use head simply to suppress the large amount of output

# We can also use this syntax
print(covid19_data.country.head(n=3))

# or pass a list to get multiple columns
print(covid19_data[['country', 'province']].head(n=3))

# .loc() can be used to set a range of rows and/or columns (by name)
print(covid19_data.loc[10:15, ['country', 'province']])

# .iloc() can be used to set a range of rows and/or columns (by index)
print(covid19_data.iloc[10:15, 0:4])

"""## Exploratory Data Analysis

After we've loaded in the data, the second step in our data science model is to clean and prepare the data. But, in order to do that, we should try to understand it first.

**Problem 1)**

* Write a function named ```rows_and_columns``` that takes in a pandas data frame and returns the string:

    The data has X rows and Y columns.

  where X is the number of rows and Y is the number of columns. For example, if the data frame has 100 rows and 10 columns, the function should return the string:

    The data has 100 rows and 10 columns.

* Write a function named ```get_min_max``` that takes in a pandas data frame and a column name as a string, and returns the minimum and maximum value of that column in a tuple

* Write a function named ```odd_get_min_max``` that takes in a pandas data frame and a column name as a string, and returns the minimum and maximum values for the odd rows and that column in a tuple

"""


# Problem 1) write your first function here
def rows_and_columns(df):
    """Return the sentence describing the number of rows and columns of df."""
    n_rows, n_cols = df.shape
    return "The data has {} rows and {} columns.".format(n_rows, n_cols)


# Problem 1) write your second function here
def get_min_max(df, colname):
    """Return (minimum, maximum) of the column colname of df."""
    column = df[colname]
    return (column.min(), column.max())


# Problem 1) write your third function here
def odd_get_min_max(df, colname):
    """Return (minimum, maximum) of colname over rows 1, 3, 5, ... of df.

    Rows are selected by position, so the result does not depend on the
    index labels (for the default integer index this is the same selection
    as labels 1, 3, 5, ...).
    """
    odd_rows = df[colname].iloc[1::2]
    return (odd_rows.min(), odd_rows.max())


"""And we can test our functions!"""

# here we call your functions
print(rows_and_columns(covid19_data))

print(get_min_max(covid19_data, 'latitude'))

print(odd_get_min_max(covid19_data, 'latitude'))

"""To get a sense of the data, let's view the column names and a sample of the data."""

print(covid19_data.columns)
print(covid19_data.head())

"""The data contain information on where the individual was diagnosed, how old they are, when they were diagnosed, and various other information. But simply looking at the column names and first few rows does not give us too much information. The dataframe ```info()``` function is a great way to get a summary of the input data."""

covid19_data.info()

"""```info()``` shows us that most columns have significantly high levels of missing data. Typically, columns with high levels of missing data are removed or imputed. Here, we will ignore the missing data. The ```describe()``` function is more useful when you have numerical data, but it still provides useful information on how our data are distributed."""

# a bare expression is only rendered inside a notebook; display() also works in a script
display(covid19_data.describe(include="all"))

"""In the absence of documentation, the values in the data allow us to interpret the data columns.

#### Data Cleaning and Wrangling

The data are messy. Various parties have contributed to the dataset without following a consistent formatting for the columns. If we are interested in questions about age, for example, we need to clean the age column. First, let's visualize the age column data by counting the unique fields.

**Problem 2)** Write a function named "get_uniq" that takes in a pandas data frame and a column name, and returns a numpy ndarray containing the unique values in that column.

*Hint: use the DataSeries.unique() function: https://pandas.pydata.org/pandas-docs/stable/reference/series.html*

"""


# Problem 2) write your function here
def get_uniq(df, colname):
    """Return the unique values of the column colname of df."""
    return df[colname].unique()


"""Let's use your function to print out the unique elements in the age column."""

print(get_uniq(covid19_data, 'age'))

"""We can also compute the counts for each of the unique elements. Pandas gives us a handy function to do this: ```value_counts()```. By default, ```value_counts()``` ignores NaN values."""

print(covid19_data['age'].value_counts())

"""**Problem 3)** Define a function named "unique_nonNaN_cnt" that takes a pandas data frame, a column name as a string, and returns the sum of unique non-NaN values."""


# Problem 3) write your function here
def unique_nonNaN_cnt(df, colname):
    """Return the number of non-NaN entries in the column colname of df.

    This equals the sum of the per-value counts from value_counts().
    """
    return df[colname].count()


"""and test our function..."""

print("Total of " + str(unique_nonNaN_cnt(covid19_data, 'age')) + " non-NaN age entries.")

"""It's clear that the individuals entering the data were not following the same standard or format! We will need to clean this data before we can use it.

There is a large amount of missing data, and a large variety of entries. We should clean the age columns. Let's convert the ages to age ranges for plotting. For the existing ranges in the data, let's consider the mean age.
"""

# cleaning the age column
# We observe that the age column does not follow a nice format

# defining the age ranges: [0,10), [10,20), ..., [90,100)
age_ranges = [(age, age + 10) for age in range(0, 100, 10)]
print("Considering age ranges", age_ranges)


# helper function that takes in an numerical age, and a list of ranges and
# returns the particular range that the age falls into
def findRange(age, arange):
    """Return the label "lo-hi" of the half-open range [lo, hi) containing age.

    Returns None when age falls in none of the ranges in arange; pandas
    treats that as a missing value.
    """
    for lower, upper in arange:
        if lower <= age < upper:
            return str(lower) + "-" + str(upper)
    return None


# a function that will fix our age entries
def fixAge(age):
    """Map one raw entry of the age column to an age-range label.

    Handles plain numbers ("34"), ages in weeks ("8 weeks"), closed ranges
    ("20-29", binned by their mean) and open ranges ("80-", binned by the
    single bound). Missing values map to NaN.

    Raises:
        ValueError: if the entry cannot be interpreted as an age.
    """
    if isinstance(age, str):  # check if the age is a string
        divisor = 1.0
        if 'weeks' in age:
            age = age.replace('weeks', '')
            divisor = 52.0  # convert weeks to years
        if '-' in age:  # if the string has a hyphen (is a range)
            # drop empty pieces so that poorly formatted ranges such as "80-"
            # are reduced to their single bound
            parts = [part for part in age.split("-") if part.strip()]
            if len(parts) == 1:  # if the range was poorly formatted...
                return findRange(float(parts[0]) / divisor, age_ranges)
            elif len(parts) == 2:  # if the range was properly formatted, findRange of the mean
                mean_age = np.mean([float(parts[0]), float(parts[1])])
                return findRange(mean_age / divisor, age_ranges)
            else:
                raise ValueError("Age " + str(age) + " not correctly handled.")
        return findRange(float(age) / divisor, age_ranges)
    if age is None:
        return np.nan
    if isinstance(age, (int, float, np.number)):
        if np.isnan(age):
            return np.nan
        return findRange(float(age), age_ranges)
    raise ValueError("Age " + str(age) + " not correctly handled.")


# create a new column that holds the new age ranges
# this code will run the fixAge function for each entry of the age column and
# store the result in the newly created 'age_range' column
covid19_data['age_range'] = covid19_data['age'].apply(fixAge)

"""The ```apply()``` function is very important: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html. It allows us to apply a function across some data frame axis (e.g. for each row, or each column) or to each element of a single column. It's often combined with the ```lambda``` keyword. ```lambda``` defines an anonymous function (a function without a name). You can use these wherever functions are required, and they allow you to define the functionality in a single expression. Let's do some quality control to verify we didn't make any mistakes.

"""

print("the total number of rows with non-NaN ages is " + str(covid19_data['age'].count()))
print("the total number of rows with non-NaN age_ranges is " + str(covid19_data['age_range'].count()))

"""The number of non-NaN age_ranges is 1 less, and if we inspect the original data, we can find that one age did not fall in our age ranges of 0-100. This age was 121 and is likely a mistake in entry since the oldest person currently alive is 117 years old, so we can safely discard this outlier.

The next step in our data science model is exploratory data analysis. Let's visualize the COVID-19 cases by age group. There are a number of plots we can use here; a bar plot is a good choice.
"""

# distribution of cases by age
age_range_labels = [str(x[0]) + "-" + str(x[1]) for x in age_ranges]
# reindex keeps every age bin (with a zero count) even if it has no cases
counts = covid19_data.age_range.value_counts().reindex(age_range_labels, fill_value=0)

# create plot
fig, ax = plt.subplots(figsize=(20, 10))
index = np.arange(len(age_ranges))
bar_width = 0.35
opacity = 0.8

# docs are here: https://matplotlib.org/3.2.1/api/_as_gen/matplotlib.pyplot.bar.html
rects1 = plt.bar(index, counts, bar_width, alpha=opacity, color='b', label='Cases')

plt.xlabel('Age Range')
plt.ylabel('Count')
plt.title('Corona Cases per Age Group')
plt.xticks(index, ["[" + str(x[0]) + "," + str(x[1]) + ")" for x in age_ranges])
plt.legend()

plt.tight_layout()

"""Let's address the **incidence of COVID-19 differs by sex** hypothesis we made above. We can make a bar plot stratified by sex.

**Problem 4)** Fill in the relevant prompts below to create the bar plot of COVID-19 cases by sex.

As a hint, we can select a subset of rows based on the value in a column with the syntax:

```dataframe[dataframe[colname]==value]```

where dataframe is a pandas data frame, colname is the column name, and value is some value for the colname. You can use other logical comparisons as well, e.g., to get all rows with latitude > 0, we can use the syntax:

```covid19_data[covid19_data.latitude>0]```



"""


# distribution of cases by age and sex
# Problem 4) Complete where we have indicated below
def create_bar_plot_by_sex(covid19_data, age_ranges):
    """Draw a grouped bar chart of case counts per age range, split by sex.

    Args:
        covid19_data: data frame with 'sex' and 'age_range' columns.
        age_ranges: list of (lower, upper) tuples defining the age bins.

    Returns:
        (counts_female, counts_male): per-age-range case counts, ordered
        like age_ranges.
    """
    age_range_labels = [str(x[0]) + "-" + str(x[1]) for x in age_ranges]

    # from the covid19_data, select the age_range for female rows
    female_age_ranges = covid19_data[covid19_data['sex'] == 'female'].age_range
    counts_female = female_age_ranges.value_counts().reindex(age_range_labels, fill_value=0)

    # from the covid19_data, select the age_range for male rows
    male_age_ranges = covid19_data[covid19_data['sex'] == 'male'].age_range
    counts_male = male_age_ranges.value_counts().reindex(age_range_labels, fill_value=0)

    # create plot
    fig, ax = plt.subplots(figsize=(20, 10))
    index = np.arange(len(age_ranges))
    bar_width = 0.35
    opacity = 0.8

    # the bar function draws a bar plot, the first two arguments are the x position of the bar, and its height
    plt.bar(index, counts_male, bar_width,
            alpha=opacity, color='b', label='Male')

    # the female bars are shifted by one bar width so they sit next to the male bars
    plt.bar(index + bar_width, counts_female, bar_width,
            alpha=opacity, color='g', label='Female')

    plt.xlabel('Age Range')
    plt.ylabel('Count')
    plt.title('Corona Cases per Age Group')
    # centre each tick between the male and the female bar of its group
    plt.xticks(index + bar_width / 2, ["[" + str(x[0]) + "," + str(x[1]) + ")" for x in age_ranges])
    plt.legend()

    plt.tight_layout()
    return counts_female, counts_male


"""Now let's run our visualization."""

cnts_f, cnts_m = create_bar_plot_by_sex(covid19_data, age_ranges)

"""Let's view the COVID-19 cases by country. The relevant column name is "country"."""


def _plot_country_counts(counts):
    """Draw a bar chart of counts (a Series of case counts indexed by country)."""
    fig, ax = plt.subplots(figsize=(20, 10))
    index = np.arange(len(counts))
    bar_width = 0.35
    opacity = 0.8

    plt.bar(index, counts, bar_width, alpha=opacity, color='b', label='Cases')

    plt.xlabel('Country')
    plt.ylabel('Count')
    plt.title('Corona Cases per Country')
    plt.xticks(index, counts.index.values)  # label the bars with the country names
    plt.legend()

    plt.tight_layout()


# distribution of cases by country
def create_bar_plot_by_country(covid19_data):
    """Plot the number of cases for every country.

    Returns:
        (n_groups, counts): number of countries and their case counts.
    """
    counts = covid19_data.country.value_counts()
    n_groups = len(counts)
    _plot_country_counts(counts)
    return n_groups, counts


ngrps, cnts = create_bar_plot_by_country(covid19_data)

"""This is difficult to interpret due to the squashed x-axis.

**Problem 5)** Print the same bar plot by country, but limit the plot to countries that have >1000 cases.
"""


# distribution of cases by country with >1000 cases
# Problem 5) Complete where we have indicated below
# NOTE: this deliberately supersedes the unfiltered definition above, mirroring
# the order of the notebook cells.
def create_bar_plot_by_country(covid19_data, min_cases=1000):
    """Plot the number of cases for countries with more than min_cases cases.

    Args:
        covid19_data: data frame with a 'country' column.
        min_cases: only countries with strictly more cases are plotted.

    Returns:
        (n_groups, counts): number of plotted countries and their case counts.
    """
    country_cnts = covid19_data.country.value_counts()
    # get the counts for countries with >min_cases cases
    counts = country_cnts[country_cnts > min_cases]
    # get number of countries with >min_cases cases
    n_groups = len(counts)
    _plot_country_counts(counts)
    return n_groups, counts


"""Now let's run our visualization."""

ngrps, cnts = create_bar_plot_by_country(covid19_data)

"""Perhaps a more interesting visualization is to view the COVID-19 cases on the world map. To do this, we will make use of another very useful data frame function: ```groupby()```. The ```groupby()``` method groups our data rows by specific columns or column values. Here we group our data by their rounded latitude, longitude, and country. We then count the IDs which gives us the number of cases in a latitude-longitude region."""

map_intensities = covid19_data.groupby([covid19_data.latitude.round(1),
                                        covid19_data.longitude.round(1),
                                        covid19_data.country]).ID.count().reset_index()

"""Now let's plot the cases across the globe using matplotlib Basemaps. Try to play around with the various parameters to get an idea of what they all do!"""

# set the colors for countries
map_intensities['labels_enc'] = pd.factorize(map_intensities['country'])[0]

plt.figure(figsize=(12, 12))

# Make the background map
m = Basemap(llcrnrlon=-180, llcrnrlat=-65, urcrnrlon=180, urcrnrlat=80)
m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)
m.fillcontinents(color='grey', alpha=0.4)
m.drawcoastlines(linewidth=0.6, color="white")

# marker size scales with the number of cases, colour encodes the country
m.scatter(map_intensities['longitude'], map_intensities['latitude'],
          s=map_intensities['ID'] / 10, alpha=0.4,
          c=map_intensities['labels_enc'], cmap="Set1")

"""We see hotspots in the northeast USA (centered around NYC), the Iberian Peninsula, the U.K., China, Iran, and South Korea. Given recent news articles, we also notice that there is some underreporting in our data. We do not have the statistical language to formulate our questions as formal statistical hypotheses, but we can produce plots that support a particular interpretation of each hypothesis. We consider our first hypothesis:

**Are mortality rates correlated with age?**

First we need to clean and subset our data. We count the outcomes.
"""

print(covid19_data['outcome'].value_counts())

"""From these results, we can see that there really is not enough data to draw definitive statistically robust conclusions. We can still do our analysis in anticipation that the data will be updated as the disease progresses. Let's group our outcomes into positive and negative classes."""

pos = ['discharge', 'stable', 'discharged', 'recovered', 'stable condition', 'Alive', 'Stable', 'released from quarantine', 'Recovered', 'Discharged from hospital', 'Discharged']
neg = ['died', 'death', 'Dead', 'severe', 'critical condition, intubated as of 14.02.2020', 'dead', 'Death', 'Deceased', 'severe illness', 'unstable', 'Died', 'Critical condition']


def setOutcomeClass(outcome):
    """Return 1 for a positive outcome, 0 for a negative one, NaN otherwise."""
    if outcome in pos:
        return 1
    if outcome in neg:
        return 0
    return np.nan


covid19_data['outcome_class'] = covid19_data['outcome'].apply(setOutcomeClass)

r"""Now let's create a different type of plot, a line graph. Here we want to visualize patterns of case severity across age groups, but each age group can have a different number of samples. Therefore, we compute an empirical probability of a positive outcome but also include $\pm$ 1 standard deviation. We also include Spearman's correlation on the plot."""

# subset the data by age range and outcome class, then group by age range,
# and use the agg (aggregate) function to compute the mean, count, and
# standard deviation by age group
outcomes_per_age = covid19_data[['age_range', 'outcome_class']].groupby(['age_range']).agg(['mean', 'count', 'std']).reset_index()
x = outcomes_per_age.age_range
y = outcomes_per_age.outcome_class['mean']
error = outcomes_per_age.outcome_class['std']

fig, ax = plt.subplots(figsize=(20, 10))
ax.errorbar(x, y, yerr=error, fmt='-o')
plt.ylabel('Relative Frequency', fontsize=14)
plt.xlabel('Age Group', fontsize=14)

# Spearman's correlation needs orderable numeric data: use the lower bound of
# each age range as the age and the 0/1 outcome class as the outcome (the raw
# 'age_range' and 'outcome' columns are text labels).
age_lower_bound = covid19_data['age_range'].str.split('-').str[0].astype(float)
spearman_rho = age_lower_bound.corr(covid19_data['outcome_class'], method='spearman')
fig.text(0.2, 0.2, "spearman correlation = " + str(spearman_rho), fontsize=14)

"""The visualization shows that the older the individual, the more likely the negative outcome. Spearman's negative correlation confirms this.

**Problem 6)** Professor Derek is worried about outcomes over time for his age bracket (30-40). He wants you to plot the relative frequency of positive outcomes (y-axis) over time (x-axis) while also including 1 standard deviation above and below each point. You should not compute Spearman's correlation here. Fill in the function below.

"""


# Problem 6) Complete where we have indicated below
def create_bar_plot_for_derek(covid19_data, age_range="30-40"):
    """Plot the relative frequency of positive outcomes over time for one age bracket.

    Args:
        covid19_data: data frame with 'age_range', 'date_confirmation' and
            'outcome_class' columns.
        age_range: label of the age bracket to plot.

    Returns:
        (x, y, error): confirmation dates, mean outcome class per date and
        its standard deviation per date.
    """
    # first we subset the data by the appropriate age bracket and do a bit of cleaning
    prof_age_data = covid19_data[covid19_data.age_range == age_range]
    # a date range cannot be parsed as a date; keep its first day
    # (replace returns a new frame, so the caller's data is left untouched)
    prof_age_data = prof_age_data.replace(to_replace='25.02.2020 - 26.02.2020', value='25.02.2020')

    # and we convert the column to a date-time
    prof_age_data['date_confirmation'] = pd.to_datetime(prof_age_data['date_confirmation'], dayfirst=True)

    outcomes_per_age = prof_age_data[['date_confirmation', 'outcome_class']].groupby(['date_confirmation']).agg(['mean', 'count', 'std']).reset_index()

    outcomes_per_age = outcomes_per_age.dropna()  # we should drop the rows with missing values

    x = outcomes_per_age.date_confirmation
    y = outcomes_per_age.outcome_class['mean']
    error = outcomes_per_age.outcome_class['std']

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.errorbar(x, y, yerr=error, fmt='-o')
    plt.ylabel('Relative Frequency', fontsize=14)
    plt.xlabel('Date', fontsize=14)
    return x, y, error


"""Now let's run our visualization."""

x, y, error = create_bar_plot_for_derek(covid19_data)

"""Let's move on to our last hypothesis.

**Are reported cases of COVID-19 more prevalent in colder climates?**

Here we use the absolute value of the latitude as a proxy for temperature.
"""

# latitude data ranges from -90 (south pole) to 90 (north pole)
print(covid19_data['latitude'].describe())

fig, ax = plt.subplots(figsize=(20, 10))

num_bins = 90

# the histogram of the data
ax.hist(abs(covid19_data['latitude']), num_bins, density=True, alpha=0.3)
plt.ylabel('Density', fontsize=14)
plt.xlabel('Absolute Latitude Degree', fontsize=14)

"""This is an interesting plot, but what if most people in the world live between latitudes 20 and 60? We would expect there to be more infections here. Let's also plot the density of individuals across the globe at each latitude."""

# I downloaded and prepared the data for you.
population_data_url = requests.get('https://drive.google.com/uc?export=download&id=19BjvYrh_MkzE2NMJBOSJzJUaXaw3S85X', timeout=60)
population_data_url.raise_for_status()
population_csv = StringIO(population_data_url.text)
population_data = pd.read_csv(population_csv, delimiter=" ", header=None)
abs_latitude = np.linspace(0, 90, 360)

# population data goes from 90 degrees to -90 degrees in increments of 0.25 degrees
lat_sums = population_data.sum(axis=1)
# sum every 4 consecutive quarter-degree rows into one whole degree
lat_by_degree = lat_sums.groupby(np.arange(len(lat_sums)) // 4).sum()
# fold the southern hemisphere onto the northern one: group k combines the k-th
# degree band from the north pole with the k-th degree band from the south pole
half = len(lat_by_degree) // 2
fold_keys = np.concatenate((np.arange(half), np.arange(half)[::-1]))
population_sums = lat_by_degree.groupby(fold_keys).sum()

fig, ax = plt.subplots(figsize=(20, 10))

num_bins = 90

# the histogram of the data
ax.hist(abs(covid19_data['latitude']), num_bins, density=True, alpha=0.3, label='Reported cases')
# group 0 of population_sums is the band next to the poles, hence the reversed range
ax.hist(range(num_bins)[::-1], bins=num_bins, density=True, weights=population_sums, alpha=0.3, label='World population')
plt.ylabel('Density', fontsize=14)
plt.xlabel('Absolute Latitude Degree', fontsize=14)
ax.legend()

"""We see that there are a considerable number of people who live close to the equator (latitude=0) so infections indeed are more prevalent in colder regions. Note that there are other factors in play here, such as, where the disease began and travel patterns."""