JianingShenSource.R

#Packages used in this project
library(openxlsx)
library(vcd)
library(Hmisc)



#1. RAW DATA TIDYING UP AND SELECTION FOR POSSIBLE FACTORS BY SUBSETTING

#Tidy up the raw date, delete all blank variables or variables that seems to be  irrelevant to the suicide
#Extract all suicide data for quicker processing the suicide data

mort19 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort19.csv")
mort18 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort18.csv")
mort17 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort17.csv")
mort16 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort16.csv")
mort15 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort15.csv")
mort14 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort14.csv")

#Delete unnecessary and blank  variables

mort19 <- mort19[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort18 <- mort18[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort17 <- mort17[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort16 <- mort16[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort15 <- mort15[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort14 <- mort14[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]

#Select all the suicide cases
S19 <- mort19[which(mort19$Manner_of_Death == 2),] 
S19 <- S19[,-7]
S18 <- mort18[which(mort18$Manner_of_Death == 2),] 
S18 <- S18[,-7]
S17 <- mort17[which(mort17$Manner_of_Death == 2),] 
S17 <- S17[,-7]
S16 <- mort16[which(mort16$Manner_of_Death == 2),] 
S16 <- S16[,-7]
S15 <- mort15[which(mort15$Manner_of_Death == 2),] 
S15 <- S15[,-7]
S14 <- mort14[which(mort14$Manner_of_Death == 2),] 
S14 <- S14[,-7]

#File back-up
write.csv(mort19, file = "D:/FinalProject/DATA/Analysis/Death/mort19.csv")
write.csv(mort18, file = "D:/FinalProject/DATA/Analysis/Death/mort18.csv")
write.csv(mort17, file = "D:/FinalProject/DATA/Analysis/Death/mort17.csv")
write.csv(mort16, file = "D:/FinalProject/DATA/Analysis/Death/mort16.csv")
write.csv(mort15, file = "D:/FinalProject/DATA/Analysis/Death/mort15.csv")
write.csv(mort14, file = "D:/FinalProject/DATA/Analysis/Death/mort14.csv")

write.csv(S19, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort19.csv")
write.csv(S18, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort18.csv")
write.csv(S17, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort17.csv")
write.csv(S16, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort16.csv")
write.csv(S15, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort15.csv")
write.csv(S14, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort14.csv")



#2. IMPORT THE DEATH & SUICIDE FILES 

#Import the death data
mort19 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort19.csv")
mort18 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort18.csv")
mort17 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort17.csv")
mort16 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort16.csv")
mort15 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort15.csv")
mort14 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort14.csv")

#Import the suicide data
S19 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort19.csv")
S18 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort18.csv")
S17 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort17.csv")
S16 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort16.csv")
S15 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort15.csv")
S14 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort14.csv")

#In "Manner_of_Death" column, "0" stands for non-suicide cases, while "1" stands for suicide cases
mort14[which(mort14$Manner_of_Death != 2),8] <- 0
mort14[which(mort14$Manner_of_Death == 2),8] <- 1
mort15[which(mort15$Manner_of_Death != 2),8] <- 0
mort15[which(mort15$Manner_of_Death == 2),8] <- 1
mort16[which(mort16$Manner_of_Death != 2),8] <- 0
mort16[which(mort16$Manner_of_Death == 2),8] <- 1
mort17[which(mort17$Manner_of_Death != 2),8] <- 0
mort17[which(mort17$Manner_of_Death == 2),8] <- 1
mort18[which(mort18$Manner_of_Death != 2),8] <- 0
mort18[which(mort18$Manner_of_Death == 2),8] <- 1
mort19[which(mort19$Manner_of_Death != 2),8] <- 0
mort19[which(mort19$Manner_of_Death == 2),8] <- 1

#Notify the year
mort14$Year <- 2014
mort15$Year <- 2015
mort16$Year <- 2016
mort17$Year <- 2017
mort18$Year <- 2018
mort19$Year <- 2019

S14$Year <- 2014
S15$Year <- 2015
S16$Year <- 2016
S17$Year <- 2017
S18$Year <- 2018
S19$Year <- 2019



#3. The data analysis

#Pull together the data needed 

#Table of death and suicide number every year
year <- c(2014:2019)
Suicide <- c(nrow(S14),nrow(S15),nrow(S16),nrow(S17),nrow(S18),nrow(S19))
Death <- c(nrow(mort14),nrow(mort15),nrow(mort16),nrow(mort17),nrow(mort18),nrow(mort19))
Everyyear <- data.frame(year,Death,Suicide)
Everyyear$SuicidePercentage <- Suicide/Death*100
Everyyear$SuicidePercentage <- format(Everyyear$SuicidePercentage,digits = 3)

#Table of all suicide cases
AllSuicide <- rbind(S14,S15,S16,S17,S18,S19)

#Table of all death cases
AllDeath <- rbind(mort14,mort15,mort16,mort17,mort18,mort19)


#3.1 Change of the suicide number and percentages among overall death number over years

#3,1,1 Descriptive statistics of the suicide changes over the years

#Generate the plot
opar <- par(no.readonly = TRUE)
par(mfrow = c(1,2))
attach(Everyyear)
plot(Year,Suicide,type = "b",main = "Number of suicide cases from 2014-2019",
     xlab = "Year", ylab = "Number of suicide case",xlim = c(2013,2020),ylim = c(42000,50000),
     pch = 15, lty = 5)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)

plot(Year,SuicidePercentage,type = "b",main = "Percentages of suicide cases from 2014-2019",
     xlab = "Year", ylab = "Percentage of suicide case",xlim = c(2013,2020),ylim = c(1.6,1.75),
     pch = 16, lty = 3)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
detach(Everyyear)
par(opar)
#ConClusion: Both suicide number and suicide percentage has a raising trend over the years

#3.1.2 Chi-Square test for suicide-year independence

AllDeath$Manner_of_Death <- as.factor(AllDeath$Manner_of_Death)
AllDeath$Year <- as.factor(AllDeath$Year)
Year <- table(AllDeath$Year,AllDeath$Manner_of_Death)
chisq.test(Year)
#Result:
#Pearson's Chi-squared test

#data:  Year
#X-squared = 56.786, df = 5, p-value = 5.597e-11
#Conclusion: The suicide status changes over the years


#3.2 Effect of gender on suicide

#3.2.1 Descriptive plot of gender distribution on suicide

#Generate sex table for grouped bar plot all the years
AllSuicide$Sex <- as.factor(AllSuicide$Sex)
AllSuicide$Year <- as.factor(AllSuicide$Year)

SSex <- table(AllSuicide$Sex,AllSuicide$Year)
Sex <- table(AllDeath$Sex,AllDeath$Manner_of_Death)

#check the summed suicide number for both gender
margin.table(SSex,1) 

#Pick 2 colors
Gray <- gray(0:15/15)
pie(rep(1:15),labels = Gray,col = Gray)

#Form a grouped bar plot
par(mfrow = c(1,2))
barplot(SSex, main = "Suicide distribution on gender in 2009-2019",ylim = c(0,50000),
        xlab = "Year",ylab = "Number of suicide",col = c("#000000","#EEEEEE"),beside = TRUE)
legend("topleft",inset = .05,title = "Gender",c("F","M"),fill = c("#000000","#EEEEEE"),cex = 0.8)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)

barplot(c(61699,214782), main = "Overall distribution on gender",
        xlab = "Gender",ylab = "Number of suicide",col = c("#000000","#EEEEEE"))
legend("topleft",inset = .05,title = "Gender",c("F","M"),fill = c("#000000","#EEEEEE"),cex = 0.8)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)


#3.2.2 Using Chi-square test to make sure there's significant difference on suicide number between male and female
chisq.test(c(61699,214782))
#Result:
#Pearson's Chi-squared test
#
#data:  c(61699, 214782)
#X-squared = 84760, df = 1, p-value < 2.2e-16
#Conclusion: There's significant difference on suicide number between male and female

#3.2.3 Chi-Square test for suicide-gender independence
Sex <- table(AllDeath$Sex,AllDeath$Manner_of_Death)
chisq.test(Sex)
#Result:
#Pearson's Chi-squared test with Yates' continuity correction

#data:  Sex
#X-squared = 77774, df = 1, p-value < 2.2e-16
#Conclusion: There's dependence between gender and suicide


#3.3 Effect of education on suicide

#3.3.1 Descriptive statistics of education level on suicide

#Make a new rank on education level, old rank and my new rank is down below
# Old rank
#1... 8th grade or less
#2... 9 - 12th grade, no diploma
#3... high school graduate or GED completed
#4... some college credit, but no degree
#5... Associate degree
#6... Bachelor¡¯s degree
#7... Master¡¯s degree
#8... Doctorate or professional degree

#New rank
#1... 8th grade or less
#2... 9th grade-high school graduate
#3... College experience - Bachelor¡¯s degree
#4... MS¡¯s degree or higher

#Adapt the datasets for all death and suicide case to new rank

AllDeath[which(AllDeath$Education2003 == 3),2] <- 2
AllDeath[which(AllDeath$Education2003 == 4),2] <- 3
AllDeath[which(AllDeath$Education2003 == 5),2] <- 3
AllDeath[which(AllDeath$Education2003 == 6),2] <- 3
AllDeath[which(AllDeath$Education2003 == 7),2] <- 4
AllDeath[which(AllDeath$Education2003 == 8),2] <- 4

AllSuicide[which(AllSuicide$Education2003 == 3),2] <- 2
AllSuicide[which(AllSuicide$Education2003 == 4),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 5),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 6),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 7),2] <- 4
AllSuicide[which(AllSuicide$Education2003 == 8),2] <- 4

#Due to the fact there's suicide cases with unknown education status, thus I'll have to delete those missing values
#Make copies of origin files
AllSuicide_Edu <- AllSuicide
AllSuicide_Edu <- AllSuicide_Edu[-which(AllSuicide_Edu$Education2003 == 9),]

AllDeath_Edu <- AllDeath
AllDeath_Edu <- AllDeath_Edu[-which(AllDeath_Edu$Education2003 == 9),]

#Generate the table
AllSuicide_Edu$Education2003 <- as.factor(AllSuicide_Edu$Education2003)
SEducation <- table(AllSuicide_Edu$Education2003,AllSuicide_Edu$Year)

#Pick colors
Color <- rainbow(4,s = 0.3, start = 0,end = 0.6)
pie(rep(1:4),labels = Color,col = Color)

#Generate the table for summed suicide number for each education level
DEducation <- margin.table(SEducation,1)


#Generate the descriptive statistics
par(mfrow = c(1,2))
barplot(SEducation, main = "Suicide distribution on education level in 2014-2019",ylim = c(0,40000),
        xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Education level",
       c("8th grade or less","9-12 grade no diploma","High school graduate or GED completed","College experience and higher degree"),
       fill = Color,cex = 0.75)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)

barplot(DEducation, main = "Overall distribution on education level",ylim = c(10000,250000),
        cex.names = 1,xlab = "Education level",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Education level",
       c("8th grad or less", "9th grade - high school graduate", "College experience - Bachelor's degreee","MS's degree or higher"),
       fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)

#3.3.2 Chi-Square test for suicide-education independence
Education <- table(AllDeath_Edu$Education2003,AllDeath_Edu$Manner_of_Death)
chisq.test(Education)
#Result:
#Chi-squared test for given probabilities

#data:  Education
#X-squared = 18461, df = 3, p-value < 2.2e-16
#Conclusion:There's dependence between education level and suicide

#3.4 Effect of month of death on suicide

#I divide the data into 4 quarters of year
#3.4.1 Descriptive statistics on death of month

#Generate table
AllDeath[which(AllDeath$Month_of_Death == 1),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 2),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 3),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 4),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 5),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 6),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 7),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 8),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 9),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 10),3] <- 4
AllDeath[which(AllDeath$Month_of_Death == 11),3] <- 4
AllDeath[which(AllDeath$Month_of_Death == 12),3] <- 4

AllSuicide[which(AllSuicide$Month_of_Death == 1),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 2),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 3),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 4),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 5),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 6),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 7),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 8),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 9),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 10),3] <- 4
AllSuicide[which(AllSuicide$Month_of_Death == 11),3] <- 4
AllSuicide[which(AllSuicide$Month_of_Death == 12),3] <- 4

SMonth <- table(AllSuicide$Month_of_Death,AllSuicide$Year)

DMonth <- margin.table(SMonth,1)

Month <- table(AllDeath$Month_of_Death,AllDeath$Manner_of_Death)

#Pick color for the plot
Color <- rainbow(4,s = 0.3, start = 0,end = 0.8)
pie(rep(1:4),labels = Color,col = Color)

#Generate plots
par(mfrow = c(1,2))
barplot(SMonth, main = "Suicide distribution on quarter in 2014-2019",ylim = c(0,20000),
        xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Quarter",
       c("1st Quarter","2nd Quarter","3rd Quarter","4th Quarter"),fill = Color,cex = 0.75)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)

barplot(DMonth, main = "Overall distribution on quarter",ylim = c(0,100000),
        cex.names = 1,xlab = "Quarter",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Quarter",c("1st Quarter","2nd Quarter","3rd Quarter","4th Quarter"),
       fill = Color,cex = 0.65)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)

#Observation:
#No obvious trend observed

#3.5 Effect of age on suicide

#3.5.1 Descriptive statistics of age distribution on suicide 

#Due to the fact that there were cases with unknown ages, and there's no suicide  so I have to make a subset of origin data 
#And I'll adapt a new rank for the data, I'll put both two ranks down below
#Because in the age rage below 15, the number of death is relative small 
#compare to the overall death number and suicide case is quite few, so in this study, I discuss only 
#cases older than 15 years

#Old rank
#1 ... Under 1 year (includes not stated infant ages)
#2 ... 1 - 4 years
#3 ... 5 - 14 years
#4 ... 15 - 24 years
#5 ... 25 - 34 years
#6 ... 35 - 44 years
#7 ... 45 - 54 years
#8 ... 55 - 64 years
#9 ... 65 - 74 years
#10 ... 75 - 84 years
#11 ... 85 years and over

#New rank
#1 ... 15 - 24 years
#2 ... 25 - 34 years
#3 ... 35 - 44 years
#4 ... 45 - 54 years
#5 ... 55 - 64 years
#6 ... 65 - 74 years
#7 ... Older than 75 years

#Adapt the origin data to new rank
AllDeath_Age <- AllDeath[-which(AllDeath$Age_Recode_12 == 12),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 1),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 2),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 3),]

AllSuicide_Age <- AllSuicide[-which(AllSuicide$Age_Recode_12 == 12),]
AllSuicide_Age <- AllSuicide_Age[-which(AllSuicide_Age$Age_Recode_12 == 3),]

AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 4),5] <- 1
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 5),5] <- 2
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 6),5] <- 3
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 7),5] <- 4
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 8),5] <- 5
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 9),5] <- 6
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 10),5] <- 7
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 11),5] <- 7

AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 4),5] <- 1
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 5),5] <- 2
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 6),5] <- 3
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 7),5] <- 4
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 8),5] <- 5
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 9),5] <- 6
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 10),5] <- 7
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 11),5] <- 7

Age <- table(AllDeath_Age$Age_Recode_12,AllDeath_Age$Manner_of_Death)
SAge <- table(AllSuicide_Age$Age_Recode_12,AllSuicide_Age$Year)
DAge <- margin.table(SAge,1)

#Pick colors
Color <- rainbow(7,s = 0.3, start = 0,end = 0.6)
pie(rep(1:7),labels = Color,col = Color)

#Generate plots
S15_24 <- SAge[1,]
S25_34 <- SAge[2,]
S35_44 <- SAge[3,]
S45_54 <- SAge[4,]
S55_64 <- SAge[5,]
S65_74 <- SAge[6,]
S75 <- SAge[7,]

par(mfrow = c(1,2))
barplot(DAge, main = "Overall distribution on Age",
        names.arg = c("15-24","25-34","35-44","45-54","55-64","65-74","75+"),cex.names = 1,
        xlab = "Age",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Age",c("24-","25-34","35-44","45-54","55-64","65-74","75+"),
       fill = Color,cex = 0.6)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)

plot(year,S15_24,type = "b",pch = 1,lty = 1,ylim = c(0,8500),main = "Suicide number change in 2014-2019",
     xlab = "Year",ylab = "Suicide number")
lines(year,S25_34,type = "b",lty = 1,pch = 2,col = "red")
lines(year,S35_44,type = "b",lty = 2,pch = 3,col = "blue")
lines(year,S45_54,type = "b",lty = 3,pch = 4,col = "green")
lines(year,S55_64,type = "b",lty = 4,pch = 5,col = "grey")
lines(year,S65_74,type = "b",lty = 5,pch = 6,col = "purple")
lines(year,S75,type = "b",lty = 6,pch = 7,col = "yellow")
legend("bottomright",inset = .05,title = "Age",c("24-","25-34","35-44","45-54","55-64","65-74","75+"),
       fill = c("black","red","blue","green","grey","purple","yellow"),cex = 0.6)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)

#3.5.2 Chi-square test for age-suicide independence 

chisq.test(Age)
#Result
#Pearson's Chi-squared test

#data:  Age
#X-squared = 892217, df = 6, p-value < 2.2e-16
#Conclusion: There's dependence between age and suicide

#3.6 Effect of marital status

#Descriptive statistics 
#Due to the fact that there's cases with unknown marital status, subsetting the origin datasets is needed

AllDeath_Marital <- AllDeath
AllDeath_Marital <- AllDeath_Marital[-which(AllDeath_Marital$Marital_Status == "U"),]

AllSuicide_Marital <- AllSuicide
AllSuicide_Marital <- AllSuicide_Marital[-which(AllSuicide_Marital$Marital_Status == "U"),]

SMarital <- table(AllSuicide_Marital$Marital_Status,AllSuicide_Marital$Year)

DMarital <- margin.table(SMarital,1)

Marital <- table(AllDeath_Marital$Marital_Status,AllDeath_Marital$Manner_of_Death)

#Pick colors
Color <- rainbow(4,s = 0.3, start = 0,end = 0.6)
pie(rep(1:4),labels = Color,col = Color)

#Generate plots
par(mfrow = c(1,2))
barplot(SMarital, main = "Suicide distribution on marital status in 2014-2019",ylim = c(0,30000),
        xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Marital status",
       c("Divorced","Married","Single","Widowed"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)

barplot(DMarital, main = "Overall distribution on marital status",names.arg = c("Divorced","Married","Single","Widowed"),
        ylim = c(0,150000),cex.names = 1,xlab = "Marital status",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Education level",
       c("Divorced","Married","Single","Widowed"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)

#3.6.2 Chi-square test for marital--suicide independence 
chisq.test(Marital)
#Result:
#Pearson's Chi-squared test

#data:  Marital
#X-squared = 181158, df = 3, p-value < 2.2e-16
#There's association between marital status and suicide

#3.7 Effect of Day of week on suicide

#I divide the data of every day of week into 2 categories-weekday, and weekend
#Generate a new dataset with no unknown day cases on day of week 

AllDeath_Day <- AllDeath
AllDeath_Day <- AllDeath_Day[-which(AllDeath_Day$Day_of_Week_of_Death == 9),]
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 2),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 3),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 4),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 5),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 6),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 7),7] <- "WE"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 1),7] <- "WE"

AllSuicide_Day <- AllSuicide
AllSuicide_Day <- AllSuicide_Day[-which(AllSuicide_Day$Day_of_Week_of_Death == 9),]
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 2),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 3),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 4),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 5),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 6),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 7),7] <- "WE"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 1),7] <- "WE"

SDay <- table(AllSuicide_Day$Day_of_Week_of_Death,AllSuicide_Day$Year)

DDay <- margin.table(SDay,1)

Day <- table(AllDeath_Day$Day_of_Week_of_Death,AllDeath_Day$Manner_of_Death)

#Pick colors
Color <- rainbow(2,s = 0.3, start = 0,end = 0.6)
pie(rep(1:2),labels = Color,col = Color)

#Generate plots
par(mfrow = c(1,2))
barplot(SDay, main = "Suicide distribution on day of week in 2014-2019",ylim = c(0,50000),
        xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Day of week",c("Workday","Weekend"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)

barplot(DDay, main = "Overall distribution on day of week",names.arg = c("Workday","Weekend"),
        ylim = c(0,300000),cex.names = 1,xlab = "Day of week",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Day of week",c("Workday","Weekend"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)

#3.7.2 chi-square test for between day of week on suicide
chisq.test(Day)
#Result:
#Pearson's Chi-squared test with Yates' continuity correction

#data:  Day
#X-squared = 477.19, df = 1, p-value < 2.2e-16
#Conclusion: There's association between day of week on suicide

#3.8 Effect of race on suicide 

#I divided the data into two categories-White(1) and minority(2)

AllDeath[which(AllDeath$Race_Recode_3 == 3),9] <- 2
AllSuicide[which(AllSuicide$Race_Recode_3 == 3),8] <- 2
SRace <- table(AllSuicide$Race_Recode_3,AllSuicide$Year)
DRace <- margin.table(SRace,1)
Race <- table(AllDeath$Race_Recode_3,AllDeath$Manner_of_Death)

#Pick colors
Color <- rainbow(2,s = 0.3, start = 0,end = 0.6)
pie(rep(1:2),labels = Color,col = Color)    
     
#Generate plots
par(mfrow = c(1,2))
barplot(SRace, main = "Suicide distribution on race in 2014-2019",ylim = c(0,60000),
        xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Race",c("White","Minority"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)

barplot(DDay, main = "Overall distribution on race",names.arg = c("White","Minority"),
        ylim = c(0,300000),cex.names = 1,xlab = "Race",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Day of week",c("White","Minority"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar) 



#4.Build the regression mode


#4.1 retidy up the data 
AllDeath_Model <- rbind(mort14,mort15,mort16,mort17,mort18,mort19)

#Gender
#0 ... Female
#1 ... Male
AllDeath_Model[which(AllDeath_Model$Sex == "M"),4] <- 1
AllDeath_Model[which(AllDeath_Model$Sex == "F"),4] <- 0

#Education
#1... 8th grade or less
#2... 9th grade-high school graduate
#3... College experience - Bachelor¡¯s degree
#4... MS¡¯s degree or higher
AllDeath_Model[which(AllDeath_Model$Education2003 == 3),2] <- 2
AllDeath_Model[which(AllDeath_Model$Education2003 == 4),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 5),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 6),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 7),2] <- 4
AllDeath_Model[which(AllDeath_Model$Education2003 == 8),2] <- 4
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Education2003 == 9),]

#Month of death
#01... 1st quarter
#02... 2nd quarter
#03... 3rd quarter
#04... 4th quarter
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 1),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 2),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 3),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 4),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 5),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 6),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 7),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 8),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 9),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 10),3] <- 4
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 11),3] <- 4
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 12),3] <- 4

#Age
#01 ... 15 - 24 years
#02 ... 25 - 34 years
#03 ... 35 - 44 years
#04 ... 45 - 54 years
#05 ... 55 - 64 years
#06 ... 65 - 74 years
#07 ... Older than 75 years
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 12),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 1),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 2),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 3),]

AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 4),5] <- 1
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 5),5] <- 2
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 6),5] <- 3
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 7),5] <- 4
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 8),5] <- 5
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 9),5] <- 6
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 10),5] <- 7
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 11),5] <- 7

#Marital status
#S ... Never married, single
#M ... Married
#W ... Widowed 
#D ... Divorced 
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Marital_Status == "U"),]

#Day of week
#WD ... Weekday
#WE ... Weekend
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Day_of_Week_of_Death == 9),]
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 2),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 3),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 4),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 5),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 6),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 7),7] <- "WE"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 1),7] <- "WE"

#Race
#0 ... Minority
#1 ... White
AllDeath_Model[which(AllDeath_Model$Race_Recode_3 == 3),9] <- 0
AllDeath_Model[which(AllDeath_Model$Race_Recode_3 == 2),9] <- 0

#4.2 Generate the mode

#4.2.1 Model 1
colnames(AllDeath_Model) <- c("ID","EducationLevel","Month","Gender","Age","MaritalStatus","Day","Suicide","Race","Year")

AllDeath_Model$Month <- as.character(AllDeath_Model$Month)
AllDeath_Model$Gender <- as.numeric(AllDeath_Model$Gender)
AllDeath_Model$Year <- as.character(AllDeath_Model$Year)
AllDeath_Model[which(AllDeath_Model$Day == "WD"),7] <- 1
AllDeath_Model[which(AllDeath_Model$Day == "WE"),7] <- 0
AllDeath_Model$Day <- as.numeric(AllDeath_Model$Day)

Model1 <- glm(Suicide ~ EducationLevel + Month + Gender + Age + MaritalStatus +  Day + Race + Year, data = AllDeath_Model, family = binomial())

summary(Model1)
#Result
#Call:
#        glm(formula = Suicide ~ EducationLevel + Month + Gender + Age + 
#                    MaritalStatus + Day + Race + Year, family = binomial(), data = AllDeath_Model)
#
#Deviance Residuals: 
#        Min       1Q   Median       3Q      Max  
#-1.3102  -0.1675  -0.1076  -0.0714   3.9310  
#
#Coefficients:
#        Estimate Std. Error  z value Pr(>|z|)    
#(Intercept)    -2.3599498  2.4870351   -0.949    0.343    
#EducationLevel  0.3755246  0.0031110  120.709   <2e-16 ***
#        Month2          0.1020167  0.0058768   17.359   <2e-16 ***
#        Month3          0.1227404  0.0058415   21.012   <2e-16 ***
#        Month4          0.0021775  0.0059525    0.366    0.715    
#Gender          0.8226879  0.0049734  165.416   <2e-16 ***
#        Age            -0.7671713  0.0014228 -539.189   <2e-16 ***
#        MaritalStatusM -0.2719331  0.0055790  -48.742   <2e-16 ***
#        MaritalStatusS -0.5936760  0.0064473  -92.081   <2e-16 ***
#        MaritalStatusW -0.5870634  0.0094604  -62.055   <2e-16 ***
#       Day             0.1689263  0.0046698   36.174   <2e-16 ***
#        Race            0.8942945  0.0066781  133.915   <2e-16 ***
#        Year            0.0001071  0.0012333    0.087    0.931    
#---
#        Signif. codes:  0 ¡®***¡¯ 0.001 ¡®**¡¯ 0.01 ¡®*¡¯ 0.05 ¡®.¡¯ 0.1 ¡® ¡¯ 1
#
#(Dispersion parameter for binomial family taken to be 1)
#
#Null deviance: 2596673  on 13914429  degrees of freedom
#Residual deviance: 2031714  on 13914417  degrees of freedom
#(2116736 observations deleted due to missingness)
#AIC: 2031740

#Number of Fisher Scoring iterations: 8

exp(coef(Model1))
#Result;
#(Intercept) EducationLevel         Month2         Month3         Month4         Gender            Age 
#0.09442496     1.45575494     1.10740202     1.13059083     1.00217986     2.27661103     0.46432465 
#MaritalStatusM MaritalStatusS MaritalStatusW            Day           Race           Year 
#0.76190521     0.55229332     0.55595754     1.18403292     2.44560991     1.00010712 

#Conclusion: Year doesn't have significant effect on suicide possibility, and there's no significant difference on 
#            suicide possibility between the 1st and 4th quarter of year

#4.2.2 Explanation for insignificant factors

#The year
EYear <- table(AllDeath_Model$Year,AllDeath_Model$Suicide)
A <- EYear[,1]
B <- EYear[,2]
EYear <- data.frame(A,B)
EYear$C <- c(2200503,2191493,2376695,2453301,2479804,2507040)
EYear$D <- EYear$B/EYear$C*100
colnames(EYear) <- c("Non-suicide","Suicide","Total","Suicide Percentage")

#chisq.test(EYear$`Suicide Percentage`)
#Result
#Chi-squared test for given probabilities

#data:  EYear$`Suicide Percentage`
#X-squared = 0.0039407, df = 5, p-value = 1
#Conclusion: Ratio of suicide in 2014-2019 is independent from each other

#The quarter
Quarter <- table(AllDeath_Model$Month,AllDeath_Model$Suicide)
Quarter14 <- Quarter[c(1,4),]
Quarter23 <- Quarter[c(2,3),]
margin.table(Quarter14,1)
margin.table(Quarter23,1)
A <- Quarter14[,1]
B <- Quarter14[,2]
Quarter14 <- data.frame(A,B)
Quarter14$C <- c(3776860,3622997)
Quarter14$D <- Quarter14$B/Quarter14$C*100
colnames(Quarter14) <- c("Non-suicide","Suicide","Total","Suicide Percentage")

A <- Quarter23[,1]
B <- Quarter23[,2]
Quarter23 <- data.frame(A,B)
Quarter23$C <- c(3776860,3622997)
Quarter23$D <- Quarter23$B/Quarter23$C*100
colnames(Quarter23) <- c("Non-suicide","Suicide","Total","Suicide Percentage")

chisq.test(Quarter14$`Suicide Percentage`)
#Result
#Chi-squared test for given probabilities

#data:  Quarter14$`Suicide Percentage`
#X-squared = 0.0016337, df = 1, p-value = 0.9678

chisq.test(Quarter23$`Suicide Percentage`)
#Result 
#Chi-squared test for given probabilities

#data:  Quarter23$`Suicide Percentage`
#X-squared = 0.0046919, df = 1, p-value = 0.9454

#Conclusion: The effects of 1st quarter and 4th Quarter were quite unified, 
#            while the effects of 2nd quarter and 3rd Quarter were quite unified.