Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
#Packages used in this project
library(openxlsx)
library(vcd)
library(Hmisc)
#1. RAW DATA TIDYING UP AND SELECTION FOR POSSIBLE FACTORS BY SUBSETTING
#Tidy up the raw date, delete all blank variables or variables that seems to be irrelevant to the suicide
#Extract all suicide data for quicker processing the suicide data
mort19 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort19.csv")
mort18 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort18.csv")
mort17 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort17.csv")
mort16 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort16.csv")
mort15 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort15.csv")
mort14 <- read.csv("D:/FinalProject/DATA/CSVfiles/mort14.csv")
#Delete unnecessary and blank variables
mort19 <- mort19[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort18 <- mort18[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort17 <- mort17[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort16 <- mort16[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort15 <- mort15[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
mort14 <- mort14[,-c(1,2,4,7:10,12,13,16,17,19:52,54:57)]
#Select all the suicide cases
S19 <- mort19[which(mort19$Manner_of_Death == 2),]
S19 <- S19[,-7]
S18 <- mort18[which(mort18$Manner_of_Death == 2),]
S18 <- S18[,-7]
S17 <- mort17[which(mort17$Manner_of_Death == 2),]
S17 <- S17[,-7]
S16 <- mort16[which(mort16$Manner_of_Death == 2),]
S16 <- S16[,-7]
S15 <- mort15[which(mort15$Manner_of_Death == 2),]
S15 <- S15[,-7]
S14 <- mort14[which(mort14$Manner_of_Death == 2),]
S14 <- S14[,-7]
#File back-up
write.csv(mort19, file = "D:/FinalProject/DATA/Analysis/Death/mort19.csv")
write.csv(mort18, file = "D:/FinalProject/DATA/Analysis/Death/mort18.csv")
write.csv(mort17, file = "D:/FinalProject/DATA/Analysis/Death/mort17.csv")
write.csv(mort16, file = "D:/FinalProject/DATA/Analysis/Death/mort16.csv")
write.csv(mort15, file = "D:/FinalProject/DATA/Analysis/Death/mort15.csv")
write.csv(mort14, file = "D:/FinalProject/DATA/Analysis/Death/mort14.csv")
write.csv(S19, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort19.csv")
write.csv(S18, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort18.csv")
write.csv(S17, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort17.csv")
write.csv(S16, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort16.csv")
write.csv(S15, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort15.csv")
write.csv(S14, file = "D:/FinalProject/DATA/Analysis/Suicide/Smort14.csv")
#2. IMPORT THE DEATH & SUICIDE FILES
#Import the death data
mort19 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort19.csv")
mort18 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort18.csv")
mort17 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort17.csv")
mort16 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort16.csv")
mort15 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort15.csv")
mort14 <- read.csv("D:/FinalProject/DATA/Analysis/Death/mort14.csv")
#Import the suicide data
S19 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort19.csv")
S18 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort18.csv")
S17 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort17.csv")
S16 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort16.csv")
S15 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort15.csv")
S14 <- read.csv("D:/FinalProject/DATA/Analysis/Suicide/Smort14.csv")
#In "Manner_of_Death" column, "0" stands for non-suicide cases, while "1" stands for suicide cases
mort14[which(mort14$Manner_of_Death != 2),8] <- 0
mort14[which(mort14$Manner_of_Death == 2),8] <- 1
mort15[which(mort15$Manner_of_Death != 2),8] <- 0
mort15[which(mort15$Manner_of_Death == 2),8] <- 1
mort16[which(mort16$Manner_of_Death != 2),8] <- 0
mort16[which(mort16$Manner_of_Death == 2),8] <- 1
mort17[which(mort17$Manner_of_Death != 2),8] <- 0
mort17[which(mort17$Manner_of_Death == 2),8] <- 1
mort18[which(mort18$Manner_of_Death != 2),8] <- 0
mort18[which(mort18$Manner_of_Death == 2),8] <- 1
mort19[which(mort19$Manner_of_Death != 2),8] <- 0
mort19[which(mort19$Manner_of_Death == 2),8] <- 1
#Notify the year
mort14$Year <- 2014
mort15$Year <- 2015
mort16$Year <- 2016
mort17$Year <- 2017
mort18$Year <- 2018
mort19$Year <- 2019
S14$Year <- 2014
S15$Year <- 2015
S16$Year <- 2016
S17$Year <- 2017
S18$Year <- 2018
S19$Year <- 2019
#3. The data analysis
#Pull together the data needed
#Table of death and suicide number every year
year <- c(2014:2019)
Suicide <- c(nrow(S14),nrow(S15),nrow(S16),nrow(S17),nrow(S18),nrow(S19))
Death <- c(nrow(mort14),nrow(mort15),nrow(mort16),nrow(mort17),nrow(mort18),nrow(mort19))
Everyyear <- data.frame(year,Death,Suicide)
Everyyear$SuicidePercentage <- Suicide/Death*100
Everyyear$SuicidePercentage <- format(Everyyear$SuicidePercentage,digits = 3)
#Table of all suicide cases
AllSuicide <- rbind(S14,S15,S16,S17,S18,S19)
#Table of all death cases
AllDeath <- rbind(mort14,mort15,mort16,mort17,mort18,mort19)
#3.1 Change of the suicide number and percentages among overall death number over years
#3,1,1 Descriptive statistics of the suicide changes over the years
#Generate the plot
opar <- par(no.readonly = TRUE)
par(mfrow = c(1,2))
attach(Everyyear)
plot(Year,Suicide,type = "b",main = "Number of suicide cases from 2014-2019",
xlab = "Year", ylab = "Number of suicide case",xlim = c(2013,2020),ylim = c(42000,50000),
pch = 15, lty = 5)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
plot(Year,SuicidePercentage,type = "b",main = "Percentages of suicide cases from 2014-2019",
xlab = "Year", ylab = "Percentage of suicide case",xlim = c(2013,2020),ylim = c(1.6,1.75),
pch = 16, lty = 3)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
detach(Everyyear)
par(opar)
#ConClusion: Both suicide number and suicide percentage has a raising trend over the years
#3.1.2 Chi-Square test for suicide-year independence
AllDeath$Manner_of_Death <- as.factor(AllDeath$Manner_of_Death)
AllDeath$Year <- as.factor(AllDeath$Year)
Year <- table(AllDeath$Year,AllDeath$Manner_of_Death)
chisq.test(Year)
#Result:
#Pearson's Chi-squared test
#data: Year
#X-squared = 56.786, df = 5, p-value = 5.597e-11
#Conclusion: The suicide status changes over the years
#3.2 Effect of gender on suicide
#3.2.1 Descriptive plot of gender distribution on suicide
#Generate sex table for grouped bar plot all the years
AllSuicide$Sex <- as.factor(AllSuicide$Sex)
AllSuicide$Year <- as.factor(AllSuicide$Year)
SSex <- table(AllSuicide$Sex,AllSuicide$Year)
Sex <- table(AllDeath$Sex,AllDeath$Manner_of_Death)
#check the summed suicide number for both gender
margin.table(SSex,1)
#Pick 2 colors
Gray <- gray(0:15/15)
pie(rep(1:15),labels = Gray,col = Gray)
#Form a grouped bar plot
par(mfrow = c(1,2))
barplot(SSex, main = "Suicide distribution on gender in 2009-2019",ylim = c(0,50000),
xlab = "Year",ylab = "Number of suicide",col = c("#000000","#EEEEEE"),beside = TRUE)
legend("topleft",inset = .05,title = "Gender",c("F","M"),fill = c("#000000","#EEEEEE"),cex = 0.8)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)
barplot(c(61699,214782), main = "Overall distribution on gender",
xlab = "Gender",ylab = "Number of suicide",col = c("#000000","#EEEEEE"))
legend("topleft",inset = .05,title = "Gender",c("F","M"),fill = c("#000000","#EEEEEE"),cex = 0.8)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#3.2.2 Using Chi-square test to make sure there's significant difference on suicide number between male and female
chisq.test(c(61699,214782))
#Result:
#Pearson's Chi-squared test
#
#data: c(61699, 214782)
#X-squared = 84760, df = 1, p-value < 2.2e-16
#Conclusion: There's significant difference on suicide number between male and female
#3.2.3 Chi-Square test for suicide-gender independence
Sex <- table(AllDeath$Sex,AllDeath$Manner_of_Death)
chisq.test(Sex)
#Result:
#Pearson's Chi-squared test with Yates' continuity correction
#data: Sex
#X-squared = 77774, df = 1, p-value < 2.2e-16
#Conclusion: There's dependence between gender and suicide
#3.3 Effect of education on suicide
#3.3.1 Descriptive statistics of education level on suicide
#Make a new rank on education level, old rank and my new rank is down below
# Old rank
#1... 8th grade or less
#2... 9 - 12th grade, no diploma
#3... high school graduate or GED completed
#4... some college credit, but no degree
#5... Associate degree
#6... Bachelor¡¯s degree
#7... Master¡¯s degree
#8... Doctorate or professional degree
#New rank
#1... 8th grade or less
#2... 9th grade-high school graduate
#3... College experience - Bachelor¡¯s degree
#4... MS¡¯s degree or higher
#Adapt the datasets for all death and suicide case to new rank
AllDeath[which(AllDeath$Education2003 == 3),2] <- 2
AllDeath[which(AllDeath$Education2003 == 4),2] <- 3
AllDeath[which(AllDeath$Education2003 == 5),2] <- 3
AllDeath[which(AllDeath$Education2003 == 6),2] <- 3
AllDeath[which(AllDeath$Education2003 == 7),2] <- 4
AllDeath[which(AllDeath$Education2003 == 8),2] <- 4
AllSuicide[which(AllSuicide$Education2003 == 3),2] <- 2
AllSuicide[which(AllSuicide$Education2003 == 4),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 5),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 6),2] <- 3
AllSuicide[which(AllSuicide$Education2003 == 7),2] <- 4
AllSuicide[which(AllSuicide$Education2003 == 8),2] <- 4
#Due to the fact there's suicide cases with unknown education status, thus I'll have to delete those missing values
#Make copies of origin files
AllSuicide_Edu <- AllSuicide
AllSuicide_Edu <- AllSuicide_Edu[-which(AllSuicide_Edu$Education2003 == 9),]
AllDeath_Edu <- AllDeath
AllDeath_Edu <- AllDeath_Edu[-which(AllDeath_Edu$Education2003 == 9),]
#Generate the table
AllSuicide_Edu$Education2003 <- as.factor(AllSuicide_Edu$Education2003)
SEducation <- table(AllSuicide_Edu$Education2003,AllSuicide_Edu$Year)
#Pick colors
Color <- rainbow(4,s = 0.3, start = 0,end = 0.6)
pie(rep(1:4),labels = Color,col = Color)
#Generate the table for summed suicide number for each education level
DEducation <- margin.table(SEducation,1)
#Generate the descriptive statistics
par(mfrow = c(1,2))
barplot(SEducation, main = "Suicide distribution on education level in 2014-2019",ylim = c(0,40000),
xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Education level",
c("8th grade or less","9-12 grade no diploma","High school graduate or GED completed","College experience and higher degree"),
fill = Color,cex = 0.75)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)
barplot(DEducation, main = "Overall distribution on education level",ylim = c(10000,250000),
cex.names = 1,xlab = "Education level",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Education level",
c("8th grad or less", "9th grade - high school graduate", "College experience - Bachelor's degreee","MS's degree or higher"),
fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#3.3.2 Chi-Square test for suicide-education independence
Education <- table(AllDeath_Edu$Education2003,AllDeath_Edu$Manner_of_Death)
chisq.test(Education)
#Result:
#Chi-squared test for given probabilities
#data: Education
#X-squared = 18461, df = 3, p-value < 2.2e-16
#Conclusion:There's dependence between education level and suicide
#3.4 Effect of month of death on suicide
#I divide the data into 4 quarters of year
#3.4.1 Descriptive statistics on death of month
#Generate table
AllDeath[which(AllDeath$Month_of_Death == 1),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 2),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 3),3] <- 1
AllDeath[which(AllDeath$Month_of_Death == 4),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 5),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 6),3] <- 2
AllDeath[which(AllDeath$Month_of_Death == 7),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 8),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 9),3] <- 3
AllDeath[which(AllDeath$Month_of_Death == 10),3] <- 4
AllDeath[which(AllDeath$Month_of_Death == 11),3] <- 4
AllDeath[which(AllDeath$Month_of_Death == 12),3] <- 4
AllSuicide[which(AllSuicide$Month_of_Death == 1),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 2),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 3),3] <- 1
AllSuicide[which(AllSuicide$Month_of_Death == 4),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 5),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 6),3] <- 2
AllSuicide[which(AllSuicide$Month_of_Death == 7),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 8),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 9),3] <- 3
AllSuicide[which(AllSuicide$Month_of_Death == 10),3] <- 4
AllSuicide[which(AllSuicide$Month_of_Death == 11),3] <- 4
AllSuicide[which(AllSuicide$Month_of_Death == 12),3] <- 4
SMonth <- table(AllSuicide$Month_of_Death,AllSuicide$Year)
DMonth <- margin.table(SMonth,1)
Month <- table(AllDeath$Month_of_Death,AllDeath$Manner_of_Death)
#Pick color for the plot
Color <- rainbow(4,s = 0.3, start = 0,end = 0.8)
pie(rep(1:4),labels = Color,col = Color)
#Generate plots
par(mfrow = c(1,2))
barplot(SMonth, main = "Suicide distribution on quarter in 2014-2019",ylim = c(0,20000),
xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Quarter",
c("1st Quarter","2nd Quarter","3rd Quarter","4th Quarter"),fill = Color,cex = 0.75)
minor.tick(nx = 1,ny = 2,tick.ratio = 0.5)
barplot(DMonth, main = "Overall distribution on quarter",ylim = c(0,100000),
cex.names = 1,xlab = "Quarter",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Quarter",c("1st Quarter","2nd Quarter","3rd Quarter","4th Quarter"),
fill = Color,cex = 0.65)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#Observation:
#No obvious trend observed
#3.5 Effect of age on suicide
#3.5.1 Descriptive statistics of age distribution on suicide
#Due to the fact that there were cases with unknown ages, and there's no suicide so I have to make a subset of origin data
#And I'll adapt a new rank for the data, I'll put both two ranks down below
#Because in the age rage below 15, the number of death is relative small
#compare to the overall death number and suicide case is quite few, so in this study, I discuss only
#cases older than 15 years
#Old rank
#1 ... Under 1 year (includes not stated infant ages)
#2 ... 1 - 4 years
#3 ... 5 - 14 years
#4 ... 15 - 24 years
#5 ... 25 - 34 years
#6 ... 35 - 44 years
#7 ... 45 - 54 years
#8 ... 55 - 64 years
#9 ... 65 - 74 years
#10 ... 75 - 84 years
#11 ... 85 years and over
#New rank
#1 ... 15 - 24 years
#2 ... 25 - 34 years
#3 ... 35 - 44 years
#4 ... 45 - 54 years
#5 ... 55 - 64 years
#6 ... 65 - 74 years
#7 ... Older than 75 years
#Adapt the origin data to new rank
AllDeath_Age <- AllDeath[-which(AllDeath$Age_Recode_12 == 12),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 1),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 2),]
AllDeath_Age <- AllDeath_Age[-which(AllDeath_Age$Age_Recode_12 == 3),]
AllSuicide_Age <- AllSuicide[-which(AllSuicide$Age_Recode_12 == 12),]
AllSuicide_Age <- AllSuicide_Age[-which(AllSuicide_Age$Age_Recode_12 == 3),]
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 4),5] <- 1
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 5),5] <- 2
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 6),5] <- 3
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 7),5] <- 4
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 8),5] <- 5
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 9),5] <- 6
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 10),5] <- 7
AllDeath_Age[which(AllDeath_Age$Age_Recode_12 == 11),5] <- 7
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 4),5] <- 1
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 5),5] <- 2
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 6),5] <- 3
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 7),5] <- 4
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 8),5] <- 5
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 9),5] <- 6
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 10),5] <- 7
AllSuicide_Age[which(AllSuicide_Age$Age_Recode_12 == 11),5] <- 7
Age <- table(AllDeath_Age$Age_Recode_12,AllDeath_Age$Manner_of_Death)
SAge <- table(AllSuicide_Age$Age_Recode_12,AllSuicide_Age$Year)
DAge <- margin.table(SAge,1)
#Pick colors
Color <- rainbow(7,s = 0.3, start = 0,end = 0.6)
pie(rep(1:7),labels = Color,col = Color)
#Generate plots
S15_24 <- SAge[1,]
S25_34 <- SAge[2,]
S35_44 <- SAge[3,]
S45_54 <- SAge[4,]
S55_64 <- SAge[5,]
S65_74 <- SAge[6,]
S75 <- SAge[7,]
par(mfrow = c(1,2))
barplot(DAge, main = "Overall distribution on Age",
names.arg = c("15-24","25-34","35-44","45-54","55-64","65-74","75+"),cex.names = 1,
xlab = "Age",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Age",c("24-","25-34","35-44","45-54","55-64","65-74","75+"),
fill = Color,cex = 0.6)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
plot(year,S15_24,type = "b",pch = 1,lty = 1,ylim = c(0,8500),main = "Suicide number change in 2014-2019",
xlab = "Year",ylab = "Suicide number")
lines(year,S25_34,type = "b",lty = 1,pch = 2,col = "red")
lines(year,S35_44,type = "b",lty = 2,pch = 3,col = "blue")
lines(year,S45_54,type = "b",lty = 3,pch = 4,col = "green")
lines(year,S55_64,type = "b",lty = 4,pch = 5,col = "grey")
lines(year,S65_74,type = "b",lty = 5,pch = 6,col = "purple")
lines(year,S75,type = "b",lty = 6,pch = 7,col = "yellow")
legend("bottomright",inset = .05,title = "Age",c("24-","25-34","35-44","45-54","55-64","65-74","75+"),
fill = c("black","red","blue","green","grey","purple","yellow"),cex = 0.6)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#3.5.2 Chi-square test for age-suicide independence
chisq.test(Age)
#Result
#Pearson's Chi-squared test
#data: Age
#X-squared = 892217, df = 6, p-value < 2.2e-16
#Conclusion: There's dependence between age and suicide
#3.6 Effect of marital status
#Descriptive statistics
#Due to the fact that there's cases with unknown marital status, subsetting the origin datasets is needed
AllDeath_Marital <- AllDeath
AllDeath_Marital <- AllDeath_Marital[-which(AllDeath_Marital$Marital_Status == "U"),]
AllSuicide_Marital <- AllSuicide
AllSuicide_Marital <- AllSuicide_Marital[-which(AllSuicide_Marital$Marital_Status == "U"),]
SMarital <- table(AllSuicide_Marital$Marital_Status,AllSuicide_Marital$Year)
DMarital <- margin.table(SMarital,1)
Marital <- table(AllDeath_Marital$Marital_Status,AllDeath_Marital$Manner_of_Death)
#Pick colors
Color <- rainbow(4,s = 0.3, start = 0,end = 0.6)
pie(rep(1:4),labels = Color,col = Color)
#Generate plots
par(mfrow = c(1,2))
barplot(SMarital, main = "Suicide distribution on marital status in 2014-2019",ylim = c(0,30000),
xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Marital status",
c("Divorced","Married","Single","Widowed"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
barplot(DMarital, main = "Overall distribution on marital status",names.arg = c("Divorced","Married","Single","Widowed"),
ylim = c(0,150000),cex.names = 1,xlab = "Marital status",ylab = "Number of suicide",col = Color)
legend(locator(1),inset = .05,title = "Education level",
c("Divorced","Married","Single","Widowed"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#3.6.2 Chi-square test for marital--suicide independence
chisq.test(Marital)
#Result:
#Pearson's Chi-squared test
#data: Marital
#X-squared = 181158, df = 3, p-value < 2.2e-16
#There's association between marital status and suicide
#3.7 Effect of Day of week on suicide
#I divide the data of every day of week into 2 categories-weekday, and weekend
#Generate a new dataset with no unknown day cases on day of week
AllDeath_Day <- AllDeath
AllDeath_Day <- AllDeath_Day[-which(AllDeath_Day$Day_of_Week_of_Death == 9),]
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 2),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 3),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 4),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 5),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 6),7] <- "WD"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 7),7] <- "WE"
AllDeath_Day[which(AllDeath_Day$Day_of_Week_of_Death == 1),7] <- "WE"
AllSuicide_Day <- AllSuicide
AllSuicide_Day <- AllSuicide_Day[-which(AllSuicide_Day$Day_of_Week_of_Death == 9),]
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 2),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 3),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 4),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 5),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 6),7] <- "WD"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 7),7] <- "WE"
AllSuicide_Day[which(AllSuicide_Day$Day_of_Week_of_Death == 1),7] <- "WE"
SDay <- table(AllSuicide_Day$Day_of_Week_of_Death,AllSuicide_Day$Year)
DDay <- margin.table(SDay,1)
Day <- table(AllDeath_Day$Day_of_Week_of_Death,AllDeath_Day$Manner_of_Death)
#Pick colors
Color <- rainbow(2,s = 0.3, start = 0,end = 0.6)
pie(rep(1:2),labels = Color,col = Color)
#Generate plots
par(mfrow = c(1,2))
barplot(SDay, main = "Suicide distribution on day of week in 2014-2019",ylim = c(0,50000),
xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Day of week",c("Workday","Weekend"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
barplot(DDay, main = "Overall distribution on day of week",names.arg = c("Workday","Weekend"),
ylim = c(0,300000),cex.names = 1,xlab = "Day of week",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Day of week",c("Workday","Weekend"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#3.7.2 chi-square test for between day of week on suicide
chisq.test(Day)
#Result:
#Pearson's Chi-squared test with Yates' continuity correction
#data: Day
#X-squared = 477.19, df = 1, p-value < 2.2e-16
#Conclusion: There's association between day of week on suicide
#3.8 Effect of race on suicide
#I divided the data into two categories-White(1) and minority(2)
AllDeath[which(AllDeath$Race_Recode_3 == 3),9] <- 2
AllSuicide[which(AllSuicide$Race_Recode_3 == 3),8] <- 2
SRace <- table(AllSuicide$Race_Recode_3,AllSuicide$Year)
DRace <- margin.table(SRace,1)
Race <- table(AllDeath$Race_Recode_3,AllDeath$Manner_of_Death)
#Pick colors
Color <- rainbow(2,s = 0.3, start = 0,end = 0.6)
pie(rep(1:2),labels = Color,col = Color)
#Generate plots
par(mfrow = c(1,2))
barplot(SRace, main = "Suicide distribution on race in 2014-2019",ylim = c(0,60000),
xlab = "Year",ylab = "Number of suicide",col = Color,beside = TRUE)
legend("topleft",inset = .05,title = "Race",c("White","Minority"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
barplot(DDay, main = "Overall distribution on race",names.arg = c("White","Minority"),
ylim = c(0,300000),cex.names = 1,xlab = "Race",ylab = "Number of suicide",col = Color)
legend("topleft",inset = .05,title = "Day of week",c("White","Minority"),fill = Color,cex = 0.75)
minor.tick(nx = 0,ny = 2,tick.ratio = 0.5)
par(opar)
#4.Build the regression mode
#4.1 retidy up the data
AllDeath_Model <- rbind(mort14,mort15,mort16,mort17,mort18,mort19)
#Gender
#0 ... Female
#1 ... Male
AllDeath_Model[which(AllDeath_Model$Sex == "M"),4] <- 1
AllDeath_Model[which(AllDeath_Model$Sex == "F"),4] <- 0
#Education
#1... 8th grade or less
#2... 9th grade-high school graduate
#3... College experience - Bachelor¡¯s degree
#4... MS¡¯s degree or higher
AllDeath_Model[which(AllDeath_Model$Education2003 == 3),2] <- 2
AllDeath_Model[which(AllDeath_Model$Education2003 == 4),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 5),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 6),2] <- 3
AllDeath_Model[which(AllDeath_Model$Education2003 == 7),2] <- 4
AllDeath_Model[which(AllDeath_Model$Education2003 == 8),2] <- 4
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Education2003 == 9),]
#Month of death
#01... 1st quarter
#02... 2nd quarter
#03... 3rd quarter
#04... 4th quarter
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 1),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 2),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 3),3] <- 1
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 4),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 5),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 6),3] <- 2
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 7),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 8),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 9),3] <- 3
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 10),3] <- 4
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 11),3] <- 4
AllDeath_Model[which(AllDeath_Model$Month_of_Death == 12),3] <- 4
#Age
#01 ... 15 - 24 years
#02 ... 25 - 34 years
#03 ... 35 - 44 years
#04 ... 45 - 54 years
#05 ... 55 - 64 years
#06 ... 65 - 74 years
#07 ... Older than 75 years
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 12),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 1),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 2),]
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Age_Recode_12 == 3),]
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 4),5] <- 1
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 5),5] <- 2
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 6),5] <- 3
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 7),5] <- 4
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 8),5] <- 5
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 9),5] <- 6
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 10),5] <- 7
AllDeath_Model[which(AllDeath_Model$Age_Recode_12 == 11),5] <- 7
#Marital status
#S ... Never married, single
#M ... Married
#W ... Widowed
#D ... Divorced
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Marital_Status == "U"),]
#Day of week
#WD ... Weekday
#WE ... Weekend
AllDeath_Model <- AllDeath_Model[-which(AllDeath_Model$Day_of_Week_of_Death == 9),]
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 2),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 3),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 4),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 5),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 6),7] <- "WD"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 7),7] <- "WE"
AllDeath_Model[which(AllDeath_Model$Day_of_Week_of_Death == 1),7] <- "WE"
#Race
#0 ... Minority
#1 ... White
AllDeath_Model[which(AllDeath_Model$Race_Recode_3 == 3),9] <- 0
AllDeath_Model[which(AllDeath_Model$Race_Recode_3 == 2),9] <- 0
#4.2 Generate the mode
#4.2.1 Model 1
colnames(AllDeath_Model) <- c("ID","EducationLevel","Month","Gender","Age","MaritalStatus","Day","Suicide","Race","Year")
AllDeath_Model$Month <- as.character(AllDeath_Model$Month)
AllDeath_Model$Gender <- as.numeric(AllDeath_Model$Gender)
AllDeath_Model$Year <- as.character(AllDeath_Model$Year)
AllDeath_Model[which(AllDeath_Model$Day == "WD"),7] <- 1
AllDeath_Model[which(AllDeath_Model$Day == "WE"),7] <- 0
AllDeath_Model$Day <- as.numeric(AllDeath_Model$Day)
Model1 <- glm(Suicide ~ EducationLevel + Month + Gender + Age + MaritalStatus + Day + Race + Year, data = AllDeath_Model, family = binomial())
summary(Model1)
#Result
#Call:
# glm(formula = Suicide ~ EducationLevel + Month + Gender + Age +
# MaritalStatus + Day + Race + Year, family = binomial(), data = AllDeath_Model)
#
#Deviance Residuals:
# Min 1Q Median 3Q Max
#-1.3102 -0.1675 -0.1076 -0.0714 3.9310
#
#Coefficients:
# Estimate Std. Error z value Pr(>|z|)
#(Intercept) -2.3599498 2.4870351 -0.949 0.343
#EducationLevel 0.3755246 0.0031110 120.709 <2e-16 ***
# Month2 0.1020167 0.0058768 17.359 <2e-16 ***
# Month3 0.1227404 0.0058415 21.012 <2e-16 ***
# Month4 0.0021775 0.0059525 0.366 0.715
#Gender 0.8226879 0.0049734 165.416 <2e-16 ***
# Age -0.7671713 0.0014228 -539.189 <2e-16 ***
# MaritalStatusM -0.2719331 0.0055790 -48.742 <2e-16 ***
# MaritalStatusS -0.5936760 0.0064473 -92.081 <2e-16 ***
# MaritalStatusW -0.5870634 0.0094604 -62.055 <2e-16 ***
# Day 0.1689263 0.0046698 36.174 <2e-16 ***
# Race 0.8942945 0.0066781 133.915 <2e-16 ***
# Year 0.0001071 0.0012333 0.087 0.931
#---
# Signif. codes: 0 ¡®***¡¯ 0.001 ¡®**¡¯ 0.01 ¡®*¡¯ 0.05 ¡®.¡¯ 0.1 ¡® ¡¯ 1
#
#(Dispersion parameter for binomial family taken to be 1)
#
#Null deviance: 2596673 on 13914429 degrees of freedom
#Residual deviance: 2031714 on 13914417 degrees of freedom
#(2116736 observations deleted due to missingness)
#AIC: 2031740
#Number of Fisher Scoring iterations: 8
exp(coef(Model1))
#Result;
#(Intercept) EducationLevel Month2 Month3 Month4 Gender Age
#0.09442496 1.45575494 1.10740202 1.13059083 1.00217986 2.27661103 0.46432465
#MaritalStatusM MaritalStatusS MaritalStatusW Day Race Year
#0.76190521 0.55229332 0.55595754 1.18403292 2.44560991 1.00010712
#Conclusion: Year doesn't have significant effect on suicide possibility, and there's no significant difference on
# suicide possibility between the 1st and 4th quarter of year
#4.2.2 Explanation for insignificant factors
#The year
EYear <- table(AllDeath_Model$Year,AllDeath_Model$Suicide)
A <- EYear[,1]
B <- EYear[,2]
EYear <- data.frame(A,B)
EYear$C <- c(2200503,2191493,2376695,2453301,2479804,2507040)
EYear$D <- EYear$B/EYear$C*100
colnames(EYear) <- c("Non-suicide","Suicide","Total","Suicide Percentage")
#chisq.test(EYear$`Suicide Percentage`)
#Result
#Chi-squared test for given probabilities
#data: EYear$`Suicide Percentage`
#X-squared = 0.0039407, df = 5, p-value = 1
#Conclusion: Ratio of suicide in 2014-2019 is independent from each other
#The quarter
Quarter <- table(AllDeath_Model$Month,AllDeath_Model$Suicide)
Quarter14 <- Quarter[c(1,4),]
Quarter23 <- Quarter[c(2,3),]
margin.table(Quarter14,1)
margin.table(Quarter23,1)
A <- Quarter14[,1]
B <- Quarter14[,2]
Quarter14 <- data.frame(A,B)
Quarter14$C <- c(3776860,3622997)
Quarter14$D <- Quarter14$B/Quarter14$C*100
colnames(Quarter14) <- c("Non-suicide","Suicide","Total","Suicide Percentage")
A <- Quarter23[,1]
B <- Quarter23[,2]
Quarter23 <- data.frame(A,B)
Quarter23$C <- c(3776860,3622997)
Quarter23$D <- Quarter23$B/Quarter23$C*100
colnames(Quarter23) <- c("Non-suicide","Suicide","Total","Suicide Percentage")
chisq.test(Quarter14$`Suicide Percentage`)
#Result
#Chi-squared test for given probabilities
#data: Quarter14$`Suicide Percentage`
#X-squared = 0.0016337, df = 1, p-value = 0.9678
chisq.test(Quarter23$`Suicide Percentage`)
#Result
#Chi-squared test for given probabilities
#data: Quarter23$`Suicide Percentage`
#X-squared = 0.0046919, df = 1, p-value = 0.9454
#Conclusion: The effects of 1st quarter and 4th Quarter were quite unified,
# while the effects of 2nd quarter and 3rd Quarter were quite unified.