Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Add files via upload
  • Loading branch information
hyb13001 committed Jul 10, 2020
1 parent 056b36f commit 28eae5b
Show file tree
Hide file tree
Showing 7 changed files with 636 additions and 0 deletions.
Binary file added B12quartiles.RData
Binary file not shown.
Binary file added ERdat2006.RData
Binary file not shown.
236 changes: 236 additions & 0 deletions ERwithBootstrap.R
@@ -0,0 +1,236 @@
rm(list=ls())
library("QREM")
boot.QREM <- function(func, linmod, dframe0, qn, n, sampleFrom=NULL,
B=100, err=10, maxit=1000, tol=0.001,
maxInvLambda=300, seedno=71371, showEst=FALSE) {
t0 <- Sys.time()
set.seed(seedno)
bs_set <- sample(n, replace = TRUE)
if (! is.null(sampleFrom))
colNum <- which(colnames(dframe0) == sampleFrom)
else
colNum <- 0
if (! is.null(sampleFrom))
dframe <- dframe0[which(dframe0[,colNum] %in% bs_set),]
else
dframe <- dframe0[bs_set,]
qremFit0 <- QREM(func, linmod, dframe, qn, err=err, maxit=maxit,
tol=tol, maxInvLambda=maxInvLambda)
if (B == 1)
return(qremFit0$coef$beta)

oneIt <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
useCores <- detectCores() - 1
if (showEst){
cat("One iteration ", ceiling(oneIt), "seconds\n")
cat("Estimated completion time, using", useCores, " cores >",
ceiling(oneIt*ceiling(B/useCores))," seconds\n")
}
t0 <- Sys.time()
n_coefs <- length(qremFit0$coef$beta)
# Initiate cluster
cl <- makeCluster(useCores)
clusterExport(cl,varlist=c("func","linmod", "dframe0", "qn","n","QREM",
"lm","lmer", "gam",
"getME","colNum","fixef","ranef","seedno",
"err", "maxit","tol","maxInvLambda","seedno"),
envir=environment())
QREMpar=parLapply(cl, 1:(B-1),
function(repnum) {
set.seed(seedno + 19 * repnum)
bs_set <- sample(n, replace = TRUE)
if (! is.null(sampleFrom))
dframe <- dframe0[which(dframe0[,colNum] %in% bs_set),]
else
dframe <- dframe0[bs_set,]
qremFit <- QREM(func, linmod, dframe, qn, err=err, maxit=maxit,
tol=tol, maxInvLambda=maxInvLambda)
qremFit$coef$beta
}
)
stopCluster(cl)
if (showEst)
cat("Actual completion time =", ceiling(as.numeric(difftime(Sys.time(), t0, units = "secs")))," seconds\n")
rbind(qremFit0$coef$beta, matrix(unlist(QREMpar), ncol = n_coefs, byrow = TRUE))
}


zalpha <- qnorm(0.025)

# data from
# ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHAMCS

load("ERdat2006.RData")
# pre-process the data and create the variables and the data frame
# show some diagnostics plots and tables
#
y <- log(ERdat$LOV+1,base=60)
hist(y,breaks=20)
Month <- as.factor(ERdat$month)
barplot(table(Month))
DOW <- as.factor(ERdat$dow)
barplot(table(DOW))
Sex <- as.factor(ERdat$sex)
levels(Sex) <- c("F","M")
pie(table(Sex))
plot(y~Month)
plot(y~DOW)
Race <- as.factor(ERdat$race)
barplot(table(Race))
Race2 <- Race
Race2[which(as.numeric(as.character(Race)) %in% c(-9,3,4,5,6))] <- 3
Race2 <- factor(Race2)
levels(Race2) = c("W","B","O")
barplot(table(Race2))
Age <- ERdat$age/100
hist(Age)
PayType <- as.factor(ERdat$paytype)
# 0 = Blank, 1 = Private insurance, 2 = Medicare, 3 = Medicaid/SCHIP
# 4 = Worker's Compensation, 5 = Self-pay, 6 = No charge/charity
# 7 = Other, 8 = Unknown
barplot(table(PayType))
# combine 0,7,8,
PayType2 <- PayType
PayType2[which(as.numeric(as.character(PayType)) %in% c(2,3,4))] <- 2
PayType2[which(as.numeric(as.character(PayType)) %in% c(5))] <- 3
PayType2[which(as.numeric(as.character(PayType)) %in% c(-9,-8,0,6,7,8))] <- 4
PayType2 <- factor(PayType2)
levels(PayType2) = c("Private","GovEmp","Self","Other")
barplot(table(PayType2))

Temp <- ERdat$temp/10
hist(Temp,breaks=20)
Pulse <- ERdat$pulse
hist(Pulse,breaks=20)
SBP <- ERdat$sbp
hist(SBP,breaks=20)
DBP <- ERdat$dbp
hist(DBP,breaks=20)

Pain <- as.factor(ERdat$pain)
barplot(table(Pain))

Residence <- as.factor(ERdat$resid)
barplot(table(Residence))
ArrivalMode <- as.factor(ERdat$arrmode)
barplot(table(ArrivalMode))
ArrivalTime <- as.factor(floor(ERdat$arrtime/100))
ArrivalTime2 <- floor(ERdat$arrtime/100)
ArrivalTime2[which(ArrivalTime2 >8 & ArrivalTime2 < 20)] <- "AM"
ArrivalTime2[which(ArrivalTime2 != "AM")] <- "PM"
barplot(table(ArrivalTime))
ArrivalTime2 = as.factor(ArrivalTime2)
barplot(table(ArrivalTime2))

Region <- as.factor(ERdat$region)
levels(Region) <- c("NE","MW","S","W")
barplot(table(Region))
Metro <- as.factor(ERdat$metro)
levels(Metro) <- c("Yes", "No")
barplot(table(Metro))
Owner <- as.factor(ERdat$owner)
barplot(table(Owner))
HospCode <- ERdat$hosp
hist(HospCode, breaks=30)

RecentVisit <- rep("N",length(HospCode))
if(length(ERdat$disch7 ) == 0) { RecentVisit[which(ERdat$seen72 == 1)] <- "Y"
} else { RecentVisit[which(ERdat$seen72 == 1 | ERdat$disch7 ==1)] <- "Y" }
RecentVisit <- as.factor(RecentVisit)
table(RecentVisit)


dframe <- data.frame(y,Sex,Month,DOW,Race2,Age,PayType2,Temp,Pulse,
SBP,DBP,Pain,Residence,ArrivalMode,ArrivalTime2,
Region, Metro, #Owner,
HospCode, RecentVisit)
qs <- c(seq(0.05, 0.95, by=0.05))
# no batch effect
linmod <- y~ Sex+Race2+Age+Region+Metro+
PayType2+ArrivalTime2+DOW+RecentVisit
ncols <- nlevels(Sex)-1+nlevels(Race2)-1+1+
nlevels(PayType2)-1+
nlevels(Region)-1+nlevels(Metro)-1+nlevels(ArrivalTime2)-1+
nlevels(DOW)-1+nlevels(RecentVisit)-1+1

res1 <- matrix(0,nrow=length(qs), ncol=2*ncols)
qqp <- matrix(0, nrow=length(qs), ncol=3)
for (i in 1:length(qs)) {
cat(i,qs[i],"\n")
qremFit <- QREM(lm,linmod, dframe, qs[i])
varKED <- bcov(qremFit, linmod=linmod, dframe, qs[i])
res1[i,] <- c(as.numeric(qremFit$fitted.mod$coefficients), sqrt(diag(varKED)))
}


# hospital is a random effect:
linmodrnd <- y~ Sex+Race2+Age+Region+Metro+
PayType2+ArrivalTime2+DOW+
RecentVisit+ (1|HospCode)
ncols2 <- nlevels(Sex)-1+nlevels(Race2)-1+1+
nlevels(PayType2)-1+
nlevels(Region)-1+nlevels(Metro)-1+nlevels(ArrivalTime2)-1+
nlevels(DOW)-1+nlevels(RecentVisit)-1+1

# set onlyEstimate = FALSE if you want to get regression coefficient estimates
# without running the bootstrap (which takes a long time):
res2 <- matrix(0,nrow=length(qs), ncol=2*ncols2)
onlyEstimate <- TRUE
if (onlyEstimate) {
for (i in 1:length(qs)) {
cat(i,qs[i],"\n")
qremFit <- QREM(lmer,linmodrnd, dframe, qs[i], maxit = 2000)
res2[i,] <- c(as.numeric(qremFit$coef$beta), rep(0,ncols2))
}
} else {
B <- 99
for (i in 1:length(qs)) {
cat(i,qs[i],"\n")
bsv <- boot.QREM(lmer, linmodrnd, dframe, qs[i], 100, #length(unique(HospCode)),
"HospCode", maxit = 2000, B=B, seedno=336621, showEst = TRUE)
res2[i,] <- c(colMeans(bsv), apply(bsv,2,sd))
}
}

#save(res1,res2,file="ERresults1120.RData")

# parameter estimates with 95% confidence intervals for each predictor, by quantile
# for the two models (with/without random effect) using smooth splines
# names(fixef(qremFit$fitted.mod))
varnames <- c("(Intercept)", "SexM", "Race2B", "Race2O", "Age", "RegionMW",
"RegionS", "RegionW", "MetroNo", "PayType2GovEmp", "PayType2Self", "PayType2Other" ,
"ArrivalTime2PM", "DOW2", "DOW3", "DOW4", "DOW5", "DOW6",
"DOW7", "RecentVisitY")
ciCols <- c("navyblue","darkred","orange")
sspldf=10
for (j in 1:(ncol(res1)/2)) {
mm <- min(res1[,j]-abs(zalpha)*res1[,j+ncol(res1)/2], res2[,j]-abs(zalpha)*res2[,j+ncol(res2)/2])
mm <- mm - abs(mm)*0.1
MM <- max(res1[,j]+abs(zalpha)*res1[,j+ncol(res1)/2], res2[,j]+abs(zalpha)*res2[,j+ncol(res2)/2])
MM <- MM + abs(MM)*0.1
#pdf(sprintf("fig/ER%02d.pdf",j),width = 5, height = 5)
plot(smooth.spline(qs,res1[,j],df=sspldf),type='l', axes=F, ylim=c(mm,MM),
main=varnames[j], ylab="Coef.", xlab="quantile",col=ciCols[1],lwd=2)
axis(1,labels=seq(0,1,by=0.1), at=seq(0,1,by=.1)); axis(2)
lines(smooth.spline(qs,res2[,j],df=sspldf),col=ciCols[2],lwd=2,lty=2)
yyl <- c(res1[,j]-abs(zalpha)*res1[,j+ncol(res1)/2])
yyu <- c(res1[,j]+abs(zalpha)*res1[,j+ncol(res1)/2])
sspl <- smooth.spline(qs, yyl, df=sspldf)
sspu <- smooth.spline(qs, yyu, df=sspldf)
xx <- c(sspl$x, rev(sspu$x))
yy <- c(sspl$y, rev(sspu$y))
polygon(xx, yy, col = adjustcolor(ciCols[1], alpha.f=0.1),
border = ciCols[1], lty=1)

yyl <- c(res2[,j]-abs(zalpha)*res2[,j+ncol(res2)/2])
yyu <- c(res2[,j]+abs(zalpha)*res2[,j+ncol(res2)/2])
sspl <- smooth.spline(qs, yyl, df=sspldf)
sspu <- smooth.spline(qs, yyu, df=sspldf)
xx <- c(sspl$x, rev(sspu$x))
yy <- c(sspl$y, rev(sspu$y))
polygon(xx, yy, col = adjustcolor(ciCols[2], alpha.f=0.1),
border = ciCols[2], lty=1)

abline(h=0,lwd=2,col="grey66")
#dev.off()
}
128 changes: 128 additions & 0 deletions QREMVSB12.R
@@ -0,0 +1,128 @@
library(QREM)
library(SEMMS)

res <- list()

qn <- 0.25 # change to 0.5 and 0.75
maxReps <- 20
fn <- "riboflavin1.csv"
responseCol <- 1
predictors <- 2:4089
dataYXZ <- readInputFile(fn, ycol=responseCol, Zcols=predictors)
nn <- 10
n <- dataYXZ$N
y0 <- dataYXZ$Y
K <- dataYXZ$K
# initialize:
zval <- rep(0, K)
for (i in seq(1,K,by=5)) {
if (i %% 101 == 0) { cat(i,"...\n") }
preds <- paste0(colnames(dataYXZ$Z)[i:min(i+4,K)], collapse = " + ")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i:min(i+4,K)]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i:min(i+4,K)])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
zval[i:min(i+4,K)] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1]
}
nnset <- order(abs(zval),decreasing = TRUE)[1:nn]
#cat("initial set", nnset,"\n")

for (rep in 1:maxReps) {
# create a subset of the selected columns and run QREM
preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
# apply the weights found by QREM and rerun SEMMS
dataYXZtmp <- dataYXZ
#dataYXZtmp$Z <- diag(sqrt(qremFit$weights))%*%dataYXZ$Z
dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)#*sqrt(qremFit$weights)
#plot(dataYXZ$Y, dataYXZtmp$Y); abline(0,1,col=2)

fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
nnset=nnset, minchange = 1, maxst = 20)
foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0),
fittedVSnew$gam.out$nn))
cat(rep,"\n",fittedVSnew$gam.out$nn,"\n",nnset,"\n", qremFit$empq,"\n\n")
if (length(fittedVSnew$gam.out$nn) == length(nnset)) {
if (all(fittedVSnew$gam.out$nn == nnset)) {
break
}
}
nnset <- fittedVSnew$gam.out$nn
}

# after the loop
fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
print(summary(fittedGLM$mod))
plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="... Data")
plotFit(fittedGLM)
plot(y0, col=(2+(qremFit$ui>0)), cex=0.7, pch=19)

if(length(nnset) > 0) {
for (i in 1:length(nnset)) {
plot(dataYXZ$Z[,nnset[i]],dataYXZ$Y, col=1+(qremFit$ui>0))
}
}

res[[1]] <- list(y0, nnset, qremFit, fittedGLM)

# loop over all the columns, and get the estimates for q=0.25, 0.5, 0.75
#
# This part can take a while... 4088 quantile regression models with
# variable selection

for (k in 1:K) {
cat(k,"...\n")
dataYXZ <- readInputFile(fn, ycol=k+1, Zcols=setdiff(predictors,k+1))
n <- dataYXZ$N
y0 <- dataYXZ$Y
K <- dataYXZ$K
zval <- rep(0, K)
for (i in seq(1,K,by=5)) {
if (i %% 101 == 0) { cat(i,"...\n") }
preds <- paste0(colnames(dataYXZ$Z)[i:min(i+4,K)], collapse = " + ")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i:min(i+4,K)]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i:min(i+4,K)])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
zval[i:min(i+4,K)] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1]
}
nnset = order(abs(zval),decreasing = TRUE)[1:nn]
#cat("initial set", nnset,"\n")
for (rep in 1:maxReps) {
# create a subset of the selected columns and run QREM
preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
# apply the weights found by QREM and rerun SEMMS
dataYXZtmp <- dataYXZ
#dataYXZtmp$Z <- diag(sqrt(qremFit$weights))%*%dataYXZ$Z
dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)#*sqrt(qremFit$weights)
fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
nnset=nnset, minchange = 1, maxst = 20)
foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0),
fittedVSnew$gam.out$nn))
cat(rep,"\n",fittedVSnew$gam.out$nn,"\n",nnset,"\n", qremFit$empq,"\n\n")
if (length(fittedVSnew$gam.out$nn) == length(nnset)) {
if (all(fittedVSnew$gam.out$nn == nnset)) {
break
}
}
nnset <- fittedVSnew$gam.out$nn
if (length(nnset) == 0)
break
}

if (length(nnset) == 0) {
res[[k+1]] <- list()
next
}
fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
res[[k+1]] <- list(y0, nnset, qremFit, fittedGLM)
}
save(res, file="B2q25.RData") # change to B2q50 and B2q75 when using different
# quartiles

0 comments on commit 28eae5b

Please sign in to comment.