Add files via upload

hyb13001 · Jul 10, 2020 · 28eae5b · 28eae5b
1 parent 056b36f
commit 28eae5b
Show file tree

Hide file tree

Showing 7 changed files with 636 additions and 0 deletions.
diff --git a/B12quartiles.RData b/B12quartiles.RData
diff --git a/ERdat2006.RData b/ERdat2006.RData
diff --git a/ERwithBootstrap.R b/ERwithBootstrap.R
@@ -0,0 +1,236 @@
+rm(list=ls())
+library("QREM")
+boot.QREM <- function(func, linmod, dframe0, qn, n, sampleFrom=NULL, 
+                      B=100, err=10, maxit=1000, tol=0.001, 
+                      maxInvLambda=300, seedno=71371, showEst=FALSE) {
+  t0 <- Sys.time()
+  set.seed(seedno)
+  bs_set <- sample(n, replace = TRUE)
+  if (! is.null(sampleFrom))
+    colNum <- which(colnames(dframe0) == sampleFrom)
+  else
+    colNum <- 0
+  if (! is.null(sampleFrom))
+    dframe <- dframe0[which(dframe0[,colNum] %in% bs_set),]
+  else
+    dframe <- dframe0[bs_set,]
+  qremFit0 <- QREM(func, linmod, dframe, qn, err=err, maxit=maxit,
+                   tol=tol, maxInvLambda=maxInvLambda)
+  if (B == 1)
+    return(qremFit0$coef$beta)
+
+  oneIt <- as.numeric(difftime(Sys.time(), t0, units = "secs"))
+  useCores <- detectCores() - 1
+  if (showEst){
+    cat("One iteration ", ceiling(oneIt), "seconds\n")
+    cat("Estimated completion time, using", useCores, " cores >", 
+        ceiling(oneIt*ceiling(B/useCores))," seconds\n")
+  }
+  t0 <- Sys.time()
+  n_coefs <- length(qremFit0$coef$beta)
+  # Initiate cluster
+  cl <- makeCluster(useCores)
+  clusterExport(cl,varlist=c("func","linmod", "dframe0", "qn","n","QREM",
+                             "lm","lmer", "gam",
+                             "getME","colNum","fixef","ranef","seedno",
+                             "err", "maxit","tol","maxInvLambda","seedno"),
+                envir=environment())
+  QREMpar=parLapply(cl, 1:(B-1),
+                    function(repnum) {
+                      set.seed(seedno + 19 * repnum)
+                      bs_set <- sample(n, replace = TRUE)
+                      if (! is.null(sampleFrom))
+                        dframe <- dframe0[which(dframe0[,colNum] %in% bs_set),]
+                      else
+                        dframe <- dframe0[bs_set,]
+                      qremFit <- QREM(func, linmod, dframe, qn, err=err, maxit=maxit,
+                                      tol=tol, maxInvLambda=maxInvLambda)
+                      qremFit$coef$beta
+                    }
+  )
+  stopCluster(cl)
+  if (showEst)
+    cat("Actual completion time =", ceiling(as.numeric(difftime(Sys.time(), t0, units = "secs")))," seconds\n")
+  rbind(qremFit0$coef$beta, matrix(unlist(QREMpar), ncol = n_coefs, byrow = TRUE))
+}
+
+
+zalpha <- qnorm(0.025)
+
+# data from
+# ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Datasets/NHAMCS
+
+load("ERdat2006.RData")
+# pre-process the data and create the variables and the data frame
+# show some diagnostics plots and tables
+#
+y <- log(ERdat$LOV+1,base=60)
+hist(y,breaks=20)
+Month <- as.factor(ERdat$month)
+barplot(table(Month))
+DOW <- as.factor(ERdat$dow)
+barplot(table(DOW))
+Sex <- as.factor(ERdat$sex)
+levels(Sex)  <- c("F","M")
+pie(table(Sex))
+plot(y~Month)
+plot(y~DOW)
+Race <- as.factor(ERdat$race)
+barplot(table(Race))
+Race2 <- Race
+Race2[which(as.numeric(as.character(Race)) %in% c(-9,3,4,5,6))] <- 3
+Race2 <- factor(Race2)
+levels(Race2) = c("W","B","O")
+barplot(table(Race2))
+Age <- ERdat$age/100
+hist(Age)
+PayType <- as.factor(ERdat$paytype)
+# 0 = Blank, 1 = Private insurance, 2 = Medicare, 3 = Medicaid/SCHIP
+# 4 = Worker's Compensation, 5 = Self-pay, 6 = No charge/charity
+# 7 = Other, 8 = Unknown
+barplot(table(PayType))
+# combine 0,7,8, 
+PayType2 <- PayType
+PayType2[which(as.numeric(as.character(PayType)) %in% c(2,3,4))] <- 2
+PayType2[which(as.numeric(as.character(PayType)) %in% c(5))] <- 3
+PayType2[which(as.numeric(as.character(PayType)) %in% c(-9,-8,0,6,7,8))] <- 4
+PayType2 <- factor(PayType2)
+levels(PayType2) = c("Private","GovEmp","Self","Other")
+barplot(table(PayType2))
+
+Temp <- ERdat$temp/10
+hist(Temp,breaks=20)
+Pulse <- ERdat$pulse
+hist(Pulse,breaks=20)
+SBP <- ERdat$sbp
+hist(SBP,breaks=20)
+DBP <- ERdat$dbp
+hist(DBP,breaks=20)
+
+Pain <- as.factor(ERdat$pain)
+barplot(table(Pain))
+
+Residence <- as.factor(ERdat$resid)
+barplot(table(Residence))
+ArrivalMode <- as.factor(ERdat$arrmode)
+barplot(table(ArrivalMode))
+ArrivalTime <- as.factor(floor(ERdat$arrtime/100))
+ArrivalTime2 <- floor(ERdat$arrtime/100)
+ArrivalTime2[which(ArrivalTime2 >8 & ArrivalTime2 < 20)] <- "AM"
+ArrivalTime2[which(ArrivalTime2 != "AM")] <- "PM"
+barplot(table(ArrivalTime))
+ArrivalTime2 = as.factor(ArrivalTime2)
+barplot(table(ArrivalTime2))
+
+Region <- as.factor(ERdat$region)
+levels(Region) <- c("NE","MW","S","W")
+barplot(table(Region))
+Metro <- as.factor(ERdat$metro)
+levels(Metro) <- c("Yes", "No")
+barplot(table(Metro))
+Owner <- as.factor(ERdat$owner)
+barplot(table(Owner))
+HospCode <- ERdat$hosp 
+hist(HospCode, breaks=30)
+
+RecentVisit <- rep("N",length(HospCode))
+if(length(ERdat$disch7 ) == 0) { RecentVisit[which(ERdat$seen72 == 1)] <- "Y"
+} else {  RecentVisit[which(ERdat$seen72 == 1 | ERdat$disch7 ==1)] <- "Y" }
+RecentVisit <- as.factor(RecentVisit)
+table(RecentVisit)
+
+
+dframe <- data.frame(y,Sex,Month,DOW,Race2,Age,PayType2,Temp,Pulse,
+                     SBP,DBP,Pain,Residence,ArrivalMode,ArrivalTime2,
+                     Region, Metro, #Owner,
+                     HospCode, RecentVisit)
+qs <- c(seq(0.05, 0.95, by=0.05))
+# no batch effect
+linmod <- y~ Sex+Race2+Age+Region+Metro+
+  PayType2+ArrivalTime2+DOW+RecentVisit
+ncols <- nlevels(Sex)-1+nlevels(Race2)-1+1+
+  nlevels(PayType2)-1+
+  nlevels(Region)-1+nlevels(Metro)-1+nlevels(ArrivalTime2)-1+
+  nlevels(DOW)-1+nlevels(RecentVisit)-1+1
+
+res1 <- matrix(0,nrow=length(qs), ncol=2*ncols)
+qqp  <- matrix(0, nrow=length(qs), ncol=3)
+for (i in 1:length(qs)) {
+  cat(i,qs[i],"\n")
+  qremFit <- QREM(lm,linmod, dframe, qs[i])
+  varKED <- bcov(qremFit, linmod=linmod, dframe, qs[i])
+  res1[i,] <- c(as.numeric(qremFit$fitted.mod$coefficients), sqrt(diag(varKED)))
+}
+
+
+# hospital is a random effect:
+linmodrnd <- y~ Sex+Race2+Age+Region+Metro+
+  PayType2+ArrivalTime2+DOW+
+  RecentVisit+ (1|HospCode)
+ncols2 <- nlevels(Sex)-1+nlevels(Race2)-1+1+
+  nlevels(PayType2)-1+
+  nlevels(Region)-1+nlevels(Metro)-1+nlevels(ArrivalTime2)-1+
+  nlevels(DOW)-1+nlevels(RecentVisit)-1+1
+
+# set onlyEstimate = FALSE if you want to get regression coefficient estimates
+# without running the bootstrap (which takes a long time):
+res2 <- matrix(0,nrow=length(qs), ncol=2*ncols2)
+onlyEstimate <- TRUE
+if (onlyEstimate) {
+  for (i in 1:length(qs)) {
+    cat(i,qs[i],"\n")
+    qremFit <- QREM(lmer,linmodrnd, dframe, qs[i], maxit = 2000)
+    res2[i,] <- c(as.numeric(qremFit$coef$beta), rep(0,ncols2))
+  }
+} else {
+  B <- 99
+  for (i in 1:length(qs)) {
+    cat(i,qs[i],"\n")
+    bsv <-   boot.QREM(lmer, linmodrnd, dframe, qs[i], 100, #length(unique(HospCode)), 
+                       "HospCode", maxit = 2000, B=B, seedno=336621, showEst = TRUE)
+    res2[i,] <- c(colMeans(bsv), apply(bsv,2,sd))
+  }
+}
+
+#save(res1,res2,file="ERresults1120.RData")
+
+# parameter estimates with 95% confidence intervals for each predictor, by quantile
+# for the two models (with/without random effect) using smooth splines
+# names(fixef(qremFit$fitted.mod))
+varnames <- c("(Intercept)", "SexM", "Race2B", "Race2O",  "Age", "RegionMW",   
+              "RegionS", "RegionW", "MetroNo", "PayType2GovEmp", "PayType2Self", "PayType2Other" ,
+              "ArrivalTime2PM", "DOW2", "DOW3", "DOW4", "DOW5", "DOW6",      
+               "DOW7", "RecentVisitY")
+ciCols <- c("navyblue","darkred","orange") 
+sspldf=10
+for (j in 1:(ncol(res1)/2)) {
+  mm <- min(res1[,j]-abs(zalpha)*res1[,j+ncol(res1)/2], res2[,j]-abs(zalpha)*res2[,j+ncol(res2)/2])
+  mm <- mm - abs(mm)*0.1
+  MM <- max(res1[,j]+abs(zalpha)*res1[,j+ncol(res1)/2], res2[,j]+abs(zalpha)*res2[,j+ncol(res2)/2])
+  MM <- MM + abs(MM)*0.1
+  #pdf(sprintf("fig/ER%02d.pdf",j),width = 5, height = 5)
+  plot(smooth.spline(qs,res1[,j],df=sspldf),type='l', axes=F, ylim=c(mm,MM),
+       main=varnames[j], ylab="Coef.", xlab="quantile",col=ciCols[1],lwd=2)
+  axis(1,labels=seq(0,1,by=0.1), at=seq(0,1,by=.1)); axis(2)
+  lines(smooth.spline(qs,res2[,j],df=sspldf),col=ciCols[2],lwd=2,lty=2)
+  yyl <- c(res1[,j]-abs(zalpha)*res1[,j+ncol(res1)/2])
+  yyu <- c(res1[,j]+abs(zalpha)*res1[,j+ncol(res1)/2])
+  sspl <- smooth.spline(qs, yyl, df=sspldf)
+  sspu <- smooth.spline(qs, yyu, df=sspldf)
+  xx <- c(sspl$x, rev(sspu$x))
+  yy <- c(sspl$y, rev(sspu$y))
+  polygon(xx, yy, col = adjustcolor(ciCols[1], alpha.f=0.1), 
+          border = ciCols[1], lty=1)
+
+  yyl <- c(res2[,j]-abs(zalpha)*res2[,j+ncol(res2)/2])
+  yyu <- c(res2[,j]+abs(zalpha)*res2[,j+ncol(res2)/2])
+  sspl <- smooth.spline(qs, yyl, df=sspldf)
+  sspu <- smooth.spline(qs, yyu, df=sspldf)
+  xx <- c(sspl$x, rev(sspu$x))
+  yy <- c(sspl$y, rev(sspu$y))
+  polygon(xx, yy, col = adjustcolor(ciCols[2], alpha.f=0.1), 
+          border = ciCols[2], lty=1)
+
+  abline(h=0,lwd=2,col="grey66")
+  #dev.off()
+}
diff --git a/QREMVSB12.R b/QREMVSB12.R
@@ -0,0 +1,128 @@
+library(QREM)
+library(SEMMS)
+
+res <- list()
+
+qn <- 0.25 # change to 0.5 and 0.75
+maxReps <- 20
+fn <- "riboflavin1.csv"
+responseCol <- 1
+predictors <- 2:4089
+dataYXZ <- readInputFile(fn, ycol=responseCol, Zcols=predictors)
+nn <- 10
+n <- dataYXZ$N
+y0 <- dataYXZ$Y
+K <- dataYXZ$K
+# initialize:
+zval <- rep(0, K)
+for (i in seq(1,K,by=5)) {
+  if (i %% 101 == 0) { cat(i,"...\n") }
+  preds <- paste0(colnames(dataYXZ$Z)[i:min(i+4,K)], collapse = " + ")
+  linmod <- as.formula(paste("Y ~", preds))
+  dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i:min(i+4,K)]))
+  colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i:min(i+4,K)])
+  qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
+  zval[i:min(i+4,K)] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1]
+}
+nnset <- order(abs(zval),decreasing = TRUE)[1:nn]
+#cat("initial set", nnset,"\n")
+
+for (rep in 1:maxReps) {
+  # create a subset of the selected columns and run QREM
+  preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
+  linmod <- as.formula(paste("Y ~", preds))
+  dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
+  colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
+  qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
+  # apply the weights found by QREM and rerun SEMMS
+  dataYXZtmp <- dataYXZ
+  #dataYXZtmp$Z <- diag(sqrt(qremFit$weights))%*%dataYXZ$Z
+  dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)#*sqrt(qremFit$weights)
+  #plot(dataYXZ$Y, dataYXZtmp$Y); abline(0,1,col=2)
+
+  fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
+                          nnset=nnset, minchange = 1, maxst = 20)
+  foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0),
+                              fittedVSnew$gam.out$nn))
+  cat(rep,"\n",fittedVSnew$gam.out$nn,"\n",nnset,"\n", qremFit$empq,"\n\n")
+  if (length(fittedVSnew$gam.out$nn) == length(nnset)) {
+    if (all(fittedVSnew$gam.out$nn == nnset)) {
+      break
+    }
+  }
+  nnset <- fittedVSnew$gam.out$nn
+}
+
+# after the loop
+fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
+print(summary(fittedGLM$mod))
+plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="... Data")
+plotFit(fittedGLM)
+plot(y0, col=(2+(qremFit$ui>0)), cex=0.7, pch=19)
+
+if(length(nnset) > 0) {
+  for (i in 1:length(nnset)) {
+    plot(dataYXZ$Z[,nnset[i]],dataYXZ$Y, col=1+(qremFit$ui>0))
+  }
+}
+
+res[[1]] <-  list(y0, nnset, qremFit, fittedGLM)
+
+# loop over all the columns, and get the estimates for q=0.25, 0.5, 0.75
+#
+#  This part can take a while... 4088 quantile regression models with
+#    variable selection
+
+for (k in 1:K) {
+  cat(k,"...\n")
+  dataYXZ <- readInputFile(fn, ycol=k+1, Zcols=setdiff(predictors,k+1))
+  n <- dataYXZ$N
+  y0 <- dataYXZ$Y
+  K <- dataYXZ$K
+  zval <- rep(0, K)
+  for (i in seq(1,K,by=5)) {
+    if (i %% 101 == 0) { cat(i,"...\n") }
+    preds <- paste0(colnames(dataYXZ$Z)[i:min(i+4,K)], collapse = " + ")
+    linmod <- as.formula(paste("Y ~", preds))
+    dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i:min(i+4,K)]))
+    colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i:min(i+4,K)])
+    qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
+    zval[i:min(i+4,K)] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1]
+  }
+  nnset = order(abs(zval),decreasing = TRUE)[1:nn]
+  #cat("initial set", nnset,"\n")
+  for (rep in 1:maxReps) {
+    # create a subset of the selected columns and run QREM
+    preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
+    linmod <- as.formula(paste("Y ~", preds))
+    dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
+    colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
+    qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
+    # apply the weights found by QREM and rerun SEMMS
+    dataYXZtmp <- dataYXZ
+    #dataYXZtmp$Z <- diag(sqrt(qremFit$weights))%*%dataYXZ$Z
+    dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)#*sqrt(qremFit$weights)
+    fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
+                            nnset=nnset, minchange = 1, maxst = 20)
+    foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0),
+                                fittedVSnew$gam.out$nn))
+    cat(rep,"\n",fittedVSnew$gam.out$nn,"\n",nnset,"\n", qremFit$empq,"\n\n")
+    if (length(fittedVSnew$gam.out$nn) == length(nnset)) {
+      if (all(fittedVSnew$gam.out$nn == nnset)) {
+        break
+      }
+    }
+    nnset <- fittedVSnew$gam.out$nn
+    if (length(nnset) == 0)
+      break
+  }
+
+  if (length(nnset) == 0) {
+    res[[k+1]] <- list()
+    next
+  }
+  fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
+  res[[k+1]] <- list(y0, nnset, qremFit, fittedGLM)
+}
+save(res, file="B2q25.RData") # change to B2q50 and B2q75 when using different
+                              # quartiles