From a590b6e86e24101906ddba85febdaea85c5d7143 Mon Sep 17 00:00:00 2001 From: Haim Bar Date: Fri, 10 Jul 2020 21:07:53 -0400 Subject: [PATCH] Add files via upload --- SimulationAR1.R | 129 +++++++++++++++++++++++++++++++++++++++++ SimulationsTable3.R | 136 ++++++++++++++++++++++++++++++++++++++++++++ sims.R | 42 ++++++++++++++ 3 files changed, 307 insertions(+) create mode 100644 SimulationAR1.R create mode 100644 SimulationsTable3.R create mode 100644 sims.R diff --git a/SimulationAR1.R b/SimulationAR1.R new file mode 100644 index 0000000..b41c9d3 --- /dev/null +++ b/SimulationAR1.R @@ -0,0 +1,129 @@ +rm(list=ls()) +library("SEMMS") +library("QREM") +library("rqPen") +library("MASS") + +nn <- 5 +maxRep <- 40 +plots <- FALSE + + +simulateAR1 <- function(P, N, corcoef=0.9) { + Sigma <- toeplitz(1:P) + for (i in 2:P) { + Sigma[which(Sigma == i)] <- corcoef^(i-1) + } + mvrnorm(N, rep(0,P), Sigma) +} + +confmat <- function(allidx, actual, estimated) { + TP <- which(estimated %in% actual) + FP <- which(estimated %in% setdiff(allidx,actual)) + TN <- which(setdiff(allidx,estimated) %in% setdiff(allidx,actual)) + FN <- which(setdiff(allidx,estimated) %in% actual) + c(length(TP), length(FP),length(TN), length(FN)) +} + +checkLoss <- function(ui,qn) { + ui*(qn - as.numeric(ui < 0)) +} + +runSEMMSandQREM <- function(filename, qn, nnset=NULL) { + dataYXZ <- readInputFile(filename, ycol=1, Zcols=2:1001) + n <- dataYXZ$N + K <- dataYXZ$K + t0=Sys.time() + if (is.null(nnset)) { + cat("initializing...\n") + zval <- rep(0, K) + rnd <- sample(K, replace = FALSE) + m <- 5 + for (i in 1:(1000/m)) { + idx <- ((i-1)*m+1) : (i*m) + preds <- paste0(colnames(dataYXZ$Z)[rnd[idx]], collapse = " + ") + linmod <- as.formula(paste("Y ~", preds)) + dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,rnd[idx]])) + colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[rnd[idx]]) + qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000) + zval[rnd[idx]] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1] + } + nnset <- order(abs(zval),decreasing = TRUE)[1:nn] + } + + t1=Sys.time() + cat(round(difftime(t1,t0, units = "secs")),"seconds. initial set", nnset,"\n") + ll <- 0 + for (repno in 1:maxRep) { + # create a subset of the selected columns and run QREM + preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+") + linmod <- as.formula(paste("Y ~", preds)) + dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset])) + colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset]) + qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000) + newll <- sum(checkLoss(qremFit$ui,qn)) + if (abs(ll - newll) < 1e-3) { + nnset <- sort(c(fittedVSnew$gam.out$nn, which(fittedVSnew$gam.out$lockedOut == 1 ))) + cat(repno,"steps\n") + return(nnset) + } + ll <- newll + # apply the weights found by QREM and rerun SEMMS + dataYXZtmp <- dataYXZ + dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights) + fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F, + nnset=nnset, minchange = 1, maxst = 20) + if (length(fittedVSnew$gam.out$nn) == 0) { + return(fittedVSnew$gam.out$nn) + } + t2=Sys.time() + cat(round(difftime(t2,t1,units = "secs")),"seconds.",repno,":\t",fittedVSnew$gam.out$nn,":\t", qremFit$empq,"\n") + t1=t2 + nnset <- fittedVSnew$gam.out$nn + } + if (plots) { + fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N") + print(summary(fittedGLM$mod)) + plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="...") + plotFit(fittedGLM) + plot(dataYXZ$Y, col=(2+(qremFit$ui>0)), cex=0.7, pch=19) + for (i in 2:ncol(dframetmp)) { + qrdiag <- QRdiagnostics(dframetmp[,i],colnames(dframetmp)[i],qremFit$ui,qn) + } + } + nnset <- sort(c(fittedVSnew$gam.out$nn, which(fittedVSnew$gam.out$lockedOut == 1 ))) + t3=Sys.time() + cat(round(difftime(t3,t0,units = "secs")),"seconds (total). Done\n") + nnset +} + +res <- matrix(0,ncol=11,nrow=100*9) +cnt <- 1 +simno = 9 +qns = seq(0.1,0.9,by=0.1) +for (i in 1:100) { + M0 = simulateAR1(980,100,0) + M = simulateAR1(20,100, corcoef=0.95) + truepreds = 1:20 + X = cbind(M,M0) + y = rowSums(X[,1:20]+rnorm(100,0,0.1)) + dframe <- as.data.frame(cbind(y,X)) + datfn <- "simAR.RData" + save(dframe, file=datfn) + for (qn in qns) { + L1fit <- rq.lasso.fit(X,y,lambda=0.13,tau = qn) + selectedL1 <- which(L1fit$coefficients[-1]!=0) + if (length(selectedL1) > 2) { + selected <- runSEMMSandQREM(datfn, qn, nnset=selectedL1) + } else { + selected <- runSEMMSandQREM(datfn, qn) + } + res[cnt,] <- c(simno, i, qn, confmat(1:1000, truepreds, selected), + confmat(1:1000, truepreds, selectedL1)) + cat(c(simno, i, qn, confmat(1:1000, truepreds, selected), + confmat(1:1000, truepreds, selectedL1)),"\n") + cnt <- cnt+1 + } +} + +save(res,file="resSim.RData") diff --git a/SimulationsTable3.R b/SimulationsTable3.R new file mode 100644 index 0000000..e0cbc1b --- /dev/null +++ b/SimulationsTable3.R @@ -0,0 +1,136 @@ +# Simulation for the QREM+SEMMS paper +# see sims.R for simulation configurations. + +# compare with mean-model (SEMMS) +# plots - after simulations, quantile functions + +rm(list=ls()) +library("SEMMS") +library("QREM") +source("sims.R") +nn <- 4 +maxRep <- 40 +plots <- FALSE +simsToRun <- 8 # see sims.R +res <- matrix(0,ncol=7,nrow=100*9*length(simsToRun)) + +qn <- 0.8 +simno = 1 +i=1 + +confmat <- function(allidx, actual, estimated) { + TP <- which(estimated %in% actual) + FP <- which(estimated %in% setdiff(allidx,actual)) + TN <- which(setdiff(allidx,estimated) %in% setdiff(allidx,actual)) + FN <- which(setdiff(allidx,estimated) %in% actual) + c(length(TP), length(FP),length(TN), length(FN)) +} + +runSEMMSandQREM <- function(filename, qn) { + dataYXZ <- readInputFile(filename, ycol=1, Zcols=2:501) + n <- dataYXZ$N + K <- dataYXZ$K + t0=Sys.time() + cat("initializing...\n") + pval <- rep(0, K) ### need to speed up pval initialization! + for (i in 1:K) { + linmod <- as.formula(paste("Y ~", colnames(dataYXZ$Z)[i])) + dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i])) + colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i]) + qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000) + sm <- summary(qremFit$fitted.mod) + pval[i] <- sm$coefficients[2,4] + } + nnset = order(pval)[1:nn] + t1=Sys.time() + cat(round(difftime(t1,t0, units = "secs")),"seconds. initial set", nnset,"\n") + for (rep in 1:maxRep) { + # create a subset of the selected columns and run QREM + preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+") + linmod <- as.formula(paste("Y ~", preds)) + dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset])) + colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset]) + qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000) + # apply the weights found by QREM and rerun SEMMS + dataYXZtmp <- dataYXZ + dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights) + fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F, + nnset=nnset, minchange = 1, maxst = 20) + if (length(fittedVSnew$gam.out$nn) == 0) { + return(fittedVSnew$gam.out$nn) + } + #foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0), + # fittedVSnew$gam.out$nn)) + t2=Sys.time() + cat(round(difftime(t2,t1,units = "secs")),"seconds.",rep,":\t",fittedVSnew$gam.out$nn,":\t", qremFit$empq,"\n") + t1=t2 + if (length(fittedVSnew$gam.out$nn) == length(nnset)) { + if (all(fittedVSnew$gam.out$nn == nnset)) { + break + } + } + nnset <- fittedVSnew$gam.out$nn + } + if (plots) { + fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N") + print(summary(fittedGLM$mod)) + plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="...") + plotFit(fittedGLM) + plot(dataYXZ$Y, col=(2+(qremFit$ui>0)), cex=0.7, pch=19) + for (i in 2:ncol(dframetmp)) { + qrdiag <- QRdiagnostics(dframetmp[,i],colnames(dframetmp)[i],qremFit$ui,qn) + } + } + t3=Sys.time() + cat(round(difftime(t3,t0,units = "secs")),"seconds (total). Done\n") + nnset +} + +cnt <- 1 +for (simno in 1:length(sims)) { + if (simno %in% simsToRun) { + cat("\nSim. #",simno,"\n") + n <- sims[[simno]]$n + qns <- sims[[simno]]$qns + reps <- sims[[simno]]$reps + coefs <- sims[[simno]]$coefs + lp <- as.list(attr(terms(sims[[simno]]$mod), "variables"))[-(1:2)] + truepreds = rep(0,length(lp)) + for(i in 1:length(lp)) { + truepreds[i] = gsub("[a-zA-Z]","",lp[[i]], perl=TRUE) + } + truepreds <- as.numeric(truepreds) + + xrng <- matrix(sims[[simno]]$xrng, nrow=(length(coefs)-1), ncol=2) + for (i in 1:reps) { + set.seed(simno*100000 + sims[[simno]]$seed + i) + X <- matrix(0,nrow=n, ncol=length(coefs)) + X[,1] <- rep(1,n) + for (jj in 1:(length(coefs)-1)) { + X[,jj+1] <- runif(n, min=xrng[jj,1], max=xrng[jj,2]) + } + colnames(X) <- c("const", paste("X",1:(length(coefs)-1),sep="")) + if (sims[[simno]]$errvar == 0) { + errs <- sims[[simno]]$err(rep(0,n)) + } else { + if (length(sims[[simno]]$errvar) == 2) { + errs <- sims[[simno]]$err(X[,sims[[simno]]$errvar[1]+1], X[,sims[[simno]]$errvar[2]+1]) + } else { + errs <- sims[[simno]]$err(X[,sims[[simno]]$errvar+1]) + } + } + y <- X%*%coefs + errs + dframe <- data.frame(y,X[,-1], matrix(rnorm((500-ncol(X)+1)*n,0,0.1), + nrow=n, ncol=(500-ncol(X)+1))) + datfn <- sprintf("data/sim%02d.RData",simno) + save(dframe, file=datfn) + for (qn in qns) { + selected <- runSEMMSandQREM(datfn, qn) + res[cnt,] <- c(simno, i, qn, confmat(1:500, truepreds, selected)) + cat(c(simno, i, qn, confmat(1:500, truepreds, selected)),"\n") + cnt <- cnt+1 + } + } + save(res,file=sprintf("results/resSim%02d.RData",simno)) + } +} diff --git a/sims.R b/sims.R new file mode 100644 index 0000000..8f61754 --- /dev/null +++ b/sims.R @@ -0,0 +1,42 @@ +sims <- list( + # 1. intercept only, but increasing variance + list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)}, + errvar=1, mod=y~X1, coefs=c(3,0), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1))), + # 2. SLR, constant variance + list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.3)}, + errvar=0, mod=y~X1, coefs=c(5,-1), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1))), + # 3. SLR, increasing variance + list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)}, + errvar=1, mod=y~X1, coefs=c(5,-1), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1))), + # 4. multiple variables, constant variance + list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1)}, + errvar=0, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))), + # 5. multiple variables, non-constant variance: + list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)}, + errvar=1, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))), + # 6. variance depends on x1, x2 + list(seed=11001, n=200, err=function(x1, x2) {rnorm(length(x1), 0, 0.1+x1+1.3*x2)}, + errvar=c(1,2), mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))), + # 7. non-normal errors + list(seed=11001, n=200, err=function(x) {rlnorm(length(x), 0, 0.75)}, + errvar=0, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))), + # 8. non-normal errors which depend on a variable + list(seed=11001, n=200, err=function(x) {rlnorm(length(x), 0, 0.25+0.5*x)}, + errvar=1, mod=y~X1, coefs=c(2, -2), + qns=seq(0.1,0.9,by=0.1), + B=0, reps=100, xrng=rbind(c(0,1))) +)