Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Add files via upload
  • Loading branch information
hyb13001 committed Jul 11, 2020
1 parent cf0bbb5 commit a590b6e
Show file tree
Hide file tree
Showing 3 changed files with 307 additions and 0 deletions.
129 changes: 129 additions & 0 deletions SimulationAR1.R
@@ -0,0 +1,129 @@
rm(list=ls())
library("SEMMS")
library("QREM")
library("rqPen")
library("MASS")

nn <- 5
maxRep <- 40
plots <- FALSE


simulateAR1 <- function(P, N, corcoef=0.9) {
Sigma <- toeplitz(1:P)
for (i in 2:P) {
Sigma[which(Sigma == i)] <- corcoef^(i-1)
}
mvrnorm(N, rep(0,P), Sigma)
}

confmat <- function(allidx, actual, estimated) {
TP <- which(estimated %in% actual)
FP <- which(estimated %in% setdiff(allidx,actual))
TN <- which(setdiff(allidx,estimated) %in% setdiff(allidx,actual))
FN <- which(setdiff(allidx,estimated) %in% actual)
c(length(TP), length(FP),length(TN), length(FN))
}

checkLoss <- function(ui,qn) {
ui*(qn - as.numeric(ui < 0))
}

runSEMMSandQREM <- function(filename, qn, nnset=NULL) {
dataYXZ <- readInputFile(filename, ycol=1, Zcols=2:1001)
n <- dataYXZ$N
K <- dataYXZ$K
t0=Sys.time()
if (is.null(nnset)) {
cat("initializing...\n")
zval <- rep(0, K)
rnd <- sample(K, replace = FALSE)
m <- 5
for (i in 1:(1000/m)) {
idx <- ((i-1)*m+1) : (i*m)
preds <- paste0(colnames(dataYXZ$Z)[rnd[idx]], collapse = " + ")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,rnd[idx]]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[rnd[idx]])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
zval[rnd[idx]] <- qremFit$coef$beta[-1]/sqrt(diag(bcov(qremFit,linmod,dframetmp,qn)))[-1]
}
nnset <- order(abs(zval),decreasing = TRUE)[1:nn]
}

t1=Sys.time()
cat(round(difftime(t1,t0, units = "secs")),"seconds. initial set", nnset,"\n")
ll <- 0
for (repno in 1:maxRep) {
# create a subset of the selected columns and run QREM
preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
newll <- sum(checkLoss(qremFit$ui,qn))
if (abs(ll - newll) < 1e-3) {
nnset <- sort(c(fittedVSnew$gam.out$nn, which(fittedVSnew$gam.out$lockedOut == 1 )))
cat(repno,"steps\n")
return(nnset)
}
ll <- newll
# apply the weights found by QREM and rerun SEMMS
dataYXZtmp <- dataYXZ
dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)
fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
nnset=nnset, minchange = 1, maxst = 20)
if (length(fittedVSnew$gam.out$nn) == 0) {
return(fittedVSnew$gam.out$nn)
}
t2=Sys.time()
cat(round(difftime(t2,t1,units = "secs")),"seconds.",repno,":\t",fittedVSnew$gam.out$nn,":\t", qremFit$empq,"\n")
t1=t2
nnset <- fittedVSnew$gam.out$nn
}
if (plots) {
fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
print(summary(fittedGLM$mod))
plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="...")
plotFit(fittedGLM)
plot(dataYXZ$Y, col=(2+(qremFit$ui>0)), cex=0.7, pch=19)
for (i in 2:ncol(dframetmp)) {
qrdiag <- QRdiagnostics(dframetmp[,i],colnames(dframetmp)[i],qremFit$ui,qn)
}
}
nnset <- sort(c(fittedVSnew$gam.out$nn, which(fittedVSnew$gam.out$lockedOut == 1 )))
t3=Sys.time()
cat(round(difftime(t3,t0,units = "secs")),"seconds (total). Done\n")
nnset
}

res <- matrix(0,ncol=11,nrow=100*9)
cnt <- 1
simno = 9
qns = seq(0.1,0.9,by=0.1)
for (i in 1:100) {
M0 = simulateAR1(980,100,0)
M = simulateAR1(20,100, corcoef=0.95)
truepreds = 1:20
X = cbind(M,M0)
y = rowSums(X[,1:20]+rnorm(100,0,0.1))
dframe <- as.data.frame(cbind(y,X))
datfn <- "simAR.RData"
save(dframe, file=datfn)
for (qn in qns) {
L1fit <- rq.lasso.fit(X,y,lambda=0.13,tau = qn)
selectedL1 <- which(L1fit$coefficients[-1]!=0)
if (length(selectedL1) > 2) {
selected <- runSEMMSandQREM(datfn, qn, nnset=selectedL1)
} else {
selected <- runSEMMSandQREM(datfn, qn)
}
res[cnt,] <- c(simno, i, qn, confmat(1:1000, truepreds, selected),
confmat(1:1000, truepreds, selectedL1))
cat(c(simno, i, qn, confmat(1:1000, truepreds, selected),
confmat(1:1000, truepreds, selectedL1)),"\n")
cnt <- cnt+1
}
}

save(res,file="resSim.RData")
136 changes: 136 additions & 0 deletions SimulationsTable3.R
@@ -0,0 +1,136 @@
# Simulation for the QREM+SEMMS paper
# see sims.R for simulation configurations.

# compare with mean-model (SEMMS)
# plots - after simulations, quantile functions

rm(list=ls())
library("SEMMS")
library("QREM")
source("sims.R")
nn <- 4
maxRep <- 40
plots <- FALSE
simsToRun <- 8 # see sims.R
res <- matrix(0,ncol=7,nrow=100*9*length(simsToRun))

qn <- 0.8
simno = 1
i=1

confmat <- function(allidx, actual, estimated) {
TP <- which(estimated %in% actual)
FP <- which(estimated %in% setdiff(allidx,actual))
TN <- which(setdiff(allidx,estimated) %in% setdiff(allidx,actual))
FN <- which(setdiff(allidx,estimated) %in% actual)
c(length(TP), length(FP),length(TN), length(FN))
}

runSEMMSandQREM <- function(filename, qn) {
dataYXZ <- readInputFile(filename, ycol=1, Zcols=2:501)
n <- dataYXZ$N
K <- dataYXZ$K
t0=Sys.time()
cat("initializing...\n")
pval <- rep(0, K) ### need to speed up pval initialization!
for (i in 1:K) {
linmod <- as.formula(paste("Y ~", colnames(dataYXZ$Z)[i]))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,i]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[i])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
sm <- summary(qremFit$fitted.mod)
pval[i] <- sm$coefficients[2,4]
}
nnset = order(pval)[1:nn]
t1=Sys.time()
cat(round(difftime(t1,t0, units = "secs")),"seconds. initial set", nnset,"\n")
for (rep in 1:maxRep) {
# create a subset of the selected columns and run QREM
preds <- paste(colnames(dataYXZ$Z)[nnset], collapse = "+")
linmod <- as.formula(paste("Y ~", preds))
dframetmp <- data.frame(cbind(dataYXZ$Y, dataYXZ$Z[,nnset]))
colnames(dframetmp) <- c("Y",colnames(dataYXZ$Z)[nnset])
qremFit <- QREM(lm,linmod, dframetmp, qn, maxInvLambda = 1000)
# apply the weights found by QREM and rerun SEMMS
dataYXZtmp <- dataYXZ
dataYXZtmp$Y <- (dataYXZ$Y-(1-2*qn)/qremFit$weights)
fittedVSnew <- fitSEMMS(dataYXZtmp,distribution = 'N', mincor=0.8,rnd=F,
nnset=nnset, minchange = 1, maxst = 20)
if (length(fittedVSnew$gam.out$nn) == 0) {
return(fittedVSnew$gam.out$nn)
}
#foundSEMMSnew <- sort(union(which(fittedVSnew$gam.out$lockedOut != 0),
# fittedVSnew$gam.out$nn))
t2=Sys.time()
cat(round(difftime(t2,t1,units = "secs")),"seconds.",rep,":\t",fittedVSnew$gam.out$nn,":\t", qremFit$empq,"\n")
t1=t2
if (length(fittedVSnew$gam.out$nn) == length(nnset)) {
if (all(fittedVSnew$gam.out$nn == nnset)) {
break
}
}
nnset <- fittedVSnew$gam.out$nn
}
if (plots) {
fittedGLM <- runLinearModel(dataYXZtmp,nnset, "N")
print(summary(fittedGLM$mod))
plotMDS(dataYXZ, fittedVSnew, fittedGLM, ttl="...")
plotFit(fittedGLM)
plot(dataYXZ$Y, col=(2+(qremFit$ui>0)), cex=0.7, pch=19)
for (i in 2:ncol(dframetmp)) {
qrdiag <- QRdiagnostics(dframetmp[,i],colnames(dframetmp)[i],qremFit$ui,qn)
}
}
t3=Sys.time()
cat(round(difftime(t3,t0,units = "secs")),"seconds (total). Done\n")
nnset
}

cnt <- 1
for (simno in 1:length(sims)) {
if (simno %in% simsToRun) {
cat("\nSim. #",simno,"\n")
n <- sims[[simno]]$n
qns <- sims[[simno]]$qns
reps <- sims[[simno]]$reps
coefs <- sims[[simno]]$coefs
lp <- as.list(attr(terms(sims[[simno]]$mod), "variables"))[-(1:2)]
truepreds = rep(0,length(lp))
for(i in 1:length(lp)) {
truepreds[i] = gsub("[a-zA-Z]","",lp[[i]], perl=TRUE)
}
truepreds <- as.numeric(truepreds)

xrng <- matrix(sims[[simno]]$xrng, nrow=(length(coefs)-1), ncol=2)
for (i in 1:reps) {
set.seed(simno*100000 + sims[[simno]]$seed + i)
X <- matrix(0,nrow=n, ncol=length(coefs))
X[,1] <- rep(1,n)
for (jj in 1:(length(coefs)-1)) {
X[,jj+1] <- runif(n, min=xrng[jj,1], max=xrng[jj,2])
}
colnames(X) <- c("const", paste("X",1:(length(coefs)-1),sep=""))
if (sims[[simno]]$errvar == 0) {
errs <- sims[[simno]]$err(rep(0,n))
} else {
if (length(sims[[simno]]$errvar) == 2) {
errs <- sims[[simno]]$err(X[,sims[[simno]]$errvar[1]+1], X[,sims[[simno]]$errvar[2]+1])
} else {
errs <- sims[[simno]]$err(X[,sims[[simno]]$errvar+1])
}
}
y <- X%*%coefs + errs
dframe <- data.frame(y,X[,-1], matrix(rnorm((500-ncol(X)+1)*n,0,0.1),
nrow=n, ncol=(500-ncol(X)+1)))
datfn <- sprintf("data/sim%02d.RData",simno)
save(dframe, file=datfn)
for (qn in qns) {
selected <- runSEMMSandQREM(datfn, qn)
res[cnt,] <- c(simno, i, qn, confmat(1:500, truepreds, selected))
cat(c(simno, i, qn, confmat(1:500, truepreds, selected)),"\n")
cnt <- cnt+1
}
}
save(res,file=sprintf("results/resSim%02d.RData",simno))
}
}
42 changes: 42 additions & 0 deletions sims.R
@@ -0,0 +1,42 @@
sims <- list(
# 1. intercept only, but increasing variance
list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)},
errvar=1, mod=y~X1, coefs=c(3,0),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1))),
# 2. SLR, constant variance
list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.3)},
errvar=0, mod=y~X1, coefs=c(5,-1),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1))),
# 3. SLR, increasing variance
list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)},
errvar=1, mod=y~X1, coefs=c(5,-1),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1))),
# 4. multiple variables, constant variance
list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1)},
errvar=0, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))),
# 5. multiple variables, non-constant variance:
list(seed=11001, n=200, err=function(x) {rnorm(length(x), 0, 0.1+x)},
errvar=1, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))),
# 6. variance depends on x1, x2
list(seed=11001, n=200, err=function(x1, x2) {rnorm(length(x1), 0, 0.1+x1+1.3*x2)},
errvar=c(1,2), mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))),
# 7. non-normal errors
list(seed=11001, n=200, err=function(x) {rlnorm(length(x), 0, 0.75)},
errvar=0, mod=y~X1+X2+X3+X4+X5, coefs=c(1,-3,2,2,-1,-2),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1),c(0,1),c(0,1),c(0,1),c(0,1))),
# 8. non-normal errors which depend on a variable
list(seed=11001, n=200, err=function(x) {rlnorm(length(x), 0, 0.25+0.5*x)},
errvar=1, mod=y~X1, coefs=c(2, -2),
qns=seq(0.1,0.9,by=0.1),
B=0, reps=100, xrng=rbind(c(0,1)))
)

0 comments on commit a590b6e

Please sign in to comment.