library(MASS) ####### PART 1 - consistency of Information Criteria library(leaps) simulate <- function(seed,nSam,sigma,SigmaMatrix){ #set random seed set.seed(seed) nEff <- 4 X <- mvrnorm(n=nSam, mu = rep(0,6), SigmaMatrix) X1 <- X[,1:4] X2 <- X[,5:6] #generate y according to known true linear model y <- 2 + X1 %*% c(10,8,6,4) + sigma*rnorm(nSam,0,1) myData <- data.frame(y,X1[,1],X1[,2],X1[,3],X1[,4],X2[,1],X2[,2]) b <- regsubsets(y ~ ., data = myData) rs <- summary(b) rs$which numParams <- 2:7 AIC <- nSam*log(rs$rss/nSam) + (numParams)*2 AICc<- AIC + (2*(numParams)*(1 + numParams))/(nSam - (numParams)-1) BIC <- nSam*log(rs$rss/nSam) + log(nSam)*(numParams) return(c(which.min(AIC),which.min(AICc),which.min(BIC))) } nSam <- 2000 sigma <- 1 #generate random covariance matrix R <- matrix(runif(36,-1,1),6,6) SigmaMatrix <- diag(c(1,1,1,1,1,1))+ 0.1*R%*%t(R) numRegs <- sapply(1:1000,function(seed) simulate(seed,nSam,sigma,SigmaMatrix)) sapply(1:8,function(a) mean(numRegs[1,] == a))#AIC sapply(1:8,function(a) mean(numRegs[2,] == a))#AICc sapply(1:8,function(a) mean(numRegs[3,] == a))#BIC ################ THE END OF PART 1 ################### ################ PART 2 - validity of p-values POST model selection ################ FIRST EXERCISE library(leaps) simulateCI1 <- function(seed,nSam,sigma,SigmaMatrix){ #set random seed set.seed(seed) nEff <- 4 X <- mvrnorm(n=nSam, mu = rep(0,8), SigmaMatrix) X1 <- X[,1:4] X2 <- X[,5:8] #generate y according to known true linear model y <- X1 %*% c(1,1,1,1) + sigma*rnorm(nSam,0,1) myData <- data.frame(y,X1[,1],X1[,2],X1[,3],X1[,4],X2[,1],X2[,2],X2[,3],X2[,4]) b <- regsubsets(y ~ ., data = myData, int=FALSE) rs <- summary(b) AIC <- nSam*log(rs$rss/nSam) + (1:8)*2 #this is the best regression according to AIC lmod_best <- lm(y ~ cbind(X1,X2)[,rs$which[which.min(AIC),]]) confint(lmod_best)[2,] if (rs$which[which.min(AIC),1]==TRUE){ ci = confint(lmod_best)[1,] } else { ci = c(0,0) } #this is the correct model (for comparison) lmod_truemod <- lm(y ~ X1) ci_truemod <- confint(lmod_truemod)[2,] return(cbind(ci,ci_truemod)) } nSam <- 50 #make this small enough to illustrate the point sigma <- 12 #make this large enough to illustrate the point #generate random covariance matrix R <- matrix(runif(64,-1,1),8,8) SigmaMatrix <- diag(1:8)+R%*%t(R) #BAM, run the regression ciReg <- sapply(1:1000,function(seed) simulateCI1(seed,nSam,sigma,SigmaMatrix)) ciRegMod <- ciReg[,which(apply(ciReg[1:2,],2,sum)!=0)] #empirical coverage of the model selection based estimator mean((ciRegMod[1,]<1)*(ciRegMod[2,]>1)) #empirical coverage of the CI from the correct model mean((ciReg[3,]<1)*(ciReg[4,]>1)) ################ THE END OF PART 2 ################### ######### PART 3 - validity of p=values POST model selection ######### SECOND EXERCISE #data-generating process taken from http://eranraviv.com/advances-post-model-selection-inference/ set.seed(100) tempcor <- NULL mod <- NULL repp=1000 bet0 <- bet1 <- NULL TT <- 50 sigma <- 1 ci0 <- matrix(0,1000,2) ci1 <- matrix(0,1000,2) # We choose from three options: only x1 # only x2 and x1+x2, according to AIC criterion for (i in 1:repp){ eps <- rnorm(TT) x1 <- rnorm(TT,3,1) x2 <- x1 + sigma*rnorm(TT,0,1) #X <- mvrnorm(n=TT, rep(0,2),matrix(c(2,0.9,0.9,2),2,2)) #x1 <- X[,1] #x2 <- X[,2] y <- x1 +eps tempcor[i] <- cor(x1,x2) lm0 <- lm(y~x1+x2) ci0[i,] <- confint(lm0)[2,] bet0[i] <- summary(lm0)$coef[2,1] AIC0 <- AIC(lm0) lm1 <- lm(y~x1) bet1[i] <- summary(lm1)$coef[2,1] ci1[i,] <- confint(lm1)[2,] AIC1 <- AIC(lm1) AIC2 <- AIC(lm(y~x2)) mod[i] <- which.min(c(AIC0,AIC1,AIC2)) } mean((ci0[,1]<1)*(ci0[,2]>1)) mean((ci1[,1]<1)*(ci1[,2]>1)) ci3 <- (mod==1)*ci0 + (mod==2)*ci1 ci3 <- ci3[which(apply(ci3,1,sum)!=0),] mean((ci3[,1]<1)*(ci3[,2]>1)) mean(ci0[,2]-ci0[,1]) #large model mean(ci1[,2]-ci1[,1]) #small model mean(ci3[,2]-ci3[,1]) #model based on AIC ################ THE END OF PART 3 ###################