R-code for each method

Sequence as in the appendix to the paper:

Identifying clusters
1. PCA-based clustering
2. cluster analysis
3. iVIF

Cluster-independent methods
4. select07
5. seqreg

Latent variable methods
6. PCR
7. PLS
8. PPLS
9. CPCA
10. LRR
11. DR

Robust methods
12. ridge
13. LASSO
14. OSCAR
15. BRT
16. randomForest
17. SVM
18. MARS
19. CWR

In our analysis, BIC was usually better than AIC. We hence present the code for BIC and put a note in where the BIC was implemented, and hence where to change the code to get the AIC instead. NOTE that BIC is here implemented through the penalisation term "k", which involves counting the number of cases. This seemingly trivial task needs to be done differently for different error distributions, since we are talking about the "limiting sample size"!
For normal: k=log(N)
For binary: k=log(min(table(N)))
See Harrell (2001, page 61) for further details!


For each method, we first fit the model, then use a test data set for prediction, and finally correlate test-predictions with a test realisation. When we encapsulate the function in a "system.time", this indicates that the runtime exceeds a few seconds or minutes. The time given is based on an 2 GHz processor.

[Some functions can give interesting/annoying amount of detail. We set here all "verbose" and "trace" and "silent" switch to report maximum detail during the fitting. This may make the methods a bit more transparent for the novice.
When an error message occurs, you can use "traceback()" to find out in which function call it happened.]

#-------------------------------------------------------------------------------
# The various functions we wrote for some parts of some methods will be loaded first. Packages required by a specific method will be loaded when the function is called.
setwd("D:/Data/aktuell/2007_DFG_collinearityWS/AA_paper&appendix/") # wherever you put the files
source("COLL_allfunctions.r") 
# generate an example data set:
dats1 <- collineariser()      # using all the defaults
train <- as.data.frame(dats1$train)          # training data
test.none <- as.data.frame(dats1$test.none)  # test data with no collinearity
# We could visualise the correlation structure by an image plot of the variables' correlation:
image(cor(train[,-1])) # note that image is turned by 90∞ (see ?image)
image(cor(train[,-1])[,21:1]) # that's better!

#-------------------------------------------------------------------------------
# 0. GLM and GAM for reference only:
source("stepAICc.r") # Christoph Scherber's stepAICc-code!
f <- formula.maker(train, quadratic=TRUE, interactions=TRUE)
# The AIC-based model (AICc-based backward stepwise model selection):
system.time(f.glm <- stepAICc(glm(f, data=train), direction="backward", k=2, trace=FALSE)) # takes approx. 1 1/2 hours
# Refinement to BIC:
system.time(f.glm.bic <- step(f.glm, k=log(nrow(train)))) #takes 1 minute
cor(predict(f.glm.bic, newdata=test.none), test.none[,1])

# GAM (with cubic splines and shrinkage; see ?step.gam for details):
require(mgcv)
f <- as.formula(paste("train[,1] ~ ", paste("s(", colnames(train[,-1]), ", bs='cs')", collapse="+", sep="")))
f.gam <- gam(f, data=train, family=gaussian, gamma=1) # 20 secs
cor(predict(f.gam, newdata=test.none), test.none[,1])

#-------------------------------------------------------------------------------
# 1. PCA-based clustering

clust.pca.forward <- PCAclusteringforward(train[,-1], threshold=0.32) # yields clusters
train.pca <- CollClustReduction(X=train[,-1], y=train[,1], clusters=clust.pca.forward, varselect="centred") # processes clusters with one of three options ("varselect")
f <- formula.maker(train.pca, quadratic=TRUE, interaction=TRUE) # makes a formula for the lm
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
f.pca.cent <- step(lm(f, data=train), k=log(nrow(train)), trace=FALSE) # runs the lm     
cor(predict(f.pca.cent, newdata=test.none), test.none[,1]) # correlation between train and a test realisation

#-------------------------------------------------------------------------------
# 2. cluster analysis
# Here you have several options as to how to compute the cluster; check ?varclus for details!
clust.clust <- varcluster(train[,-1])   
train.clust <- CollClustReduction(X=train[,-1], y=train[,1], clusters=clust.clust, varselect="centred")
f <- formula.maker(train.clust, quadratic=TRUE, interaction=TRUE)
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
f.clust.cent <- step(lm(f, data=train), k=log(nrow(train)), trace=FALSE)      
cor(predict(f.clust.cent, newdata=test.none), test.none[,1]) 

#-------------------------------------------------------------------------------
# 3. iVIF
ivif.clust <- ivif(x=train[,-1], y=train[,1])$names.grouped 
train.clust <- CollClustReduction(X=train[,-1], y=train[,1], clusters=ivif.clust, varselect="centred")
f <- formula.maker(train.clust, quadratic=TRUE, interaction=TRUE)
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
f.ivif.cent <- step(lm(f, data=train), k=log(nrow(train)), trace=FALSE)      
cor(predict(f.ivif.cent, newdata=test.none), test.none[,1]) 

#-------------------------------------------------------------------------------
# 4. select07/select04
# select07 and select04 differ only in the value for "threshold". This is select07:
newtrain <- select07(X=train[,-1], y=train[,1], family="gaussian", univar="gam", threshold=0.7, method="pearson")
newtrain <- cbind.data.frame(y=train[,1], newtrain)
f <- formula.maker(newtrain, quadratic=TRUE, interactions=TRUE)
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
f.07 <- step(lm(f, data=newtrain), direction="backward", k=log(nrow(train)), trace=TRUE)
cor(predict(f.07, newdata=test.none), test.none[,1]) 

#-------------------------------------------------------------------------------
# 5. seqreg
# NOTE: Sometimes step.seqreg caused an error. This was only extremely rarely the case, and not in the final set of 4000 runs.
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
system.time(seqreg.obj <- step.seqreg(data=train, y.col=1, k=log(nrow(train)), family="gaussian", univar="gam", sequence=NULL, quadratic=TRUE, interactions=TRUE))  # takes quite a while: 
# based on this sequence, we also have to process the data to predict to:
newtest.none <- predict.seqreg(seqreg.obj, newdata=test.none)
cor(predict(seqreg.obj$model, newdata=newtest.none), test.none[,1]) 

#-------------------------------------------------------------------------------
# 6. PCR
pca <- prcomp(train[,-1], scale=TRUE)
pca.data <- cbind.data.frame(y=train[,1], pca$x)
f <- formula.maker(pca.data, quadratic=TRUE, interactions=TRUE)
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
f.pca <- step(lm(formula=f, data=as.data.frame(pca.data)), k=log(nrow(train)), trace=TRUE)
newtest.none <- as.data.frame(predict(pca, newdata=test.none))
cor(predict(f.pca, newdata=newtest.none), test.none[,1]) 

#-------------------------------------------------------------------------------
# 7. CPCA
# CPCA does not allow for an easy integration of interactions. Here, we only use quadratic effects:
trainquad <- cbind(train, train[,-1]^2)
test.nonequad <- cbind(test.none, test.none[,-1]^2)   
# Here we decide between AIC (and BIC).
alpha.aic <- round(optimise(CPCA.fit, c(0,1), maximum=TRUE, method="AIC", data=trainquad, thedata=trainquad)[[1]], 2)
fm.cpca.aic <- train.cpca(trainquad, alpha=alpha.aic)
# this step in the analysis reduces the number of components:
fit <- NS <- 1:ncol(fm.cpca.aic$coef)
for (i in NS){
    pred.train.cpca.aic <- as.matrix(trainquad[,-1]) %*% fm.cpca.aic$coef[,i]  
    fit[i] <- cor(pred.train.cpca.aic, trainquad[,1])^2 
}
n.comp.bic <- which.min(AIC.r2(fit, 100, NS))
# Finally, we run the real CPCA:
f.cpca.bic <- train.cpca(trainquad, alpha=alpha.bic, comps=n.comp.bic)
cor(pred.cpca(f.cpca.bic, newdata=test.nonequad), test.nonequad[,1]) 

#-------------------------------------------------------------------------------
# 8. PLS
trainquad <- cbind(train, train[,-1]^2)
test.nonequad <- cbind(test.none, test.none[,-1]^2)  
f.pls <- train.pls(trainquad, scale=TRUE)
# this step in the analysis reduces the number of components: (see ppls for comments)
fit <- NS <- 1:ncol(f.pls$pls$coefficients)
for (i in NS){
   pred.train.pls <- as.matrix(trainquad[,-1]) %*% f.pls$pls$coefficients[,i]  
   fit[i] <- cor(pred.train.pls, trainquad[,1])^2 
}
# Here we select the model with the lowest AIC. Use BIC.r2 if you want BIC.
n.comp <- which.min(AICc.r2(fit, 100, NS))      
f.pls$comps <- n.comp
cor(pred.pls(f.pls, newdata=test.nonequad), test.nonequad[,1]) 

#-------------------------------------------------------------------------------
# 9. PPLS
trainquad <- cbind(train, train[,-1]^2)
test.nonequad <- cbind(test.none, test.none[,-1]^2)  
#optimisation, grid search (We "only" scan 420 value combinations. In a single analysis, one should invest a bit more time here and run thousands of values, or even "optim"):
vals <- expand.grid(nk=1:20, lambda=seq(0, 100, by=5))
ores <- apply(vals, 1, pplsoptim, train.data=trainquad)  #  207s
bestparms <- as.numeric(vals[which.max(ores),])
# now, using the best settings, a ppls:
f.ppls <- train.ppls(trainquad, nk=bestparms[1], lambda=bestparms[2])  
# this step in the analysis reduces the number of components:
fit <- NS <- 1:ncol(f.ppls$ppls$coefficients)
for (i in NS){
   pred.train.ppls <- f.ppls$transf.data %*% f.ppls$ppls$coefficients[,i]  # calculates predicted values for train data set
   fit[i] <- cor(pred.train.ppls, trainquad[,1])^2 # calculates R2 for train data set
}
# Here we select the model with the lowest AIC. Use BIC.r2 if you want BIC.
n.comp <- which.min(AICc.r2(fit, 100, NS)) 
f.ppls$comps <- n.comp # derived number of components is stored in the model object
cor(pred.ppls(f.ppls, newdata=test.nonequad), test.nonequad[,1]) 

#-------------------------------------------------------------------------------
# 10. LRR
trainquad <- cbind(train, train[,-1]^2)
test.nonequad <- cbind(test.none, test.none[,-1]^2)  
# (There is no AIC/BIC selection implemented here.)
f.lrr <- train.lrr(trainquad)
cor(pred.lrr(f.lrr, newdata=test.nonequad), test.nonequad[,1]) 

#-------------------------------------------------------------------------------
# 11. DR
trainquad <- cbind(train, train[,-1]^2)
test.nonequad <- cbind(test.none, test.none[,-1]^2)  
# (There is no AIC/BIC selection implemented here.)
strain <- dr(y ~ ., data=trainquad, slice.function=dr.slices.arc, nslices=8, chi2approx="wood", method="sir") 
ndim <- which.max(summary(strain)$test[[3]] >= 0.05) - 1
# adapt the data to dr-lm:
traindata <- data.frame(y = trainquad[,1], dr.direction.default.br(strain, 1:ndim))
# run a lm on the dr-components in order to get a function to do the predictions:
f.dr <- lm(formula(paste("y ~", paste("Dir", seq(ndim), sep="", collapse=" + "))), data=traindata)
newtest.nonequad <- as.data.frame(dr.direction.default.br(strain, 1:ndim, x=as.matrix(test.nonequad[,-1])))
cor(predict(f.dr, newdata=newtest.nonequad), test.none[,1])

#-------------------------------------------------------------------------------
# 12. ridge 
f <- as.formula(paste("~(", paste(colnames(train[,-1]), collapse="+"),")^2 + ", paste("I(",colnames(train[,-1]), "^2)", collapse="+", sep=""), sep="" ))     
# first, we run an optimiser for the L2-norm (see ?optL2 for details):
ridgesets <- optL2(response=train[,1], penalized=f, minlambda2=1e-8, maxlambda2=100, model="linear", fold = 10, standardize=TRUE, trace=TRUE, data=train) 
# now we run the ridge model itself:
# (There is no AIC/BIC selection implemented here.)
f.ridge <- penalized(response=train[,1], penalized=f, lambda2=ridgesets$lambda, model="linear", data=train)
cor(predict(f.ridge, model.matrix(f, test.none)[,-1])[,1], test.none[,1]) 

#-------------------------------------------------------------------------------
# 13. LASSO
f <- as.formula(paste("~(", paste(colnames(train[,-1]), collapse="+"),")^2 + ", paste("I(",colnames(train[,-1]), "^2)", collapse="+", sep=""), sep="" ))     
# first, we run an optimiser for the L1-norm (see ?optL1 for details):
lassosets <- optL1(response=train[,1], penalized=f, minlambda1=0, maxlambda1=100, model="linear", fold=10, standardize=TRUE, trace=TRUE, data=train) # optimise parameters
# now we run the ridge model itself:
# (There is no AIC/BIC selection implemented here.)
f.lasso <- penalized(response=train[,1], penalized=f, lambda1=lassosets$lambda, model="linear", data=train)     
cor(predict(f.lasso, model.matrix(f, test.none)[,-1])[,1], test.none[,1]) 

#-------------------------------------------------------------------------------
# 14. OSCAR
# OSCAR requires MATLAB to run on you computer!
source("OSCAR_functions.r")


#-------------------------------------------------------------------------------
# 15. BRT
source("brtfunctions.r")
train2 <- train  # BRT deletes the file somewhere!
f.brt <- gbm.step(data=train2, gbm.x = 2:22, gbm.y = 1, family = "gaussian",  tree.complexity = 5, learning.rate = 0.01, bag.fraction = 0.5, verbose=TRUE, silent=FALSE, plot.main=TRUE)
cor(gbm.predict.grids(f.brt, new.dat=test.none, eval.data, want.grids = FALSE, sp.name = "preds"), test.none[,1])

#-------------------------------------------------------------------------------
# 16. randomForest
f.rf <- randomForest(x=train[,-1], y=train[,1], ntree=1000, mtry=20)
cor(predict(f.rf, newdata=test.none[,-1]), test.none[,1])

#-------------------------------------------------------------------------------
# 17. SVM
f.svm <- svm(x=train[,-1], y=train[,1], type="nu-regression")
cor(predict(f.svm, newdata=test.none[,-1]), test.none[,1])

#-------------------------------------------------------------------------------
# 18. MARS
source("marsHelpers.r")
f.mars <- mars.glm(data=train, mars.x=c(2:22), mars.y=1, family="gaussian", mars.degree=1, verbose=TRUE)
cor(mars.predict(f.mars, new.data=test.none)[[1]], test.none[,1])

#-------------------------------------------------------------------------------
# 19. CWR
W <- giveWeightsVif(X=as.matrix(train))#, expon=2) # takes quite a while!
f <- formula.maker(train, quadratic=TRUE, interaction=TRUE)
# BIC is defined by the penalisation term "k". Choose k=2 or omit the statement to get the (default) AIC:
system.time(f.cwr <- step(lm(f, weights=W, data=cbind.data.frame(train, W=W)), k=log(nrow(train)), trace=FALSE)) # takes quite a while:
cor(predict(f.cwr, newdata=test.none), test.none[,1])     