# Global Bird species density study

if (grep("apple", R.Version()$platform)) windows <- function(...) quartz(...)

#*******************************************************************************
#****************** ANALYSIS ***************************************************
#*******************************************************************************
#*******************************************************************************

# set working directory


# load relevant libraries and additional codes
require(dr)
require(Hmisc)
require(mgcv)
require(penalized)
library(boot)
library(randomForest)
library(gbm)

source("../../AB_comparison/diagnostics.r")
source("../../AB_comparison/COLL_allfunctions.r")
source("../../AB_comparison/brtfunctions.r")
source("../../AB_comparison/marsHelpers.r")

 
#-------------------------------------------------------------------------------
# load data:
load("GlobalBirds.Rdata") 
# loads a data.frame called "all.df"
summary(all.df)
# all.df comprises standardised predictors, area-corrected bird density ("birds_num_arrh") plus the original absolute bird species richness and country names (last two columns).
# we are not interested in detecting latitudinal or longitudinal trends, hence X and Y are excluded:
birds <- all.df[,-c(16, 17, 26:27)]
attach(birds)

#------------------------------------------------------------------------------
# There are too many possibilities for quadratic terms and interactions compared to the number of data points: 2*p+p*(p-1)/2 = 299.
# We could thus use univariate fits and sort the data accordingly:

imp <- 1:22
names(imp) <- colnames(birds)[-1]
for (i in 1:22){
	f <- as.formula(paste("birds_num_arrh ~ s(", colnames(birds)[i+1], ")", sep=""))
	imp[i] <- gam(f)$deviance	
}
(var.sequence <- sort(imp, decreasing=F)) # first is best univariate predictor
birds <- birds[,c("birds_num_arrh", names(var.sequence))] # now sorted by univariate importance
detach(birds)
attach(birds)


#-------------------------------------------------------------------------------
#--------------------------  DIAGNOSTICS  --------------------------------------

colDiagnostics(birds, birds[,1], silent=T)
# reports collinearity for most indices, but not for condition index, condition number and variance decomposition

#-------------------------------------------------------------------------------
#------------------------- COLL METHODS  ---------------------------------------
# run the following different approaches:
# * GLM/GAM
# * select 07
# * ridge
# * MARS
# * DR
# * cluster (Hoeffding/Ward)
# * (PLS)
# * BRT
# * rf

var.imp.comp = data.frame(matrix(0, ncol=11, nrow=24))
colnames(var.imp.comp) = c("glm","gam","select07","ridge","dr", "mars","cluster","pls","brt","rf", "seqreg")
rownames(var.imp.comp) = c(colnames(birds[,-1]),"AUC","R2")

#-------------------------------------------------------------------------------

# scatter plot smoothing
windows(width=14, height=10)
par(mfrow=c(4,7), mar=c(3,2,1,1))
for (i in 2:23){
  plot(birds_num_arrh ~ birds[,i], pch=16, col=rgb(.4,.5,.6,.75))
  legend("topright", legend=colnames(birds)[i], bty="n", cex=2)
  lines(lowess(birds_num_arrh ~ birds[,i]), col="grey80", lwd=2)
}

# correlation matrix
levelplot(cor(birds[,2:23])[,22:1], at=c(-1,-.9,-.7,-.5,-.3,0,.3,.5,.7,.9,1), las=2)

#-------------------------------------------------------------------------------
#--------------- GLM and GAM for reference only --------------------------------

# glm
f <- formula.maker(birds[,1:20], y.col=1, quad=T, int=T) # can still use most variables!
fm.glm <- stepAIC(lm(f, data=birds), k=log(length(birds_num_arrh)))
summary(fm.glm)
an <- anova(fm.glm)

# Obviously, this result is silly, since it uses 174 parameters to explain 212 data points.
# Further automatic simplification was prohibited by collinearity.
# We can force further simplification by increasing the model complexity penalty substantially:
fm.glm2 <- step(fm.glm, k=15)
summary(fm.glm2) # this model has 14 parameters
an <- anova(fm.glm2)
plot(fm.glm2) # data points 91 (India) and 159 (Philippines) are rather influential


# enter the values into the matrix:
for (i in 1:22){	
	ind <- grep(rownames(var.imp.comp)[i], rownames(an))
	var.imp.comp[i, "glm"] <- sum(an$"Sum Sq"[ind] * ifelse(ind <= 9, 1, 0.5)) / sum(an$"Sum Sq"[1:(nrow(an)-1)])
}
var.imp.comp[24, "glm"] <- summary(fm.glm2)$r.squared
#THM: GDP_in by far most important (followed by voi_acc, urban)

#-------------------------------------------------------------------------------
# gam
f <- as.formula(paste("birds_num_arrh ~ ", paste("s(", colnames(birds[,-1]), ", bs='cs',k=4)", collapse="+", sep="")))
fm.gam <- gam(f, data=birds, family="gaussian", gamma=1)
summary(fm.gam)
an <- anova(fm.gam)
plot(fm.gam)
# use F-values as indicator of importance:
var.imp.comp[, "gam"] <- c(anova(fm.gam)$s.table[,3], 0,0)/sum(anova(fm.gam)$s.table[, 3])
var.imp.comp[24, "gam"] <- 0.39 # deviance explained

windows(width=12)
par(mfrow=c(4,6), mar=c(3,3,.5,.5),mgp=c(1.7,.6,0),tcl=-.2)
plot(fm.gam)
# THM: prec_y (humpshaped), GDP_ppp (linear positive), arbl (linear negative) most important

#-------------------------------------------------------------------------------
#------------------------------- select07 --------------------------------------

birds.07 <- data.frame(birds_num_arrh, select07(X=birds[,-1], y=birds[,1], family="gaussian", univar="gam",
  threshold=0.7, method="pearson", k=4))
f <- formula.maker(birds.07, y.col=1, quad=T, interactions=T)
fm.07 <- step(lm(f, data=birds.07), k=log(length(birds_num_arrh)))
summary(fm.07)
# still too many variables (see glm):
fm.07.2 <- step(fm.07, k=10)
summary(fm.glm2) # this model has 14 parameters
(an <- anova(fm.07.2))

# enter the values into the matrix:
for (i in 1:22){	
	ind <- grep(rownames(var.imp.comp)[i], rownames(an))
	var.imp.comp[i, "select07"] <- sum(an$"Sum Sq"[ind] * ifelse(ind <= 11, 1, 0.5)) / sum(an$"Sum Sq"[1:(nrow(an)-1)])
}

var.imp.comp[24, "select07"] <- summary(fm.07.2)$r.squared


#windows(width=12)
#par(mfrow=c(2,7), mar=c(3,3,.5,.5),mgp=c(1.7,.6,0),tcl=-.2, las=1)
#response.curve(fm.07, birds.07[,-1], birds.07[,1], ylim=c(0,200))
# Take home: temp_sea clearly most important (int with elev und IU_cou), followed by GDP_in and urban


#-------------------------------------------------------------------------------
#-------------------------------- seqreg ---------------------------------------

seqreg.bic <- step.seqreg(data=birds, y.col=1, k=log(224), quadratic=T, interactions=T ) 
summary(seqreg.bic[[1]])
an <- anova(seqreg.bic[[1]])
# NOTE: predict.seqreg() newdata argument requires dataset with same structure 
# (columns) as training data!

# enter the values into the matrix:
for (i in 1:22){	
	ind <- grep(rownames(var.imp.comp)[i], rownames(an))
	var.imp.comp[i, "seqreg"] <- sum(an$Deviance[ind] * ifelse(ind <= 18, 1, 0.5)) / sum(an$Deviance, na.rm=T)
}
var.imp.comp[24, "seqreg"] <- sum(an$Deviance, na.rm=T)/an$"Resid. Dev"[1]

# Take home: tem_sea most important (various interactions), followed by IU_area, followed by urban


#-------------------------------------------------------------------------------
#---------------------------------- Ridge --------------------------------------

f <- as.formula(paste("~", paste(colnames(birds[,-1]), collapse="+")," + ",  paste("I(",colnames(birds[,-1]), "^2)", collapse="+", sep=""), sep="" ))
#f <- as.formula(paste("~", paste(colnames(birds[,2:5]), collapse="+")," + ",  paste("I(",colnames(birds[,2:5]), "^2)", collapse="+", sep=""), sep="" ))
# first run an optimiser for the L2-norm:      
# use 'wisely' and in conjunction with profL2 as pre-scan to make sure you find global
# not local minimum!
ridgesets <- optL2(response=birds_num_arrh, penalized=f, minlambda2=1e-8, maxlambda2=1000, model="linear", fold = 10, trace=TRUE, data=birds)
p2 <- profL2(response=birds_num_arrh, penalized=f, minlambda2=1e-8, maxlambda2=1000, model="linear", fold = 10, trace=TRUE, data=birds)
plot(p2$lambda, p2$cvl, type="l")
opt.lambda2=400 #starts levelling off at 300 or so; repeated calls of optL2 yielded values between 380 and 550    
f.ridge <- penalized(response=birds[,1], penalized=f, lambda2=opt.lambda2, model="linear", data=birds)
coef(f.ridge) 
# variable importance
an <- abs(coef(f.ridge,which="penalized"))
an2 <- an[1:22] + an[23:44]
index <- match(names(an2), rownames(var.imp.comp))

var.imp.comp[index, "ridge"] <- an2/sum(an2)
var.imp.comp[24, "ridge"] <- cor(birds[,1], predict(f.ridge, model.matrix(f, birds)[,-1])[,1])^2

# Take home: prc_y, tem_sea and tem_y most important! (urban and elev then)

#-------------------------------------------------------------------------------
#-------------------------------- DR -------------------------------------------

birdsquad <- cbind(birds_num_arrh=birds[,1], birds[,-c(1,21:23)], birds[,-c(1,21:23)]^2)
colnames(birdsquad)[21:39] <- paste(colnames(birds)[2:20], ".sqr", sep="")
f <- as.formula(paste("birds_num_arrh ~ ", paste(colnames(birdsquad[,-1]), collapse="+"), sep="" ))

# Only 2 slices are available for binary response. At most, only a single vector
# in the central subspace can be found by SIR if response is binary.
strain <- dr(f, data=birdsquad, slice.function=dr.slices.arc, nslices=8, chi2approx="wood", method="sir")
summary(strain)
(ndim <- which.max(summary(strain)$test[[3]] >= 0.05) ) # 3
# adapt the data to dr-lm:
traindata <- data.frame(y = birdsquad[,1], dr.direction.default.br(strain, 1:ndim))
# run a lm on the dr-components in order to get a function to do the predictions:
f.dr <- step(lm(formula(paste("birds_num_arrh ~", paste("Dir", seq(ndim), sep="", collapse=" + "))), data=traindata))
summary(f.dr); anova(f.dr)
(loadings <- summary(strain)$evectors[,1:3]) #because the first 2 Dir were selected in the lm

# variable importance
load2 <- cbind.data.frame(loadings[1:19,], loadings[20:38,])
an <- apply(load2, 1, function(x) sum(abs(x)))

index <- match(names(an), rownames(var.imp.comp))

var.imp.comp[index, "dr"] <- an/sum(an)
var.imp.comp[24, "dr"] <- summary(f.dr)$r.squared
# pol_sta, gov_eff (both quadratic)


#-------------------------------------------------------------------------------
#------------------------------- MARS ------------------------------------------

f.mars <- mars.glm(data=birds, mars.x=c(2:22), mars.y=1, family="gaussian", mars.degree=1)
# needed (tiny) modifications of function mars.contribs() (l.142) in marsHelpers.r!!   
varimp <- mars.contribs(f.mars)$deviance
an <- abs(as.numeric(as.character(varimp[,2])))
names(an) <- varimp[,1]

index <- match(names(an), rownames(var.imp.comp))

var.imp.comp[index, "mars"] <- an/sum(an)
var.imp.comp[24, "mars"] <- 1 - f.mars$deviances$resid.dev/f.mars$deviances$null.dev
# Take home: voi_acc followed by prc_y and tem_sea/urban/reg_qua


#-------------------------------------------------------------------------------
#------------------------------- cluster ---------------------------------------

clusters <- varcluster(birds[,-1])  # defaults to Hoeffding-Ward 
cluster.vars <- CollClustReduction(X=birds[,-1], y=birds[,1], 
  clusters=clusters, varselect="centred")
# do the clusters make sense ecologically?  YES
colnames(cluster.vars)[1] <- "birds_num_arrh"
f <- formula.maker(cluster.vars, quad=T, int=T) 
fm.cluster <- step(lm(f, data=birds), k=10)#log(NROW(birds)))  # need to increase k on order to reduce model complexity to EPV10
summary(fm.cluster)
an <- anova(fm.cluster) 

for (i in 1:22){	
	ind <- grep(rownames(var.imp.comp)[i], rownames(an))
	var.imp.comp[i, "cluster"] <- sum(an$"Sum Sq"[ind] * ifelse(ind <= 7, 1, 0.5)) / sum(an$"Sum Sq"[1:(nrow(an)-1)])
}
var.imp.comp[24, "cluster"] <- summary(fm.cluster)$r.squared

# Take home: IU_area and urban slight winners over rul_law, GDP_in and forest


#-------------------------------------------------------------------------------
#-------------------------- PLS ------------------------------------------------

birdsquad <- cbind(birds_num_arrh=birds[,1], birds[,-1], birds[,-1]^2)
colnames(birdsquad)[24:45] <- paste(colnames(birds)[2:23], ".sqr", sep="")
#testquad <- cbind(pb=testdata$pb, testdata[,c(1:6,8:19)], testdata[,c(1:6,8:19)]^2)
#colnames(testquad)[20:37] <- paste(colnames(testdata)[c(1:6,8:19)], ".sqr", sep="")
f.pls <- train.pls(birdsquad, family="gaussian")
an <- abs(f.pls$pls$coef)

f <- formula.maker(birds, quad=T, int=F) 
require(pls)
res <- plsr(f)
summary(res)
imp <- drop(R2(res, estimate = "train", intercept = FALSE)$val) # levels off at 18 components (which explain 83% of the variance in X)
# (code taken from function "summary.mvr")
imps <- abs(loadings(res)[, 1:18]) %*% c(imp[1], diff(imp)[1:17])
var.imp.comp[1:22, "pls"] <- (imps[1:22] + imps[23:44])/sum(imps)
var.imp.comp[24, "pls"] <- imp[18]
# THM: elev, GDP_PPP, urban and shrub most important


#-------------------------------------------------------------------------------
#------------------------------- BRT -------------------------------------------

# NOTE: for comparison with other methods where we did not include any 2-way interactions, 
f.brt <- gbm.step(data=birds, 
  gbm.x = 2:23,  gbm.y = 1, 
  family = "gaussian",  tree.complexity = 3, 
  learning.rate = 0.0025, bag.fraction = 0.75) # leads to roughly 1000 trees, which is great
an <- f.brt$contrib
index <- match(an[,1], rownames(var.imp.comp))
var.imp.comp[index,"brt"] <- f.brt$contrib$rel.inf/100
var.imp.comp[24, "brt"] <- 1- f.brt$self.statistics$mean.resid / f.brt$self.statistics$mean.null
#THM: tem_sea followed by prc_y, elev and forest

#-------------------------------------------------------------------------------
#-------------------------------- RF -------------------------------------------

f.rf <- randomForest(x=birds[,-1], y=birds[,1], ntree=1000)
varImpPlot(f.rf) 
var.imp.comp[1:22,"rf"] <- importance(f.rf)/sum(importance(f.rf))
var.imp.comp[24,"rf"] <- f.rf$rsq[length(f.rf$rsq)]

# Take home: temp_sea and urban, followed by tem_y and prec_y
var.imp.comp <- var.imp.comp[,-c(11:15)]
var.imp.comp[is.na(var.imp.comp)] <- 0

save.image(file="globalbirds_analysisresults.Rdata")
detach(birds)


#---------------------------------------------------------------------------------------------
# picture:

colSums(var.imp.comp[1:22,]) # should all be 1!
seq.col <- order(var.imp.comp[24,])
seq.row <- order(rowSums(var.imp.comp[1:22,]))


require(RColorBrewer)
color <-  brewer.pal(9, "Blues")#"Spectral" )#
#breaks <- c(seq(0,.4,.05), .5)
blueColors <- colorRampPalette(colors=color)

require(lattice)
levelplot(as.matrix(var.imp.comp)[rev(seq.row), seq.col],  xlab="Variables", ylab="Methods", col.regions=c(blueColors(100)))#, scales=list(x=list(at=1:22, labels=rownames(var.impcomp)), rot=90), y=list(labels=colnames(var.imp.comp)) )

