##########################################################
# perform some diagnostics on a X dataframe and an response
# author: sven.lautenbach@ufz.de
# date: 12.12.07
# change log
#   21.05.08 -> added some round statements to
#     make the output a bit fancier
#   28.05.08 -> replaced the round statements with
#     signif statements
#   05.06.08 -> added summary function, included vif
#   18.05.11 -> sum clean up 
##########################################################
require(perturb)
require(car)
#source("ivif.r")

colDiagnostics <- function(theDf, theY, plot.hist = FALSE, silent=FALSE, no.ivif=FALSE)
{
  results <- list()
	# thresholds
  th.cor.det <- 0.1 # to be checked! the paper says close to 0...
  th.kappa <- 15
  th.vif <- 10
  # indicates for each indicator if collinearity is detected or not
	is.coll <- as.data.frame(t(rep(FALSE,8)))
  names(is.coll) <- c("cor", "cor.det", "ivif", "kappa", "condition index", "condition.number", "vd", "vif")

	theY <- as.data.frame(theY)
  # todo: check if df is dataframe
  if(class(theDf) != "data.frame")
  {
    theDf <- as.data.frame(theDf)
    if(!is.data.frame(theDf))
    	stop("theDF is not of class data.frame and can not be converted into one")
	}
  
  
  cor.m <- cor(theDf)
 results$n.gt.09 <-sum(cor.m > 0.9 & cor.m < 1) / 2
 results$n.gt.07 <-sum(cor.m > 0.3 & cor.m < 1) / 2
 results$n.gt.05 <-sum(cor.m > 0.5 & cor.m < 1) / 2
 if(results$n.gt.09 + results$n.gt.07 + results$n.gt.05 > 0)
 {
    is.coll$cor <- TRUE
 }
 # only if switch is set
 if (plot.hist)
		hist(cor.m[which( cor.m < 1)], main="Histogramm of the correlation matrix without diagonal elements",
  		xlab="Correlation indicies")

  results$cor.9 <- getHighCor(cor.m,0.9)
  results$cor.7 <- getHighCor(cor.m,0.7)
  results$cor.5 <- getHighCor(cor.m,0.5)

  
 eigen.v <- eigen(cor.m)
 max.v <- max( eigen.v$values)
 min.v <- min( eigen.v$values)
 if (min.v == 0) {
   cat("Warning: Smallest eigenvalue is zero! Kappa set to NA...")
   results$kappa <- NA
 	}
	else
  	results$kappa <- kappa(cor.m)#(max.v / min.v)
  if(results$kappa > th.kappa)
    is.coll$kappa <- TRUE
	results$cond.index <- sqrt(results$kappa)
	
  
  results$eigen.v <- eigen(cor.m)$values

  if(!no.ivif)
 	{
    ivif.res <- try(ivif( theDf, as.data.frame(theY), trace=FALSE))
    if (class(ivif.res) != "try-error"){
      results$ivif.proxys <- ivif.res 
      n.proxysets <- length(ivif.res$id)
      proxySize <- numeric(n.proxysets )
      i <- 0
      for( element in ivif.res$id)
      {
          i<- i+1
          proxySize[i] <- length(element)
      }
    	results$n.proxysets <- n.proxysets
      results$n.xvariables <- length(theDf)
    	results$max.var.in.proxyset <- max(proxySize)
      results$mean.var.in.proxyset <- mean(proxySize)
      results$proxySize <- proxySize
    	if(results$n.proxysets < results$n.xvariables )
    	  is.coll$ivif <- TRUE
    } else {
        warning("IVIF could not be used:")  
        warning(ivif.res[[1]])
        is.coll$ivif <- NA
    }
  } else {  # no.ivif set
    results$n.proxysets <- NA
    results$n.xvariables <- length(theDf)
  	results$max.var.in.proxyset <- NA
    results$mean.var.in.proxyset <- NA
    results$proxySize <- NA
    is.coll$ivif <- NA
  } 
	
  cd <- colldiag(cor.m, scale=TRUE, center=TRUE)
  results$condinx <- cd$condindx
  results$vdp <- cd$pi
  
  
  cor.det <- det(cor.m)
	results$cor.det <- cor.det
	if(abs(cor.det) < th.cor.det)
	{
	  is.coll$cor.det <- TRUE
	}
  
  theString = paste( names(theY), collapse = " + ")
  if(theString=="")
    theString <- "y"
 	theString = paste( theString , "~ .")
 	

  vif.res <- try(vif(lm(as.formula(theString), data=cbind(theY, theDf))))
  if (class(vif.res) != "try-error"){
    results$vif <- vif.res
    
    if(is.matrix(results$vif))
    	results$vif.gt.th <- results$vif[which(results$vif[,1] > th.vif),]
  	else
  	  results$vif.gt.th <- results$vif[which(results$vif > th.vif)]
    if(length(which(results$vif > th.vif))> 0)
  		is.coll$vif <- TRUE
	} else results$vif <- NA
   
  #attributes
  results$is.coll <- is.coll
  class(results) <- "coll.diag"
  if (!silent) summary(results)
  return(results)
}

summary.coll.diag <- function(colDiag)
{
	attach(colDiag, warn.conflicts = FALSE)
  # print stuff
  if(sum(colDiag$is.coll==TRUE, na.rm=TRUE)>0)
    cat("Data seems to be collinear.\n\n")
	else
	  cat("No collinearity in the data detected.\n\n")

	# first collinear stuff
	if(is.coll$cor.det)
		cat("Collinearity detected based on the determinant of the correlation matrix.\n")


	if(colDiag$is.coll$kappa)
	 cat("Collinearity detected based on kappa.\n")
	if(is.coll$vif)
		cat("Collinearity detected based on VIF.\n")
	if(is.coll$ivif)
	  cat("Collinearity detected by IVIF.\n")
  if(is.coll$cor)
	{
	cat("\nCollinearity detected on base of the correlation matrix.\n")
	 if( n.gt.09 > 0)
	 	cat(paste("\tElements in correlation matrix > 0.9 & < 1: ", n.gt.09, "\n", sep=""))
	 if( n.gt.07 > 0)
	 		cat(paste("\tElements in correlation matrix > 0.7 & < 1: ", n.gt.07, "\n", sep=""))
	 if( n.gt.05 > 0)
	 		cat(paste("\tElements in correlation matrix > 0.5 & < 1: ", n.gt.05, "\n", sep=""))
	}
	cat("\n\n")
  cat(paste("correlation matrix determinat:\n\t", signif(cor.det, 3), sep="", "\n"))
  cat(paste("Kappa (max eigenvalue / min eigenvalue):\n\t", signif(kappa,7), "\n", sep=""))
	cat(paste("Condition index:\n\t", signif(sqrt(kappa),3), "\n" ))
  cat(paste("Max/Min/Mean eigenvalue of correlation matrix:\n\t", signif(max(eigen.v),3), "/", signif(min(eigen.v),3),
      "/", signif(mean(eigen.v),3), "\n" )  )
  cat(paste( "\nNumber of proxysets found by ivif:\n\t ", n.proxysets), "\n")
  cat(paste("Number of variables:\n\t", n.xvariables, "\n"))
  cat(paste("max/mean number of variables in one proxy set:\n\t", max(proxySize), "/",
     signif(mean(proxySize),2), "\n"))

	cat("\nVIFs greater than threshold:\n")
  print( signif(vif.gt.th))
	#print(signif(vif,3))
  #print("condition indexes : ")
  #print(cd$condindx)
  #print("variance decomposition proportions: ")
  #print(cd$pi)
 	detach(colDiag)
}



getHighCor <- function(cor.m, th)
{
 high.row <- vector("list")
 high.col <- vector("list")
 theStrings <- vector("list")
 count <- 0
 for (i in 1:length(cor.m[,1]))
 {
    for(j in 1:i)
    {
      if((abs(cor.m[i,j]) > th) & (i !=j))
      {
        count <- count +1
        high.row[[ count]] <- i
        high.col [[count]] <- j
        theStrings[[count]] <- (paste(rownames(cor.m)[i], " - ", colnames(cor.m)[j], ": ", cor.m[i,j]))
      }
    }
 }
  high.row <- unlist(high.row)
  high.col <- unlist(high.col)
	results <- list()
	results$highcor.row <- high.row
  results$highcor.col <- high.col
	results$highcor.str <- theStrings
	return(results)
}

########################### IVIF FILE ##########################################
##################################################
#
# iterative variance inflation factor method
#
# References: Gordon D. Booth, Michael J. Niccolucci, Ervin G. Schuster (1994)
# "Identifying proxy sets in multiple linear regression: an aid
# to better coefficient interpretation"
# Res. Pap. INT-470.  Ogden, UT:
# U.S. Department of Agriculture, Forest Service, Intermountain Research Station; 1994. 12 p.
#
#
# author: sven.lautenbach@ufz.de
# date: 15.10.2007
# last modified: 24.10.2008
# bug fixed (use of j instead of currentModelIDs[j]) on 06.11.2007
# bug fixed (forgot to reset newModel after first loop) on 12.12.2007
# bug fixed (reintroduction of all variables from proxysets in the second step)
#   on 23.10.2008
##################################################

###
# Variance inflation factors are a scaled version of the multiple correlation coefficient between
# variable j and the rest of the independent variables. Specifically,
#      VIF(j) = 1/(1 - R(j)**2)
# where Rj is the multiple correlation coefficient.
# Variance inflation factors are often given as the reciprocal of the above formula.
# In this case, they are referred to as the tolerances.
# If Rj equals zero (i.e., no correlation between Xj and the remaining independent variables),
# then VIFj equals 1. This is the minimum value. 
###

####
# proxy set information is stored in a vector
# the proxy sets vector relates to the xvar id by its position
# and to the proxy set by its value
# for example
# xvar  		1 2 3 4 5
# proxyset  1 1 2 3 2
# for the construction of the model formula, the xvar id is used

#ToDo:
# 2) wie soll man mit dem Bereich 3B/C umgehen? Was soll in diesem Zusammenhang jumped VIF
# bedeuten? Relative Vernderung, differenz zum prevVIF?
# 4) test for glm
# 5) should we use GVIF^{1/(2times df)} instead of GVIF?

require(car)

ivif <- function (x,y, mod="lm", threshold=1.5, jumpThreshold=5, trace=FALSE) {
	
  if (!(inherits(y, "data.frame"))) y <- as.data.frame(y)
  if (!(inherits(x, "data.frame"))) x <- as.data.frame(x)
  
	# check arguments
	if ( mod != "lm" && mod != "glm")
		{ return("The fit object is not of type lm or its inherited classes! Aborting")	}
	if (!(inherits(x, "data.frame")))
		{ return("The x object is not of type data.frame or its inherited classes! Aborting")	}
  if (!(inherits(y, "data.frame")))
		{ return("The y object is not of type data.frame or its inherited classes! Aborting")	}
	if (is.null( names(y) ) ) {	return("Please specify names for y") }
	if ( is.null(names(x) ) )	{	return("Please specify names for x")	}
	if (length(x[,1]) != length(y[,1]) ) { return("Data frames differ in size") }
	if (threshold <= 1.1) { return("Thresholds <= 1.1 should not be used") }

	theDF <- as.data.frame(cbind(y,x))
	currentModelIDs <- 1
	#currentModelIDs[2] <- 2
	###
	# the proxy sets is a vector indicating to which proxy set a x-variable belongs
	theProxySets <- rep(-1, length(x))

	# the ids of the x-vars which have been removed
	# is lateron needed by reinventing the xars in the model
	removedIDs <- vector("list",0)

	#create the vif for the first two variables in x
	prevFormula <- makeFormula(x,y,currentModelIDs)
	# the first variable belongs to the first proxy set
  theProxySets[1] <- 1


	## loop over all explanatory variables in y
	for (i in 2:length(x) )
	{
		newModelIDs <- currentModelIDs
    newModelIDs[length(newModelIDs)+1] <- i


		newFormula <- makeFormula(x,y,newModelIDs)
		newVIF <- getVIF(newFormula, theData = theDF, mod)

		if (trace) {print(newVIF)}

		# compare  newVIF with threshold
		# if needed update modelIDs, otherwise sort in proxy set
		
		aboveTH <- FALSE
		theFirst <- -1 # indicates if it is the first VIF greater then the threshhold
											# or not. If not lists are merged and the first is the position
											# in the list of proxy sets there we other elemens should be connected to
		IDs2rm <- vector("list",0)
		count <- 1
		
		for (j in 1:length(currentModelIDs)) 	# loop over the variables in the current model and
		{   #  check if their VIF values have increased more then the threshold
			if (newVIF[j] > threshold)
			{
				aboveTH <- TRUE
				if (trace) { print(paste("rejected variable: ", names(x)[i],
								". VIF > threshold for ", names(x)[currentModelIDs[j]],sep=" ")) }
				if(theFirst == -1)
				{
					# set the proxy-id for the new var to the proxy id of the
					# var which VIF has jumped
        	theProxySets[i] <- theProxySets[currentModelIDs[j]]
					#store the proxy set id in case more then one variable has jumped
					theFirst <- theProxySets[ currentModelIDs[j] ]

					# store id in the list
					# if the id is not already in the list
					if (sum(removedIDs == i) == 0)
					{
          	removedIDs[[length(removedIDs) +1]] <- i
					}
				}
				else # there has been an vif > threshold before for the variable x_i
				{
          theProxySets[j] <- theFirst # was i before, 23.10.08 changed
          # store id in the list
					# if not already stored
          if (sum(removedIDs == currentModelIDs[j]) == 0) # changed from '== i' to 'currentModelIDs[j]'
					{
          	removedIDs[[length(removedIDs) +1]] <- currentModelIDs[j] # changed from '<- i' to '<- currentModelIDs[j]'
						if(trace)
						{	print(paste("marking variable", names(x)[currentModelIDs[j]], " for deletion from current model"))}
						#print("removedIDs:")
						#print(unlist(removedIDs))
					}
          
					#add the index to the list to be removed from the current model ids
					IDs2rm[[count]] <- currentModelIDs[j] # bug removed, was j before
					count = count +1
				}
        if (trace)
					{
						print("**** after update of proxy list ***")
	          print (paste(theProxySets,collapse=", "))
					}
			}
		}
		if (!aboveTH)
		{ 
			currentModelIDs[length(currentModelIDs)+1] <- newModelIDs[length(currentModelIDs)+1]
			if (trace) { print(paste("Model IDs updated with variable ", names(x)[i], sep=" ")) }
			#create new proxy set
      theProxySets[i] <- max(theProxySets) +1
   	}
		else if (length(IDs2rm) > 0)
		{ # delete the x variables indices from the model
			# if lists have been combined
			#print(currentModelIDs)

			for (k in 1:length(IDs2rm) )
			{
				if ( length( which( currentModelIDs == IDs2rm[[k]] ) ) > 0)
					{ currentModelIDs <- currentModelIDs[ - which(currentModelIDs == IDs2rm[[k]])]
					}
				else
					print("Should never happen.... check source code...")
				print(which(currentModelIDs == IDs2rm[[k]]))
			}
      #IDs2rm <- vector("list",0)
		}

  	prevFormula <- makeFormula(x,y,currentModelIDs)
		prevVIF <- getVIF(prevFormula, theData = theDF, mod)

	}
   newModelIDs <- currentModelIDs # new checking
	####
	# loop over removed ids list
	# and reinvent the removed x-vars
	# to see if there are jumps in the VIFs
	# step 3B and 3C in Booth et al. 1994
	###
	for (rID in removedIDs)
	{
    
		pos <- length(newModelIDs)+1
    
		if ((rID > -1) & (sum(newModelIDs == rID) == 0)) # changed 23.10.2008 , added & (sum(newModelIDs == rID) == 0)
		{ 
		  newModelIDs[pos] <- rID # changed 23.10.08 was outside block before
	    newFormula <- makeFormula(x,y,newModelIDs)
			#print(newFormula)
			newVIF <- getVIF(newFormula, theData = theDF, mod)
			#print("newVIF:")
      #print(newVIF)
			#print(prevVIF)
			relVIF <- newVIF[-pos] / prevVIF
			if (trace)
			{ print(paste("Reinventing ", names(x)[rID]))
				print ("VIFs which excessed jump-threshold:")
				print (which(relVIF > jumpThreshold))
				print(newVIF)
			}
			# set the proxy set id for the xvars which jumped
			# to the id of the proxy set there the newly reinmvented variable belongs to
			# 	which(relVIF > jumpThreshold) -> the current-model-vars which jumped
			#   newModelIDs[which(relVIF > jumpThreshold) ] -> the ids of the original xvars
			#   theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] -> the proxyset ids of the selected xvars
			# 	theProxySets %in% theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] -> all xvars which belong to the same proxy set
			theProxySets <- ifelse( theProxySets %in% theProxySets[ newModelIDs[which(relVIF > jumpThreshold) ] ] ,
						theProxySets[rID] , theProxySets)

			prevVIF <- newVIF
		}
    
	}
	

# ToDo: das knnte man besser machen
# momentan wird eine Warnung erzeugt
# wie bekommt man die weg???

	result <- vector("list",0)
	result$XIDs <- theProxySets
	
	res <- createLists(x, theProxySets)
	# changed 23.10.2008
	# store only the names and ids for non empty lists
	# I am sure there is a nicer way out there to do it...
	result <- vector("list", 0)
	result$ids.grouped <-vector("list", 0)
	result$names.grouped <-vector("list", 0)
	count <- 1
	for(ii in 1:length(res$ids))
	{
	  if (length(res$ids[[ii]]) != 0)
	  {
	    
	    result$ids.grouped[[count]] <- res$ids[[ii]]
	    result$names.grouped[[count]] <- res$names[[ii]]
	    count <- count +1
	  }
	}
	return(result)

}



#create a formula out of x and y vector
# for x take only the columns that are specified in the ID vector
makeFormula <- function( x, y, idVector, combine= " + ")
{
	theStringY = paste( names(y), collapse = combine)
 	theStringX = paste( names(x[idVector]), collapse = combine)
	theString = paste (theStringY, theStringX, sep = " ~ ")
	return (as.formula(theString))
}

#return the VIF for a model
# toDo: checkh if VIF =FALSE and use the other indicator instead
getVIF <- function(theFormula, theData, mod, VIF=TRUE )
{
	#print(theData)
	if (mod == "lm")
	{
	 #print(theFormula)
		theModel <- lm(formula = theFormula, data= theData)
		#print( length(theModel$model))

	}
	#paste("Length model: " , length(theModel$model))
	if ( length(theModel$model) <= 2)    # this has been changed to account
	{                                    # for situations then the first and the second variable are collinear
	  theVIF <- numeric(1)
	}
	else
	{
    theVIF <- vif(theModel)
 }
	if (is.null(colnames(theVIF)))
	{
		return (theVIF)
	}
	else
	{
		return(theVIF[,"GVIF"])
	}
}

createLists <- function(df.x, proxyVector)
{ # returns list with variable names and another with ids
	names <- vector("list", 0 )
	ids <- vector("list", 0 )
	for (i in (min(proxyVector):max(proxyVector)) )
	{
		pos <- length(ids) +1
    ids[[pos]] <- which(proxyVector==i)
		names[[pos]] <- names(df.x)[ids[[pos]] ]
	}
	res <- list()
	res$names <- names
	res$ids <- ids
	return(res)
}

