##########################################
#
# run OSCAR on a bunch of collinear data sets
#
#
# author: sven.lautenbach@ufz.de
#
# date: 18.12.2007
# major revisions: 12.02.2008
#   cv.oscar
#   predict.oscar
#   run.oscar as a seperate function which rearanges results into nice arrays
# scalling of X in predict added : 01.04.2008
#########################################

###########################################
#
# function overview
#
# cv.folds
#     create folds for cross validation
#
# runOscar 
#     perform one oscar run
#
# cv.oscar
#     perform a K-fold crossvalidation on a oscar run
# predict.oscar
#     predict y for the coefficients estimated by oscar



#{ # block used to avoid 'syntax error, unexpected ELSE in "else"'
#	if (.Platform$OS.type == "windows")
#		baseFolder  <- "Y:/Nutzer/development/R/collinearity/"
#	else
#		baseFolder <- "/home/slautenb/projekte/modelle/R/"
#}

################################################################################
# functions
################################################################################

whichvar <- function(x, array) pmatch(x, attr(dim(array), "names"))

cv.folds <-function (n, folds = 5)
{
    split(sample(1:n), rep(1:folds, length = n))
}
##########################################################
# K-fold cross-validation
# to estimate the performance of the different
# c and prop combinations
# and to select the best model
#
# author: sven.lautenbach@ufz.de
#
##########################################################
cv.oscar <- function(X,y, K = 5, path2Mexe, datFile, outputCoef, outputDf, outputRSS,
                      cvaluesFile, propvaluesFile, trace = FALSE)
{
  cvalues <- as.matrix(read.table(cvaluesFile, header=FALSE, sep=","))
	propvalues <- as.matrix(read.table(propvaluesFile, header=FALSE, sep=","))
	# create a nice 3-D CoefMatrix with K, propvalues  and cvalues as dimensions
  ncvalues <- length(cvalues)
  npropvalues <- length(propvalues)

  all.folds <- cv.folds(length(y), K)

  resid.array <- array(data=NA, dim=c(fold=K, prop.val=npropvalues, c.val=ncvalues),
			dimnames=list(paste("fold-", 1:K, sep=""), paste("prop.", propvalues[,1], sep=""),paste("c.", cvalues[,1], sep="")) )

	base.datFile <-  strsplit(datFile, "\\.")[[1]][1]
  base.outputCoef <-  strsplit(outputCoef , "\\.")[[1]][1]
  base.outputDf <- strsplit(outputDf , "\\.")[[1]][1]
  base.outputRSS <- strsplit(outputRSS , "\\.")[[1]][1]

  for (i in seq(K)) # for all folds
 {
 		omit <- all.folds[[i]]

		fold.datFile <- paste(base.datFile, "_fold_", i, ".csv", sep="")
    fold.outputCoef <- paste(base.outputCoef, "_fold_", i, ".csv", sep="")
    fold.outputDf <- paste(base.outputDf, "_fold_", i, ".csv", sep="")
    fold.outputRSS <- paste(base.outputRSS, "_fold_", i, ".csv", sep="")

		write.table(cbind(y[-omit],X[-omit,]), file= fold.datFile,
      row.names=FALSE, quote=FALSE, dec=".", sep=",")

		# run oscar
		oscar.obj <- runOscar (path2Mexe, fold.datFile, fold.outputCoef, fold.outputDf, fold.outputRSS,
                      cvaluesFile, propvaluesFile, trace = FALSE)
		# calc and store the residuals
		for( j in 1:ncvalues)
		{
			aCvalue <- cvalues[j]
			for( k in 1:npropvalues)
			{
		  	aPropvalue<- propvalues[k]
				# predict at the omited samples
				fit <- predict.oscar(oscar.obj, X[omit,], aCvalue, aPropvalue)
				# calculate the residuals
				resid.array[i,k,j] <-  sum((y[omit] - fit)^2)
			}
		}

	}

	# calc cv and cv.error b applying mean and var on the resid.array
  cv <- apply(resid.array,c(2,3),mean)
  cv.error <- apply(resid.array,c(2,3), var)
	# identify best model
	minRSS.cvalue  <- cvalues[which(cv==min(cv), arr.ind=TRUE)[2]]
	minRSS.propvalue  <- cvalues[which(cv==min(cv), arr.ind=TRUE)[1]]
  # calc AICS for each fold
  # assuming that the model errors are normally and independently distributed
  aic.array <- resid.array # copy structure form the residual (rss) array
  nData <- length(y[-omit])
  for (m in 1:K)
  {
  	aic.array[m,,] <- nData*log(resid.array[m,,] / nData) + 2* oscar.obj$df.array
	}

	cv.aic <- apply(aic.array,c(2,3),mean)
  cv.aic.error <- apply(aic.array,c(2,3), var)
	minAIC.cvalue <- cvalues[which(cv.aic==min(cv.aic), arr.ind=TRUE)[2]]
	minAIC.propvalue  <- propvalues[which(cv.aic==min(cv.aic), arr.ind=TRUE)[1]]

	# return resid.array and best model in one object
	result <- list(resid.array=resid.array, cv = cv, cv.error = cv.error, cvalues = cvalues, propvalues=propvalues,
		minRSS.cvalue= minRSS.cvalue, minRSS.propvalue= minRSS.propvalue, aic.array=aic.array, cv.aic = cv.aic,
		cv.aic.error= cv.aic.error, minAIC.cvalue=minAIC.cvalue, minAIC.propvalue=minAIC.propvalue)

	class(result) <- "oscar.cv.obj"

	return(result)
}

####################################################
# predict y values for a couple of X vectors
# using a specific c and prop value
#
# input paramater:
# 	oscar.obj contains
# 	predict at <- Matrix of X vectors at whitch to predict
#
# author: sven.lautenbach@ufz.de
#
#################################################
predict.oscar <- function(oscar.obj, newdata, cvalue=NULL, propvalue=NULL)
{
	### parameter checking
	if (class(oscar.obj) != "oscar.obj")
		stop("parameter oscar.obj is not of class oscar.obj...")
	#if(class(newdata) != "matrix")
	#	stop("parameter newdata.at is not of class matrix...")
	nX <- dim(oscar.obj$Coef.array)[1]
	if(nX != dim(newdata)[2])
		stop(paste("newdata has the wrong number of x-variables...\n should be ", nX,
					" but is ", dim(newdata)[1] ))

	#ToDo: if cvalue == NULL and propValue == NULL, take best model...
	if (is.null(cvalue) & is.null(propvalue)  )
	{
    if(is.na(oscar.obj$best.model.id))
		{
			stop("best model has not been calculated, please do so or supply cvalue and propvalue...")
		}
		else
		{
			# ToDo: select best model
			stop("Please provide cvalue and propvalue for predict.oscar...")
		}
	}
	else
	{
		if (is.null(cvalue) | is.null(propvalue))
		{
			stop("propvalue and cvalue must both be set to a specific value or to NULL...")
		}
	}
	if(length(which(oscar.obj$cvalues == cvalue)) == 0)
		stop(paste("cvalue not available in oscar.obj$cvalues... available values are: ",
				oscar.obj$cvalues, sep=""))
  if(length(which(oscar.obj$propvalues == propvalue)) == 0)
		stop(paste("propvalue not available in oscar.obj$cvalues... available values are: ",
				oscar.obj$propvalues, sep=""))
	if( (cvalue > 1) | (cvalue < 0) | (propvalue > 1) | (cvalue < 0) )
		stop("cvalue and propvalue must be in [0,1]...")

  # transpose matrix
  predict.at <- t(newdata)

	cvalue.id <- which(cvalues == cvalue)[1]
	propvalue.id <- which(propvalues == propvalue)[1] # in case the same values appears more thenonce...

	### predict
	nPredicts <- ncol(predict.at)
	results <- numeric(nPredicts)
	
	

	# Standardize predict.at variables to mean zero, variance 1
	predict.at.scaled <- predict.at

  for( i in 1:(length(oscar.obj$train.mean.X)))
  {
    predict.at.scaled[,i] <- (predict.at.scaled[,i]- oscar.obj$train.mean.X ) / oscar.obj$train.sds.X
    #predict.at.scaled[,i] <- (predict.at.scaled[,i]- oscar.obj$train.mean.X[i] ) / oscar.obj$train.sds.X[i]
  }
	# ToDO: use a vector function like apply
	for ( i in 1:nPredicts )
	{
    results[i] <- oscar.obj$Coef.array[,propvalue.id, cvalue.id] %*% predict.at.scaled[,i]
	}
	
	return(results)
}

###############################################################################
# run the Matlab executable to perform the OSCAR analysis
#
# input parameter:
#   path2Mexe -		path to the matlab executable
#		datFile -
#		outputCoef -	filename of the Coefficients matrix (matlab stores results here)
#		outputDf -		filename of the Degrees of freedom matrix (matlab stores results here)
#		outputRSS -   filename of the residual sum of squares matrix (matlab stores results here)
#   cvaluesFile - filename to the cvalues (matlab reas from here)
#		propvaluesFile - filename to the propvalues (matlab reads from here)
#   trace -       display status messages
#
# output:
#   object of class oscar.obj
#
# author: sven.lautenbach@ufz.de
#
################################################################################

runOscar <- function(path2Mexe, datFile, outputCoef, outputDf, outputRSS,
                      cvaluesFile, propvaluesFile, trace = FALSE)
{


	command <- paste(path2Mexe, datFile, outputCoef, outputDf, outputRSS,
                      cvaluesFile, propvaluesFile, sep=" ")
  if (trace)
		print(command)
  # call MatLab executable
	# for windows shell( command, wait=TRUE, mustWork=TRUE)
  { # block used to avoid 'syntax error, unexpected ELSE in "else"'
	if (.Platform$OS.type == "windows")
		shell( command, wait=TRUE, mustWork=TRUE)
	else
		system( command=command, wait=TRUE)
	}


	#Xy <- read.table(datFile, header=TRUE, sep=",")
	#y <- as.matrix(Xy[,1])
	#X <- as.matrix(Xy[,2:ncol(Xy)])

	# read the data
  # rows are the explanatory variables
	# cols are a nested props for each cvalue list
	# for each cvalue, the list of all propvalues is grouped
	# when the next lsit of propvalues for the next cvalue follows
	CoefMatrix <- read.table(outputCoef, header=FALSE, sep=",")
	dfMatrix <- read.table(outputDf, header=FALSE, sep=",")
	rssMatrix <- read.table(outputRSS, header=FALSE, sep=",")
	cvalues <- read.table(cvaluesFile, header=FALSE, sep=",")
	propvalues <- read.table(propvaluesFile, header=FALSE, sep=",")

	# create a nice 3-D CoefMatrix with x, cvalues and propvalues as dimensions
  ncvalues <- length(cvalues[,1])
  npropvalues <- length(propvalues[,1])
	nX <- length(CoefMatrix[,1])

	Coef.array <- array(data=NA, dim=c(x=nX, propvalues=npropvalues, cvalues=ncvalues),
			dimnames=list(paste("X", 1:nX, sep=""),paste("prop.", propvalues[,1], sep=""),paste("c.", cvalues[,1], sep="")))
	df.array <- array(data=NA, dim=c( propvalues=npropvalues, cvalues=ncvalues),
			dimnames=list(paste("prop.", propvalues[,1], sep=""),paste("c.", cvalues[,1], sep="")))
  rss.array <- array(data=NA, dim=c( propvalues=npropvalues, cvalues=ncvalues),
			dimnames=list(paste("prop.", propvalues[,1], sep=""),paste("c.", cvalues[,1], sep="")))
	for ( i in 1:ncvalues)
	{
    Coef.array[,,i] <- as.matrix(CoefMatrix[,( 1+(i-1) * npropvalues) :(i*npropvalues )])
    df.array[,i] <- as.matrix(dfMatrix[( 1+(i-1) * npropvalues) :(i*npropvalues )])
    rss.array[,i] <- as.matrix(rssMatrix[( 1+(i-1) * npropvalues) :(i*npropvalues )])

	}

	#get means and sd of y and X to allow scalling and rescalling in predict
	dat <- read.table(datFile, header=TRUE, sep=",")
	means <- mean(dat)
	sds <- sd(dat)

	res <- list('vector')
	res$Coef.array <- Coef.array
	res$df.array <- df.array
	res$rss.array <- rss.array
	res$cvalues <- cvalues
	res$propvalues <- propvalues
	res$best.model.id <- NA
	#res$X <- X
	# XXXXXXXXXXXXXXXXXXXXXXXXX
	# this might cause trouble if y is not named y...
	# XXXXXXXXXXXXXXXXXXXXXXXXX
	res$train.mean.y <- means[1]
	res$train.sds.y <- sds[1]
	res$train.mean.X <- means[-1]
	res$train.sds.X <- sds[-1]

	class(res) <- "oscar.obj"

	return(res)
}

getRSS <- function(X.test, betas, y.test)
{
  res <- X.test %*% betas - y.test
  res <- sum(res*res)
  return(res)
}

findBestOscarModel <- function(CoefMatrix, dfMatrix, cvalues, propvalues, dat  )
{
  ncvalues <- length(cvalues)
  npropvalues <- length(propvalues)

  X.test <- as.data.frame(dat$test.same[, -1])
  # we need to use the scaled X values since
  # OSCAR returns the coefficients for the scaled X
  X.test.sc <- apply(X.test, FUN=scale, MARGIN=2)

  y.test <- as.data.frame(dat$test.same[, 1])
  #create empty matrix
  res.m <- matrix(0,0,8)
  res.names <- c("col.id", "c", "prop", "RSS", "df", "AIC", "BIC", "R.sq")
  # loop over all combinations of c and prop and find the best one
  for (colID in 1:ncol(CoefMatrix))
  {
    betas <- as.matrix(CoefMatrix[,colID])
    RSS <- getRSS(X.test.sc, betas, y.test-mean(y))
    cvalue <- cvalues[ceiling(colID / npropvalues)]
    tmp <- colID %% npropvalues
    propvalue <- propvalues[ ifelse(tmp == 0, npropvalues, tmp) ]
    df.val <-  dfMatrix[colID]
    aic <- N*log(RSS / N) + 2* df.val
    bic <- N*log(RSS / N) + 2* log( df.val)
    y.mean <- mean(y.test)
    TSS <- sum((y.test -y.mean)^2 )
    rsq <- 1 - RSS / TSS
    res.m <- rbind( res.m,
      as.numeric(c(  colID, cvalue, propvalue, RSS, df.val, aic, bic, rsq  ))
       )
  }
  res.df <-  as.data.frame(res.m)
  names(res.df) <- res.names
  result <- list()
  result$res.df <- res.df
  minAIC.id <- which.min(res.df$AIC)
  minAIC <- res.df$AIC[minAIC.id]
  result$minAIC <- minAIC
  result$minAIC.id <- minAIC.id
  result$minAIC.c <- res.df$c[ minAIC.id]
  result$minAIC.prop <- res.df$prop[ minAIC.id]
  return (result)
}

optOSCAR <- function(cdata, path2Mexe, k=5, 
  cvalues =  c(seq(0,0.001, 0.0001) , seq(0.002, 0.01, 0.001), seq(0.02, 0.1, 0.01), seq(0.2,1,0.1))
, propvalues =  c(seq(.0001,0.001, 0.0001) , seq(0.002, 0.01, 0.001), seq(0.02, 0.1, 0.01), seq(0.2,0.9,0.1))
)
{
  baseFolder <- paste(getwd(), "/", sep="")
  
  baseFN <- paste(baseFolder, "oscar/exchange/train/", sep="")
  outFolder=paste(baseFolder, "oscar/exchange/from_Matlab_main/", sep="")
  cvaluesFile <- paste(outFolder, "cvalues.csv", sep="")
  propvaluesFile <- paste(outFolder, "propvalues.csv", sep="")
  
  # make the directories for OSCAR:
  system("mkdir oscar")
  system("mkdir oscar/exchange")
  system("mkdir oscar/exchange/train")
  system("mkdir oscar/exchange/from_Matlab_main")     # could use mkdir - p unter linux/unix
  oldwd <- getwd()
  setwd( dirname( path2Mexe)  )
  # toDo: eleminate doubles in cvalues and propvalues
  # order the vectors...
  write.table(cvalues, file= cvaluesFile, col.names=FALSE,
        row.names=FALSE, quote=FALSE, dec=".", sep=",")
  write.table(propvalues, file= propvaluesFile,  col.names=FALSE,
        row.names=FALSE, quote=FALSE, dec=".", sep=",")
  # write data file for analysis with MatLab
  datFile = paste(baseFN, "/cdata.csv", sep="")
  write.table(cdata, file= datFile, row.names=FALSE, quote=FALSE, dec=".", sep=",")

  
  # create output file names and command
  outputCoef <- paste(outFolder, "Coef_train_cdata.csv", sep="")
  outputDf <-   paste(outFolder, "df_train_cdata.csv", sep="")
  outputRSS <-   paste(outFolder, "SS_train_cdata.csv", sep="")

	# this is somnehow redundant since we could also read X and y from the textfiles...
  X <- as.matrix(cdata[,2:ncol(cdata)])
	y <- as.matrix(unlist(cdata[,1]))
  # check if colanms present if not, set to "y"
#	if(colnames(y) == "")
	colnames(y) <- "y"
  # do the real work
  # cross validation
  oscar.cv.obj <- cv.oscar(X,y, K = 5, path2Mexe, datFile, outputCoef, outputDf, outputRSS, cvaluesFile, propvaluesFile, trace = FALSE)
	# call oscar with all data
	oscar.obj <- runOscar(path2Mexe, datFile, outputCoef, outputDf, outputRSS,
	                      cvaluesFile, propvaluesFile, trace = FALSE)
	                      
  setwd(oldwd)	                      
  return (list(oscar.cv.obj = oscar.cv.obj, oscar.obj = oscar.obj  ))
}