#############################################
# FASTLO                                    #
# This is a faster form of cyclic loess.    #
# It does not produce the same results as   #
# cyclic loess but does run considerably    #
# faster. The code is a modified verion of  #
# Terry's fastlo2                           #
# author: therneau                          #
# modified by: ballman                      #
# HISTORY:                                  #
#   2006 Aug 31 - version 5 branched by morlan            #
#     to use  *.log2pm files                #
# date: 06/27/03                            #
#############################################


#Copyright 2004 Mayo Clinic College of Medicine.
#This software is accepted by users "as is" and without warranties or
#guarantees of any kind.  It may be used for research purposes or in
#relation to projects with commercial applications or included in commercial
#packages, but only so long as it is not relicensed as a stand-alone
#program, and only so long as the first two sentences of this paragraph
#(copyright notice and no warranty) are reproduced with the software.


#This software is free software; you can redistribute it and/or modify it
#under the terms of the GNU General Public License as published by the Free
#Software Foundation; either version 2 of the License, or (at your
#option) any later version.

#This program is distributed in the hope that it will be useful, but
#WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
#or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
#more details.

################################################################################
## 80 columns
################################################################################

fastlo.on.disk <- function(
  input.columns=NULL, 
  output.dir=NULL,
  file.extension=NULL,                           
  subset, 
  maxit=3, 
  mfun=NULL, 
  log.it=TRUE,
  verbose=TRUE,
  epsilon = 0.01, 
  MM=F,
  ...) 
{
  version <- "5.0.0, August 2006"
  #####
  # "quick help"
  #####
    if ( is.null(input.columns) )
    {
      msg <- paste(
        "\n##########",
        "\n# fastlo()  - Version: ",version,
        "\n#   input.columns = the R object files containing the matrix column vectors to be normalied.",
        "\n#                   Alternatively, this can simply be the directory containing the files.",           
        "\n#   output.dir    = The ouput directory where the normalized column vectors will be placed.",
        "\n#   ----- (optional)",
        "\n#   file.extension = The file extension of the input vectors.",
        "\n#   subset   = index into a subset of rows to use in the normalization (default is 'all rows')",
        "\n#   maxit    = maximum number of iterations (default is 3)",
        "\n#   mfun     = Disabled for fastlo.on.disk().",
        "\n#   log.it   = Boolean to determining whether to log (base 2) matrix for normalization.",
        "\n#              The inverse log of the matrix is taken before function exits.  Default is TRUE.",           
        "\n#   verbose  = how much information to report back during process (default is TRUE)",
        "\n#   epsilon  = convergence criteria (default is 0.01)",
        "\n##########\n",
        sep="")
      cat(msg)
      invisible(FALSE)
    }
    
  if ( is.null(input.columns) ) {
    stop(paste("fastlo.on.disk:  requires input.columns to be a list of R objects or the path\n",
               "where the columns are located.\n", sep=""))
  }# END IF is.null(input.columns)

  input.dir <- NULL
  if (length(input.columns) < 2) { # then this should be the directory where the R objects are
    input.dir <- input.columns
    if ( !file.info(input.dir$isdir) ) {
      stop(paste("fastlo.on.disk:  requires input.columns to be a list of R objects or the path\n",
                 "where the columns are located.\n", sep=""))
    }
    # Grab the input columns from the input directory that match file.extension (optionally specified extension)
    input.columns <- dir(input.dir, full.names=TRUE, pattern = paste(file.extension, "$", sep=""))
  }
   
  # We also need the file names without the directory path.
  input.column.names <- NULL
  input.column.paths <- strsplit(input.columns, "/")
  for (i in 1:length(input.column.paths)) {
    # The file name sans path will be the last element of each vector in the list
    input.column.names <- c(input.column.names,
                            input.column.paths[[i]][ length(input.column.paths[[i]]) ])
  }

  ## Move the input files to a temp directory
  temp.dir <- tempdir()
  file.copy(input.columns, temp.dir, overwrite=TRUE)

  ## Now, input.columns should point to the temp paths
  input.columns <- paste(temp.dir, input.column.names, sep="/") 
  
  if ( is.null(output.dir) ) {# THEN we will put the output in the input folder
    output.dir <- input.dir
    if ( is.null(output.dir) || !file.info(input.dir)$isdir || length(outp.dir) > 1 ) {
      stop("fastlo.on.disk:  output directory not specified.\n")
    }
  }# END IF is.null(output.dir) 

  #####
  # Output directory may or may not end with '/' -- easier if it does 
  #####
  if ( substr(output.dir, nchar(output.dir), nchar(output.dir)) != "/" ) {
    output.dir <- paste(output.dir, "/", sep="")
    # The following should never happen, but just in case...
    if ( !file.info(output.dir)$isdir ) {
      stop("ERROR: fastlo.on.disk can't determine output directory.\n")
    }
  }
  
  #####
  # Backward compatibility ... mfun is no longer supported.
  #####
  if ( !is.null(mfun) ) {
    errmsg <- paste("fastlo.by.column() does not support the 'mfun' functionality.\n",
                    "Full matrix must be passed to fastlo to use function other than rowMeans.\n",sep="")
    cat(errmsg)
    stop(errmsg)
  }
  
  #####
  # lc is a loess.control object used later to manage loess()
  #####
  lc<-loess.control(surface="interpolate",statistics="approximate",trace.hat="approximate",cell=0.2,iterations=4)   
 
  #####
  # some structure ...
  #
  #    nChips          = the number of chips (columns)
  #    nSpots         = the number of spots on a chip (rows or elements in each column vector)
  #####

  # Load the first chip, capture it's object name, and assign it to curChip
  curChip <- eval(parse(text = load(input.columns[1])))

  # need a new way to calculate nSpots:  read first and calculate length
  nChips <- length(input.columns)
  nSpots <- length(curChip)
  
  if ( verbose ) {
    cat("\n      nChips  = ",nChips,
        "\n      nSpots = ",nSpots,
        sep="")
  }
  
  #####
  # subset is never used, but if it were, it would select a subset of the probes to use in the normalization process
  #####
  if(!missing(subset))
    {# THEN we are working with a subset
      if(length(subset)==1) 
        {# THEN subset is the number to be used, and we generate a random subset index of that length
        subset <- sample(1:nSpots, subset)
      } else {
        if(any(subset < 1 | subset > nSpots))
          {# THEN subset contains an/some invalid value/s
          errmsg <- "fastlo() - invalid rows in the subset argument"
          stop(errmsg)
        }
      }
    } else {
     # ELSE when subset = missing takes all of the rows                  
        subset <- 1:nSpots
    }
    
  #####
  # w is the weights for loess fit
  #####
    w <- c(0,rep(1,length(subset)),0) 
    
    yhat <- NULL
  
    changes <- rep(0,nChips)
    
    
  #####
  # setting up, we find the initial yhat
  #####
    ysum <- vector("numeric",nSpots)
    if(verbose) 
    {
      cat("\n      Initialize yhat: ",rep(".",nChips),
          "\n                       ",
        sep="")   
    }
    for ( iChip in 1:nChips ) {# FOR EACH chip
      if ( verbose ) { cat("o") }

      if (iChip > 1) {# we already loaded the first chip 
        curChip <- eval(parse(text = load(input.columns[iChip])))
      }
      
      if ( is.matrix(curChip) )
      {# THEN we may be in trouble (if there are more than 1 column)
        if ( dim(curChip)[[2]] > 1 )
        {
          errmsg <- "ERROR: fastlo.on.disk() - while initializing yhat the column objects should be the original vectors or matrices with second dimension=1."
          stop(errmsg)
        } else {
         # ELSE convert log2pm to a vector
          curChip <- c(curChip)
        }# END IF dim(curChip)[[2]]>1
      }# END IF is.matrix(curChip)
      if (log.it) {
        # Test requested by Bruce Morlan
        max.value <- max(curChip)
        if ( max.value < 15 ) {
          cat(paste("WARNING:  fastlo.on.disk: Maximum value of input vector is less than 15.\n",
                    "Make sure you're not double logging.\n"))
        }
        curChip <- logb(curChip, 2)
      }
      ysum <- ysum + curChip
      if (verbose) cat("      ++")  #cat("\08+")  --JPS changed to work in R214
    }# NEXT chip
    yhat <- ysum / nChips
    ###
    # To avoid NA's due to extrapolation, the lowess has to include the largest and smallest yhat
    ###
  temp  <- order(yhat)
  index <- c(temp[1], subset, temp[nSpots])

  #####
  # Iterate.
  #####  
  for (iter in 1:maxit) 
    {
      if(verbose) 
        {
          cat("\n      Iteration ", format(iter,width=2),": ",rep(".",nChips),sep="")
          cat("\n                    ")   
        }
      
#####
# first, get next yhat
#####
      ysum <- vector("numeric",nSpots)
      for ( iChip in 1:nChips ) {# FOR EACH chip
          if ( verbose ) cat("o")

          # After first iteration, input.columns refers to temp space
          curChip <- eval(parse(text = load(input.columns[iChip])))
          
          if ( iter == 1 )
            {# THEN this is the first time we have read this data, so smooth is a vector of 0's
              old.smooth <- rep(0,nSpots)
              if (log.it) { curChip <- logb(curChip, 2) }
            } else {
              # ELSE the curChip file contains both pm and smooth information
              old.smooth <- curChip[,2]
              curChip     <- curChip[,1]
            }
          if ( length(ysum) != length(curChip) )
            {
              errmsg <- paste("\nERROR: fastlo(): length(ysum)=",length(ysum)," is NOT equal to length(curChip)",length(curChip),") for chip=",thisChip,sep="")
              stop(errmsg)
            }
          tfit   <- loess(c(curChip[index] - yhat[index]) ~ yhat[index], degree=1,weights=w,control=lc)
          smooth <- predict(tfit, yhat)
          ysum   <- ysum + curChip - smooth
          curChip <- cbind(curChip, smooth)

          save(curChip, file = input.columns[iChip])
          changes[iChip] <- mean( ( smooth[subset] - old.smooth[subset] )^2 )
          if (verbose) cat("      ++")  #cat("\08+")--JPS changed to work in R214
        }# NEXT chip                    
###
# All subsequent passes we will read from the tmp location and from the working *.normalized file.
###
#      tmpLog2PMDataPath <- temp.dir
      file.extension    <- ".normalized"
      yhat <- ysum / nChips

###
# To avoid NA's due to extrapolation, the lowess has to include the largest and smallest yhat
###
      temp  <- order(yhat)
      index <- c(temp[1], subset, temp[nSpots])     
      
      change <- max( changes )
      if (verbose) cat("\n      Finished, change = ", format(change),sep="")
      if ( change <= epsilon ) break
    } # NEXT iter

  if (verbose) cat("\n")

#####
# a final adjustment, and we now place the files into the real directory
#####
  for ( iChip in 1:nChips )
    {# FOR EACH chip
      curChip <- eval(parse(text = load(input.columns[iChip])))

      normalized.filename <- paste(gsub(paste(temp.dir, "/", sep="")
                                        , output.dir, input.columns[iChip])
                                   , ".normalized", sep="")
      y      <- curChip[,1]
      smooth <- curChip[,2]
      pmnorm <- y - smooth
      if (log.it) { pmnorm <- 2^pmnorm }
      save(pmnorm, file = normalized.filename)
    }# NEXT chip

  # ELSE we return the path that was used to build the .normalized files
  return(paste("Normalized files for ",nChips," chips are found in '",output.dir,"'",sep=""))

} # END FUNCTION fastlo.on.disk
