#$Author: sinnwell $
#$Date: 2011/02/25 16:37:23 $
#$Header: /projects/genetics/cvs/cvsroot/mend.err/R/sge.jackknife.q,v 1.2 2011/02/25 16:37:23 sinnwell Exp $
#$Locker:  $
#$Log: sge.jackknife.q,v $
#Revision 1.2  2011/02/25 16:37:23  sinnwell
#long to int, add x.sexcheck, T to TRUE
#
#Revision 1.1.1.1  2011/02/16 14:33:20  sinnwell
#initial for mend.err package
#
#Revision 1.4  2009/03/02 22:29:07  sinnwell
#add PACKAGE to .C call
#
#Revision 1.3  2009/02/25 22:13:49  sinnwell
#add keywords
#
# This function takes an object x of class "sge" and performs jackknifing at
# all loci that have Mendelian Errors.  If x has no loci with Mendelian Errors,
# then no action is performed.  If x does have loci with Mendelian Errors, then
# all people that have one or both allele codes known at a problematic locus
# will be systematically removed to determine whether this eliminates Menelian
# Errors at the locus in question.

sge.jackknife <- function(x,print.stdio=FALSE)        # Object of class "sge"
{
 # Check that x is an object of class "sge": 
 if(class(x) != "sge")
    {
     warning("Input is not an object of class sge.")
     return(NULL)
    }

  # Check if there are Mendelian Errors to jackknife:
  if(!any(x$err.locus))
    {
     warning("Input contains no Mendelian Errors to jackknife.")
     return(NULL)
    }

  # Retrieve values from x that will be used:
  ped             <- x$ped
  geno            <- x$geno
  miss.val        <- x$miss.val
  male.code       <- x$male.code
  female.code     <- x$female.code
  nLoci           <- x$nLoci
  loci.used       <- x$loci.used
  err.locus       <- x$err.locus
  pedigree.column <- x$pedigree.column
  person.column   <- x$person.column
  labID.column    <- x$labID.column
  father.column   <- x$father.column
  mother.column   <- x$mother.column
  sex.column      <- x$sex.column 

  # loci.err is the loci that give Mendelian Errors:

  loci.err        <- loci.used[err.locus != 0]

  # Gather the loci that cause Mendelian Errors:
  index <- NULL
  for(i in loci.err)
     {
      index <- c(index,2*i-1,2*i)
     }

  # The column names are no longer needed, so removing them suppresses warning
  # messages in S-Plus in the event that index columns have duplicate names:

 if(exists("is.R") && is.function(is.R) && !is.R())
   names(geno) <- NULL

  geno <- x$geno[,index]

  # Retrieve needed columns of the pedigree given by ped:
  person <- ped[,person.column]
  father <- ped[,father.column]
  mother <- ped[,mother.column]
  sex    <- ped[,sex.column]
    
  # Proper sex not specified.
  if(any((sex != male.code) & (sex != female.code)))
    warning("One or more subjects do not have there sex correctly specified.")
  
  if(is.factor(sex))
    {
     sex <- as.character(sex)
    }

  # Unspecified sex defaults to female
  sex[(sex != male.code) & (sex != female.code)] <- 0

  # Match male to sex code 1, and females to sex code 0
  sex[sex == male.code] <-   1
  sex[sex == female.code] <- 0

  # We will well-order the person, father, and mother vectors by considering
  # the entries of the person vector to be an ordering and then mapping this
  # ordering to 1,2,...,n where n = length(person):
    
  if(is.factor(person))
    {
     # Convert factors to characters
     person  <- as.character(person)
     father  <- as.character(father)
     mother  <- as.character(mother)
     
     # All the codes involved in the pedigree should be present the in
     # person array:
     lev <- unique(person)
     IDcodes <- as.numeric(factor(c(father,mother,person),
                                  levels=lev,exclude=miss.val))
     IDcodes[is.na(IDcodes)] <- 0
    }  
  else
    {     
     # All the codes involved in the pedigree should be present the in
     # person array:
     lev     <- unique(person)
     IDcodes <- as.numeric(factor(c(father,mother,person),
                                  levels=lev,exclude=miss.val))
     IDcodes[is.na(IDcodes)] <- 0
   }
       
  n              <- length(IDcodes)
  n              <- n/3
  father         <- IDcodes[1:n]
  mother         <- IDcodes[(n+1):(2*n)]
  person         <- IDcodes[(2*n+1):(3*n)]

  # Figure out the number of loci that we have, and set up some initial
  # stubs:
  
  nAllele         <- integer(0)
  nLoci           <- (ncol(geno)) / 2
  loci            <- 1:nLoci
  pheno           <- NULL
  phenoCats       <- list(NULL)
  genoMaps        <- list(NULL)
  lengthPhenoCats <- NULL
  lengthGenoMaps  <- NULL
  genoSize        <- 0
  k1              <- 1
  k2              <- 1
  allele.labels   <- list(NULL)
  typed           <- list(NULL)
  n.typed         <- 0
  n.results       <- 0

  for(i in loci)
    {
     tmp1 <- geno[,2*i-1]
     tmp2 <- geno[,2*i]
     loc  <- locus(allele1=tmp1, allele2=tmp2, 
                   male.code=male.code, female.code=female.code,
                   miss.val=miss.val)
     
     tmp  <- c(loc[,1],loc[,2])
     if(any(tmp[!is.na(tmp)] == 0))
       tmp[!is.na(tmp)] <- tmp[!is.na(tmp)] + 1
     
     tmp[is.na(tmp)] <- 0
     tmp1 <- tmp[1:n]
     tmp2 <- tmp[(n+1):(2*n)]

     # a1 < a2
     a1   <- ifelse(tmp1 < tmp2, tmp1, tmp2)
     a2   <- ifelse(tmp1 < tmp2, tmp2, tmp1)
     a1a2 <- cbind(a1,a2)
     
     # Missing two alleles:
     missingTwo  <-  a2 == 0

     # Subjects not missing two alleles:
     typed[[i]]  <- person[!missingTwo]
     n.typed[i]  <- length(typed[[i]])
     n.results   <- n.results + length(typed[[i]])

     # The following is a placeholder to prevent segmentation faults. It will
     # need to be replaced by a smarter solution when one is devised.
     if(all(missingTwo))
       {
        msg <- paste("All sujects are untyped at locus",i,
                     "-  No genotype elimination possible.")
        warning(msg)
        return("Finished")
       }

     # Tells if any subjects are missing two alleles:
     if(any(missingTwo)) m2 <- TRUE else m2 <- FALSE
         
     missingOne  <- (a1 == 0) & (!(a2 == 0))
     # Tells if any subjects are missing one allele:                        
     if(any(missingOne)) m1 <- TRUE else m1 <- FALSE
                       
     missingZero <- (!missingOne) & (!missingTwo)
     # Tells if any subjects are missing zero alleles:
     if(any(missingZero)) mz <- TRUE else mz <- FALSE
   
     if(all(missingZero))
        { nAllele[i] <- length(unique(c(a1,a2))) }
     else
        { nAllele[i] <- length(unique(c(a1,a2))) - 1 }

     phenoTmp <- gcode(a1,a2)

     if(m1)
       {
        # maxmzp1 means maximum missing zero plus 1
        maxmzp1 = max(phenoTmp) + 1
        phenoTmp[missingOne] = phenoTmp[missingOne] + maxmzp1 
       }

     # The genotypes now all have unique phenotype numbers
     lev      <- unique(phenoTmp)
     phenoTmp <- as.numeric(factor(phenoTmp, levels=lev)) - 1
     pheno    <- cbind(pheno, phenoTmp)

     # Form k-by-2 and k-by-3 matrices with column one containing the phenotype 
     # code and the remaining columns containing the allele codes:

     # Missing zero alleles:
                       
     if(mz)
       {
        strictGenotypes  <- cbind(pheno[missingZero, i], 
                                  matrix(a1a2[missingZero,], ncol=2))
        if(is.vector(strictGenotypes)) 
          strictGenotypes <- matrix(strictGenotypes, nrow=1)
       }

     # Missing one allele:
                       
     if(m1)
       {
        halfGenotypes    <- cbind(pheno[missingOne, i], a2[missingOne])
        if(is.vector(halfGenotypes)) 
          halfGenotypes <- matrix(halfGenotypes, nrow=1)
       }

     # Missing two alleles:
                       
     if(m2)
       {
        missingGenotypes <- cbind(pheno[missingTwo, i], 
                                  matrix(a1a2[missingTwo,], ncol=2))
        if(is.vector(missingGenotypes)) 
          missingGenotypes <- matrix(missingGenotypes, nrow=1)
       }         

     # Need to subset the above three matrices using column 1 as a key and 
     # finding the subset of rows with unique entries in column 1:

     # Missing zero alleles:
     
     if(mz)
       {
        index <- match(unique(strictGenotypes[,1]), strictGenotypes[,1])
        strictGenotypes <- strictGenotypes[index,]
        if(is.vector(strictGenotypes)) 
          strictGenotypes <- matrix(strictGenotypes, nrow=1)
       }

     # Missing one allele:
     
     if(m1)
       {
        index <- match(unique(halfGenotypes[,1]), halfGenotypes[,1])
        halfGenotypes <- halfGenotypes[index,]
        if(is.vector(halfGenotypes)) 
          halfGenotypes <- matrix(halfGenotypes, nrow=1)
       }

     # Missing two alleles:

     if(m2)
       {
        index <- match(unique(missingGenotypes[,1]), missingGenotypes[,1])
        missingGenotypes <- missingGenotypes[index,]
        if(is.vector(missingGenotypes)) 
          missingGenotypes <- matrix(missingGenotypes, nrow=1)
       }

     phenoCats[[k1]]  <- c(if(m2) missingGenotypes[,1],
                          if(m1) halfGenotypes[,1],
                          if(mz) strictGenotypes[,1])
   
     lengthPhenoCats[i] <- length(phenoCats[[k1]])
     k1 <- k1 + 1

     # Form the genotype mappings:

     # Missing two alleles:

     if(m2)
       {
        alleles <- 1:nAllele[i]
        tmp <- expand.grid(alleles,alleles)
        t1 <- tmp[,2]
        t2 <- tmp[,1]
        tmp <- cbind(t1,t2)
        tmp <- tmp[tmp[,1] <= tmp[,2],]
        genoMaps[[k2]] <- as.vector(t(tmp))
        lengthGenoMaps[[k2]] <- length(genoMaps[[k2]])
        k2 <- k2 + 1
       }

     # Missing one allele:

     if(m1)
       {
        alleles <- 1:nAllele[i]
        tmp <- expand.grid(alleles, halfGenotypes[,2])
        genoMaps[[k2]] <- as.vector(t(tmp))
        lengthGenoMaps[[k2]] <- list(rep(2*nAllele[i],
                                     length(halfGenotypes[,1])))
        k2 <- k2 + 1
       }

     # Missing zero alleles:

     if(mz)
       {
        tmp <- strictGenotypes[,2:3]
        genoMaps[[k2]] <- as.vector(t(tmp))
        lengthGenoMaps[[k2]] <- list(rep(2,length(strictGenotypes[,1])))
        k2 <- k2 + 1
       }
    # Size of the memory passed to C for return size:

    genoSize <- genoSize +
                2*sum(as.numeric(missingZero)) +
                2*nAllele[i]*sum(as.numeric(missingOne)) +
                nAllele[i] * (nAllele[i]+1) * sum(as.numeric(missingTwo))
    }

  # Transform the arguments to vectors for passage to the C program
  # "genotype_elimination:
  
  pheno            <- as.vector(pheno)
  phenoCats        <- unlist(phenoCats)
  nLengthPhenoCats <- length(lengthPhenoCats)

  genoMaps         <- unlist(genoMaps)
  lengthGenoMaps   <- unlist(lengthGenoMaps)
  nLengthGenoMaps  <- length(lengthGenoMaps)

  typedID          <- unlist(typed)
  sort.loci        <- FALSE
  flags            <- c(sort.loci, print.stdio)
  results          <- rep(0,n.results)

  ans <- 
  .C("genotype_elimination_jackknife",
     as.integer(person),
     as.integer(father),
     as.integer(mother),
     as.integer(sex),
     as.integer(n),
     as.integer(nLoci),
     as.integer(nAllele),
     as.integer(pheno),
     as.integer(phenoCats),
     as.integer(lengthPhenoCats),
     as.integer(nLengthPhenoCats),
     as.integer(genoMaps),
     as.integer(lengthGenoMaps),
     as.integer(nLengthGenoMaps),
     as.integer(flags),
     as.integer(typedID),
     as.integer(n.typed),
     as.integer(length(n.typed)),
     err.fatal    = integer(1),
     results      = as.integer(results),
     PACKAGES="mend.err")

  if(ans$err.fatal)
    { # Got a memory-related error:
      msg <- paste("Fatal program error detected.  Try decreasing batch.size ",
                   "or checking the\n pedigree format of your data.  If the ",
                   "problem persists, please report this error.\n",sep="")
      stop(msg)
    }

 ret             <- list(NULL)
 ret$orig        <- x
 ret$typed       <- typed
 ret$n.typed     <- n.typed
 ret$loci        <- loci.err
 ret$results     <- ans$results

 if(exists("is.R") && is.function(is.R) && is.R())
   class(ret)    <- "sge.jackknife"   
 else
   oldClass(ret) <- "sge.jackknife"

  
  return(ret)
 }
  
