#$Author: sinnwell $
#$Date: 2006/08/25 21:38:42 $
#$Header: /people/biostat3/sinnwell/Projects/LDpairs/Make/RCS/compositeLD.q,v 1.9 2006/08/25 21:38:42 sinnwell Exp $
#$Locker:  $
#$Log: compositeLD.q,v $
#Revision 1.9  2006/08/25 21:38:42  sinnwell
#include more warning messages, for all homozygous marker and highly polymorphic
#
#Revision 1.8  2004/12/03 23:01:48  sinnwell
#T to TRUE
#
#Revision 1.7  2004/12/03 20:39:55  sinnwell
#fix data.frame in allele.info for R
#
#Revision 1.6  2004/08/10 19:02:32  sinnwell
#change '_' to <-, needed for R.
#
#Revision 1.5  2004/07/27 20:15:55  sinnwell
#make a.allele and b.allele to allow character
#
#Revision 1.4  2004/07/26 23:01:32  schaid
#*** empty log message ***
#
#Revision 1.3  2003/12/05 19:11:43  sinnwell
#fix error of using alleles and index, with geno.recode, then translate
#back to original alleles at the end.
#
#Revision 1.2  2003/11/13 19:33:11  sinnwell
#make faster by vectorizing 'for' loops
#
#Revision 1.1  2003/10/14 19:52:04  sinnwell
#Initial revision
#

compositeLD <- function(a1, a2, b1, b2)
  
#  Schaid, 6/2003, Sinnwell 11/2003
# Mayo Clinic, Dept. of Biostatistics
# vectorized and working so no double for-loop to fill the cov-matrices
{
	# a1, a2 are allele vectors for first locus, 
	# b1, b2 are allele vectors for second locus, 
	# where all vectors have the same length.
	len1 <- length(a1)
	if(length(a2) != len1 | length(b1) != len1 | length(b2) != len1) {
		stop("Input allele vectors must all be the same length")
	}
	# take care of missing values
	zed <- apply(!is.na(cbind(a1, a2, b1, b2)), 1, all)
	if(sum(!zed) == length(zed)) {
		stop("all obs have at least one missing allele")
	}
	# subset to not missing
	a1 <- a1[zed]
	a2 <- a2[zed]
	b1 <- b1[zed]
	b2 <- b2[zed]

        # removing geno pairs with missings could have eliminated rare alleles
        # recode from 1 on up using geno.recode for use in gcode() because we use
        # allele numbers as an index
        tmp.geno <- geno.recode(cbind(a1,a2,b1,b2))
        a1 <- tmp.geno$grec[,1]
        a2 <- tmp.geno$grec[,2]
        b1 <- tmp.geno$grec[,3]
        b2 <- tmp.geno$grec[,4]
        a.allele <- tmp.geno$alist[[1]]$allele
        b.allele <- tmp.geno$alist[[2]]$allele

        # test for all homozygous at either locus
        tbl.a <- table(c(a1, a2))
        tbl.b <- table(c(b1, b2))
        if(length(tbl.a)==1 || length(tbl.b)==1)
          stop("one of the markers is homozygous after NAs removed")
        
        # give warning if too highly polymorphic
        if(length(tbl.a) + length(tbl.b) > 20)
          warning("One or both markers are highly polymorphic, computations may be long.")
        
        locus.info <- function(a1, a2)
          {
           # order so that a1 <= a2
            t1 <- ifelse(a1 < a2, a1, a2)
            t2 <- ifelse(a2 > a1, a2, a1)
            a1 <- t1
            a2 <- t2
           # Allele frequencies
            afreq <- table(c(a1, a2))
            afreq <- afreq/sum(afreq)
           # Determine unique alleles at locus
            a <- sort(unique(c(a1, a2)))
            tmp <- expand.grid(a, a)
            tmp <- cbind(tmp[, 2], tmp[, 1])
            tmp <- tmp[tmp[, 1] <= tmp[, 2],  ]
            u1 <- tmp[, 1]
            u2 <- tmp[, 2]
            ugeno <- gcode(u1, u2)
            geno <- factor(gcode(a1, a2), levels = ugeno)
            n.geno <- table(geno)
		# HW disequilibria
            exp.hwe <- afreq[u1] * afreq[u2]
            exp.hwe <- ifelse(u1 != u2, 2 * exp.hwe, exp.hwe)
            obs <- n.geno/sum(n.geno)
            D.hw <- (exp.hwe - obs)/2
		# fixup for homozygotes
            hom <- u1 == u2
            het <- !hom
            indx.hom <- (1:length(hom))[hom]
            for(i in indx.hom) {
              v1 <- u1[i]
              D.hw[i] <- sum(D.hw[(u1 == v1 | u2 == v1) & het])
            }

            # R had a problem with some of these vectors as tables
            # so convert necessary ones to vectors in ugeno.df data.frame
            ugeno.df = data.frame(a1 = tmp[, 1], a2 = tmp[, 2], geno = 
              ugeno, n = as.vector(n.geno), probs.obs = as.vector(obs), prob.hwe = exp.hwe,
              D.hw = as.vector(D.hw))
            
            geno.df = data.frame(a1 = a1, a2 = a2, geno = geno)
            return(list(afreq = afreq, geno.df = geno.df,
                        ugeno.df = ugeno.df))
          }

        setup.countmat <- function(n.a) {
           # make a matrix that is n.a^2 rows by n.a cols, then trim down
           # rows that represent reversed allele order
          single.mat <- matrix(rep(diag(.5,n.a),n.a),ncol=n.a,byrow=TRUE)
          block.mat <- matrix(apply(diag(.5,n.a),1,rep,times=rep(n.a,n.a)),ncol=n.a)
          mat <- single.mat + block.mat
           # make index of rows that are repeats
          rm.indx <- unlist(sapply(1:(n.a-1),indx <- function(x,n){return(x*n+1:x)},n=n.a))
          mat <- mat[-rm.indx,]
          return(mat)
        }

        D.hw <- function(a.count, b.count, c.count, geno.vec, n)
	{
		pa <- sum(a.count * geno.vec)/n
		pb <- sum(b.count * geno.vec)/n
		D.ab <- sum(c.count * geno.vec)/n - 2 * pa * pb
		return(D.ab)
	}
              
	# Now start computations
	tmp.a <- locus.info(a1, a2)
	tmp.b <- locus.info(b1, b2)
	tbl.count <- table(tmp.a$geno.df$geno, tmp.b$geno.df$geno)
        
	geno.vec <- as.vector(tbl.count)
        # Observed probs geno
	prgeno.obs <- geno.vec/sum(geno.vec)
	n <- sum(geno.vec)
	# Expected probs geno under null of no composite LD
	d.tmp <- ifelse(tmp.a$ugeno.df$a1 == tmp.a$ugeno.df$a2, tmp.a$ugeno$D.hw,
                        -2 * tmp.a$ugeno$D.hw)
	marg.a <- tmp.a$ugeno.df$prob.hwe + d.tmp
	d.tmp <- ifelse(tmp.b$ugeno.df$a1==tmp.b$ugeno.df$a2, tmp.b$ugeno$D.hw,
                        -2*tmp.b$ugeno$D.hw)
	marg.b <- tmp.b$ugeno.df$prob.hwe + d.tmp
	prgeno.null <- as.vector(marg.a %o% marg.b)
	prgenoHWE <- as.vector(tmp.a$ugeno.df$prob.hwe%o%tmp.b$ugeno.df$prob.hwe)
	n.a <- length(tmp.a$afreq)
	n.b <- length(tmp.b$afreq)

        #setup count matrices a and b, make c.counts in next loop
        a.sub <- setup.countmat(n.a)
        b.sub <- setup.countmat(n.b)
        a.counts <- matrix(apply(a.sub,2,rep,times=nrow(b.sub)),ncol=n.a)
        b.counts <- matrix(apply(b.sub,1,rep,times=nrow(a.sub)),ncol=n.b,byrow=TRUE)
        
        D.df <- NULL
        c.counts <- NULL

	for(a in 1:(n.a - 1)) {
          for(b in 1:(n.b - 1)) {
            c.vec <- ifelse(a.counts[,a]==0 | b.counts[,b]==0, 0, trunc(a.counts[,a]+b.counts[,b]))
            c.vec <- ifelse(a.counts[,a]==.5 & b.counts[,b]==.5, .5, c.vec)
            D.df <- rbind(D.df, c(a, b, D.hw(a.counts[,a], b.counts[,b],
                                    c.vec, geno.vec, n)))
            c.counts <- cbind(c.counts,c.vec)
          }
        }


        pa.vec <- t(a.counts)%*%prgeno.null
        pb.vec <- t(b.counts)%*%prgeno.null
        
        D.df <- data.frame(D.df)
	names(D.df) <- c("a", "b", "delta")
	nr <- nrow(D.df)
	delta.cov <- matrix(rep(0, nr^2), ncol = nr)
	delta.covHWE <- matrix(rep(0, nr^2), ncol = nr)
        
        # mtx is the allele pair indexed by 1:nr in indx 
        mtx <- cbind(D.df$a,D.df$b)
        
        # create 2-col index of pairs of allele pairs
        # to construct cov[i=indx[1,],j=indx[,2]]
        indx <- expand.grid(1:nr,1:nr)
        indx <- indx[!(indx[,1]<indx[,2]),]
        indx <- matrix(c(indx[,1],indx[,2]),byrow=FALSE,ncol=2)
        indx2 <- cbind(indx[,2],indx[,1])

        nr.counts <- nrow(a.counts)
        # compute dT's
        #i=indx[,1],j=indx[,2],(a1,b1)=mtx[indx[,1],] (a2,b2)=mtx[indx[,2],]
        #separate these parts b/c must duplicate pa,pv-vecs into matrices
        part1 <- 2 * (matrix(rep(pa.vec[mtx[indx[,1],1]],nr.counts),nrow=nr.counts,byrow=TRUE) *
                      b.counts[,mtx[indx[,1],2]] +
                      matrix(rep(pb.vec[mtx[indx[,1],2]],nr.counts),nrow=nr.counts,byrow=TRUE) *
                      a.counts[,mtx[indx[,1],1]])
        part2 <- 2 * (matrix(rep(pa.vec[mtx[indx[,2],1]],nr.counts),nrow=nr.counts,byrow=TRUE) *
                       b.counts[,mtx[indx[,2],2]] +
                       matrix(rep(pb.vec[mtx[indx[,2],2]],nr.counts),nrow=nr.counts,byrow=TRUE) *
                       a.counts[,mtx[indx[,2],1]])
      
        tmat1 <- (c.counts[,indx[,1]] - part1) * (c.counts[,indx[,2]] - part2)
       
        tmat <- tmat1 * prgeno.null
        tmatHWE <- tmat1 * prgenoHWE
        t.tot <- ( - t(prgeno.null)%*% c.counts[,indx[,1]]  +
                  4 * pa.vec[mtx[indx[,1],1]] * pb.vec[mtx[indx[,1],2]]) *
                 ( - t(prgeno.null)%*%c.counts[,indx[,2]] + 
                  4 * pa.vec[mtx[indx[,2],1]] * pb.vec[mtx[indx[,2],2]])
        t.totHWE <- ( - t(prgenoHWE)%*%c.counts[,indx[,1]] +
                       4 * pa.vec[mtx[indx[,1],1]] * pb.vec[mtx[indx[,1],2]]) *
                    ( - t(prgenoHWE)%*%c.counts[,indx[,2]] +
                       4 * pa.vec[mtx[indx[,2],1]] * pb.vec[mtx[indx[,1],2]])
        # sum up columns
        sum.null <- unlist(apply(tmat,2,sum))
        sum.HWE <- unlist(apply(tmatHWE,2,sum))
        # assign upper and lower-triangle of cov-mats
        delta.cov[indx2] <- delta.cov[indx] <- (sum.null - as.vector(t.tot))/n
        delta.covHWE[indx2] <- delta.covHWE[indx] <- (sum.HWE - as.vector(t.totHWE))/n

       	stat.delta <- D.df$delta^2/diag(delta.cov)
	stat.deltaHWE <- D.df$delta^2/diag(delta.covHWE)
  
	if(length(D.df$delta) == 1) {
		stat.global <- stat.delta
		stat.globalHWE <- stat.deltaHWE
		df <- 1
		dfHWE <- 1
	} else {
		tmp <- Ginv(delta.cov)
		inv <- tmp$Ginv
		df <- tmp$rank
		stat.global <- t(D.df$delta) %*% inv %*% D.df$delta
		tmp <- Ginv(delta.covHWE)
		inv <- tmp$Ginv
		dfHWE <- tmp$rank
		stat.globalHWE <- t(D.df$delta) %*% inv %*% D.df$delta
	}

        D.tbl <- D.df

        D.tbl[,1] <- a.allele[D.df[,1]]
        D.tbl[,2] <- b.allele[D.df[,2]]
      
	D.tbl = cbind(D.tbl, var.delta=diag(delta.cov), chistat = stat.delta, pval=1-pchisq(stat.delta,1))
	obj <- list(D.tbl = D.tbl, chistat.global = stat.global, df = df, 
		delta.cov = delta.cov)

	oldClass(obj) <- "compositeLD"
	return(obj)
}
