##############################################################################
##  Program Name:  SNPScore.adj.R
##  Purpose: 	   Create SNP-Specific Scores on a single chromosome
##                 for running GeneSetScan, adjusted by covariates
##
##  Requirements:  R, plink, basic unix shell commands
##                 case/control status and genotypes in Plink binary format)
##		   covariate file with adjustment variables for subjects,
##		     whose IDs can be matched to plink files
##                 
##  Optional:      Continuous response: change to family=gaussian in all 
##		   glm calls
##
##############################################################################

## this will get "chrom" from command line
args <- commandArgs(trailingOnly=T)
args
eval(parse(text=args))

## Report which chromosome
cat(paste("Running chromosome: ", chrom, "\n"))

## this file contains functions: create.scores.adj and create.scores.unadj
source("./create_scores.R")

## Name plink file root name
## with files mygwas.bed, mygwas.bim
plinkfile <- "/path/to/datafiles/mygwas"


## from plink binary format, pull out minor allele counts (0/1/2) for chrom

## from plink binary format, pull out minor allele counts (0/1/2) for chrom
## writes to plink.chr.raw with no --out name given
system(paste("/path/to/bin/plink --bfile ",plinkfile,
" --recodeA --chr ",chrom," --output-missing-genotype NA --out plink",
chrom,sep=""))

### read in plink data
snpdat <- read.table(paste("plink",chrom,".raw",sep=""), header=T) 
system(paste("rm plink",chrom,".*",sep=""))



### read in eigenvectors and/or covariates for adjusting
covs.mat<-read.table("/YOUR/PATH/cov_gwas.txt",header=T)


dim(snpdat)
dim(covs.mat)
names(covs.mat)
names(snpdat)[1:10]
## For example, use columns 4:7 as covariates, and iid matched IID from plink
## but not necessarily the same order

## make sure that the snpdata and the covariate data are in the same order
## and with same people

## get rid of un-matched subjects
pheno2adj <- match(covs.mat$iid, snpdat$IID)
covs.mat <- covs.mat[-which(is.na(pheno2adj)),1:7]


## order them the same as well 
## --this only works if ids are all integers
## --and if the subjects are exactly the same
ord1 <- order(covs.mat$iid)
covs.mat <- covs.mat[ord1,]  
ord2 <- order(snpdat$IID)
snpdat <- snpdat[ord2,]


nsubj <- nrow(snpdat)
cat(paste("N-subj: ", nsubj, "\n"))


## fit a model with covariates, will use residuals in create.scores.adj
phenotype <- ifelse(snpdat$PHENOTYPE==2,1,0)
adj.mat <- as.matrix(covs.mat[,4:7])
fit.null <- glm(phenotype~adj.mat,family=binomial)

## work with only SNP minor allele dosage from plink data 
snpdat <- snpdat[,7:ncol(snpdat)]

## Some snps may be missing all or most genotypes, remove them from snpdat
## GeneSetScan does handle missings well, see the next step
## Chrom Y (24 in plink) will have higher percentage missing by pct-females
## so set the cutoff missing rate at 0.8, for example
pctmiss <- 0.8
rmsnp <- which(apply(is.na(snpdat), 2, sum) > (nsubj*pctmiss))
cat(paste("N-snps to remove: ", length(rmsnp), "\n"))
if(length(rmsnp))
  snpdat <- snpdat[,-rmsnp]
 
## Get scores for all markers
## output missing value as -999.0, and is recognized by GeneSetScan with 
## par file option:  --miss_val -999.0
scores.out <- apply(snpdat, 2, create.scores.adj, phenotype, as.matrix(covs.mat[,4:7]), fit.null, -999)

## round scores to few signif digits so file size is a little smaller;
## still gives plenty of precision, 
## consider doing even 2 or 3 signif digits, which will still round to 1e-4 because some SNP scores
## are that small
ndigits <- 4
scores.out <- round(scores.out, ndigits)

## scores.out is still rows=subjects, we need to print rows=SNPs, so transpose
outdat <- t(scores.out)

## fix names to drop the minor allele char that plink added in the --recodeA option
snps <- names(snpdat)
snps <- sub("_A","",snps)
snps <- sub("_C","",snps)
snps <- sub("_T","",snps)
snps <- sub("_G","",snps)

## write scores for markers on chromosome, rows are markers
dimnames(outdat)[[1]] <- snps
write.table(outdat, paste("SNPScore",chrom,".txt",sep=""), quote=FALSE,
            row.names=TRUE, col.names=FALSE)

## clean out R session
rm(snpdat,scores.out,outdat)


## gzip file and remove remnants of work 
system(paste("gzip SNPScore",chrom,".txt",sep=""))

