

# Trailing command-line arguments only (interpreter options are dropped).
stdin <- commandArgs(trailingOnly = TRUE)

# The script takes exactly six positional arguments; abort with the usage
# text when the count is wrong.
if (length(stdin) != 6) {
  stop("ERROR! Incorrect number of arguments. \nUSAGE: RVboost.R [input VCF file] [library paths] [path to the script] [hapmap TRUE list] [model] [output directory]")
}

### Positional arguments, in the order listed in the usage text
inputVCF   <- stdin[[1]]  # input VCF file
library    <- stdin[[2]]  # R library path(s), later handed to .libPaths()
scriptPath <- stdin[[3]]  # directory that contains the funcs/ helper scripts
hapmap     <- stdin[[4]]  # "hapmap TRUE list" file, passed to the model fit
model      <- stdin[[5]]  # NOTE(review): not referenced in the visible code
output     <- stdin[[6]]  # directory that receives the score files

### Set the R library search path, then attach the required packages.
# `library` on the next line is the character path taken from the command
# line, not the base function.
.libPaths(library)

# Use library() rather than require(): require() only warns and returns FALSE
# when a package is missing, which lets the script run on and fail later with
# an unrelated-looking error. library() stops immediately with a clear message.
# The global character variable named `library` does not get in the way here,
# because R skips non-function objects when resolving the name in a call.
library(VariantAnnotation)
library(gbm)
library(mgcv)

### Load the required helper scripts from <scriptPath>/funcs
# Each path is built first and then sourced; the helper files are expected to
# provide the parseVCF() and fitRVmodel() functions called further down.
parseVCF <- paste0(scriptPath, "/", "funcs/parseVCF.R")
source(file = parseVCF)

fitRVmodel <- paste0(scriptPath, "/", "funcs/fitRVmodel.R")
source(file = fitRVmodel)

### VCF attributes used as predictors when building the model
sel.attri <- c(
  "DJ",
  "PctExtPos",
  "ReadPosRankSum",
  "QD",
  "FS",
  "ED"
)

# Parse the input VCF with the sourced helper, requesting only the attributes
# listed above.
parseRNA.res <- parseVCF(inputVCF, sel.info.attr = sel.attri)

# Work on a copy of the attribute matrix returned by the parser.
tmp.mtx <- parseRNA.res$attri.mtx

# Replace ReadPosRankSum by its absolute value so that the attribute is
# monotonic.
rprs.col <- "ReadPosRankSum"
tmp.mtx[, rprs.col] <- abs(tmp.mtx[, rprs.col])

#=== imputation of missing values
# Every NA in a column is replaced by the median of that column's observed
# (non-NA) values.
#  - seq_len() is used instead of 1:ncol() so a zero-column matrix is skipped
#    rather than iterating over c(1, 0).
#  - The presence of NAs is tested directly; the previous guard
#    `!is.null(which(...))` was always TRUE because which() returns
#    integer(0), never NULL.
#  - A column with no observed values has no median to impute from; it is left
#    unchanged (as before) but is now reported with a warning.
for (k in seq_len(ncol(tmp.mtx))) {
  is.missing <- is.na(tmp.mtx[, k])

  if (!any(is.missing)) {
    next
  }

  if (all(is.missing)) {
    warning(
      sprintf(
        "Column %d of the attribute matrix contains only missing values and was not imputed.",
        k
      ),
      call. = FALSE
    )
    next
  }

  tmp.mtx[is.missing, k] <- median(tmp.mtx[!is.missing, k])
}

#=== fit adaboost model
# Delegates to the sourced fitRVmodel() helper. It receives the imputed
# attribute matrix, the hapmap file given on the command line, the variant
# positions returned by the parser, and ada.n = 2e4.
# NOTE(review): the meaning of DB.ID = "pos" and of ada.n is defined in
# funcs/fitRVmodel.R, which is not visible here -- confirm there.
fit.res <- fitRVmodel(
  input.mtx   = tmp.mtx,
  DB.filename = hapmap,
  DB.ID       = "pos",
  pos.vec     = parseRNA.res$pos.vec,
  ada.n       = 2e4
)

#=== compute SNP.conf score
# Build the empirical CDF of the fitted values of the variants whose training
# label equals 1, then evaluate that CDF at every variant's fitted value.
RVboost.ECDF <- ecdf(fit.res$fitted.values[which(fit.res$train.label == 1)])
RVboost.Q.score <- RVboost.ECDF(fit.res$fitted.values)

# Create the output directory if it does not exist yet, so the results of the
# model fit are not lost to a failing write.table() call.
if (!file.exists(output)) {
  dir.create(output, recursive = TRUE)
}

# One value per line, without header, row names or quoting.
score <- file.path(output, "original_score.txt")
write.table(fit.res$fitted.values, file = score,
            col.names = FALSE, row.names = FALSE, quote = FALSE)

Qscore <- file.path(output, "RV.Qscore.txt")
write.table(RVboost.Q.score, file = Qscore,
            col.names = FALSE, row.names = FALSE, quote = FALSE)

#=== Final outputs exported
#=== 1. fit.res$fitted.values : original adaboost scores (original_score.txt)
#=== 2. RVboost.Q.score : SNP% confidence score (RV.Qscore.txt)
