# creates QC boxplots of raw and normalized expression counts
# USAGE: expression_QC_boxplots.R <Mature_miRNA_expression.xls> <output dir>

############################################################################
## Description:
## a script to create QC boxplots of raw and normalized expression counts.
## This is useful for detecting outlier samples before proceeding with differential expression
##
## Author: Jared Evans
## Date: 5/22/14
##
## Parameters:
## <Mature_miRNA_expression.xls> - miRNA expression table output by CAP-miRSeq
## <output_dir> - Directory where QC plot should be written
##
############################################################################


# Collect only the user-supplied (trailing) command-line arguments.
cli.args <- commandArgs(trailingOnly = TRUE)

# Exactly two positional arguments are required:
#   1. path to the expression table, 2. directory for the output PDF.
n.args <- length(cli.args)
if (n.args != 2) {
	stop("ERROR! Incorrect number of arguments. \nUSAGE: expression_QC_boxplots.R <Mature_miRNA_expression.xls> <output dir>")
}

raw.exprs.path <- cli.args[[1]]
output.dir <- cli.args[[2]]

library(edgeR)

# load expression table: the first column (Mature.miRNA) is used as the row id
# and every remaining column is treated as one sample's raw counts
expression.raw <- read.table(raw.exprs.path, sep = "\t", header = TRUE, stringsAsFactors = FALSE)
# assign unique mature_precursor id to each row
row.names(expression.raw) <- expression.raw$Mature.miRNA

# fail early with a clear message when the table has no sample columns
if (length(expression.raw) < 2) {
	stop("ERROR! Expression table has no sample columns: ", raw.exprs.path)
}
# count columns only (everything after the id column), kept as a data frame
raw.counts <- expression.raw[-1]

# get raw expression boxplot (this just illustrates the need for normalization)
pdf(file.path(output.dir, "expression_boxplots.pdf"), width = 11, height = 8.5)
par(mar = c(10, 6, 5, 3))
# plot log2, plus remove any rows that are all zeros
# (individual zero counts in the remaining rows still become -Inf under log2);
# drop = FALSE keeps a data frame so a single sample column is still plotted
# as its own labelled box
boxplot(log2(raw.counts[rowSums(raw.counts) != 0, , drop = FALSE]), las = 2, xlab = "", ylab = "Raw log2 Counts", main = "Raw Expression Counts")

# normalize: edgeR normalization factors, then counts per million (CPM)
# computed on the normalized library sizes
expression.raw.dglist <- DGEList(raw.counts)
expression.raw.dglist <- calcNormFactors(expression.raw.dglist)
normcnt <- cpm(expression.raw.dglist, normalized.lib.sizes = TRUE)
# keep miRNAs whose normalized CPM is above 5 in enough samples.
# With n sample columns the cutoff is (n - 1) / 2, so a miRNA passes when it
# exceeds 5 in at least half of the samples.
# NOTE(review): the plot title below says "more than half"; with an even number
# of samples exactly half also passes -- confirm which is intended.
keep <- rowSums(normcnt > 5) > (length(expression.raw) - 2) / 2

# nothing to plot: close the PDF (so the raw plot is still written) and stop
# with an explicit message instead of failing inside boxplot()
if (!any(keep)) {
	dev.off()
	stop("ERROR! No miRNAs passed the expression filter; cannot draw the normalized boxplot.")
}

# display normalized, filtered counts; drop = FALSE keeps a matrix so that a
# single surviving miRNA is still drawn as one box per sample
boxplot(log2(normcnt[keep, , drop = FALSE]), xlab = "", ylab = "Normalized log2 Counts", main = "Normalized Counts: \n miRNAs with > 5 counts in more than half of the samples", las = 2)
dev.off()



