#!/bin/bash

############################################################################
## Description:
## a script to create raw, normalized, and novel miRNA expression reports
##
## Author: Jared Evans
## Date: 5/22/14
##
## Parameters:
## <input dir> - Directory where per-sample miRDeep2 results are located
## <output_dir> - Directory where merged excel sheets should be written
## <sample names(s1:s2)> - Colon-separated list of sample names
## <script path> - Path to where the CAP-miRSeq scripts are located
##
############################################################################



#######################################
# Merge per-sample miRDeep2 results into raw, normalized, unique-mature and
# novel miRNA expression reports.
#
# Arguments:
#   $1 - input dir: contains one sub-directory per sample holding
#        miRNAs_expressed_all_samples*.csv and result*.bed
#   $2 - output dir: must already exist; the reports are written here
#   $3 - colon separated sample names (s1:s2); empty entries are ignored
#   $4 - script path: directory holding uniq_mature_mirna.pl and
#        novel_mirna.pl
# Outputs:
#   $2/miRNA_expression_raw.xls, $2/miRNA_expression_norm.xls,
#   $2/mature_miRNA_expression.xls, $2/novel_miRNA.xls.
#   "ERROR : <file> is empty!" is printed on stdout for every report that is
#   missing or empty; the usage message goes to stderr.
# Returns:
#   0 when all four reports are non-empty; 1 on a wrong argument count, an
#   empty sample list, or when any report is missing/empty.
#######################################
main() {
  if [[ $# -ne 4 ]]; then
    echo "usage: <input dir> <output_dir> <sample names(s1:s2)> <script path>" >&2
    return 1
  fi

  # Trace every command from here on (kept from the original script).
  set -x
  local input_dir=$1
  local output_dir=$2
  local samples=$3
  local script_path=$4

  local raw_report="$output_dir/miRNA_expression_raw.xls"
  local norm_report="$output_dir/miRNA_expression_norm.xls"
  local mature_report="$output_dir/mature_miRNA_expression.xls"
  local novel_report="$output_dir/novel_miRNA.xls"
  local bed_list="$output_dir/beds.txt"
  local merged="$output_dir/report.tmp.xls"

  # Split the sample list on ':' without word-splitting or globbing the names.
  local -a fields=() sample_list=()
  local name
  IFS=':' read -r -a fields <<< "$samples"
  for name in "${fields[@]}"; do
    if [[ -n "$name" ]]; then
      sample_list+=("$name")
    fi
  done
  if (( ${#sample_list[@]} == 0 )); then
    echo "ERROR : no sample names given" >&2
    return 1
  fi

  # Annotation columns (mature name, precursor, miRBase hyperlink) are taken
  # from the first sample's expression table; sed drops its header line.
  local -a expr_files bed_files novel_files
  expr_files=("$input_dir/${sample_list[0]}"/miRNAs_expressed_all_samples*.csv)
  printf 'Mature miRNA\tPrecursor\tmiRBase link\n' > "$raw_report"
  sed 1d "${expr_files[@]}" \
    | cut -f1,3 \
    | awk '{print $1"\t"$2"\t=Hyperlink(\"http://www.mirbase.org/cgi-bin/query.pl?terms="$2"\",\"miRBase\")"}' \
    >> "$raw_report"
  cat "$raw_report" > "$norm_report"

  # Start from an empty bed list so entries left by an aborted run are dropped.
  : > "$bed_list"

  # Loop through each sample and extract the expression counts from the
  # miRDeep2 output: column 5 feeds the raw report, column 6 the normalized one.
  local sample spec kind field report column_tmp
  for sample in "${sample_list[@]}"; do
    expr_files=("$input_dir/$sample"/miRNAs_expressed_all_samples*.csv)
    bed_files=("$input_dir/$sample"/result*.bed)
    # One line per sample; several matches stay space-joined on that line.
    printf '%s\n' "${bed_files[*]}" >> "$bed_list"

    for spec in raw:5 norm:6; do
      kind=${spec%%:*}
      field=${spec##*:}
      report="$output_dir/miRNA_expression_${kind}.xls"
      column_tmp="$output_dir/${sample}.expressed.${kind}.tmp"
      # Column = sample name as header followed by that sample's values.
      {
        printf '%s\n' "$sample"
        sed 1d "${expr_files[@]}" | cut -f"$field"
      } > "$column_tmp"
      paste "$report" "$column_tmp" > "$report.tmp"
      mv "$report.tmp" "$report"
      rm "$column_tmp"
    done
  done

  # NOTE: miRDeep2 expression reports sometimes contain duplicate rows; they
  # are deliberately left in the raw and normalized reports.

  # Unique mature miRNA report (external script; reads the raw report).
  "$script_path/uniq_mature_mirna.pl" "$raw_report" "$mature_report"

  # Novel miRNA report: novel_mirna.pl is expected to leave *.novel.tmp files
  # in the output dir. Columns 1-3 come from the first file, columns 4-6 are
  # appended from every file.
  perl "$script_path/novel_mirna.pl" "$bed_list" "$output_dir"
  novel_files=("$output_dir"/*.novel.tmp)
  if [[ -e "${novel_files[0]}" ]]; then
    cut -f1-3 "${novel_files[0]}" > "$merged"
    local novel_file
    for novel_file in "${novel_files[@]}"; do
      cut -f4-6 "$novel_file" | paste "$merged" - > "$output_dir/report.tmp"
      mv "$output_dir/report.tmp" "$merged"
    done
    head -n1 "$merged" > "$novel_report"
    # Zero-pad chr1..chr9 so a plain sort orders them before chr10, then
    # strip the padding again.
    sed 1d "$merged" \
      | perl -p -e 's/^chr([1-9])\b/chr0$1/g' \
      | sort -k1,1 -k2,2n \
      | perl -p -e 's/^chr0([1-9])\b/chr$1/g' \
      >> "$novel_report"
    rm -f -- "${novel_files[@]}" "$merged"
  else
    # Nothing to merge: leave an empty report so the check below flags it
    # (previously a bare `cat` would have waited on stdin here).
    : > "$novel_report"
  fi
  rm -f -- "$bed_list"

  # Check that the expected output files exist and are non-empty.
  local status=0
  for report in "$raw_report" "$norm_report" "$mature_report" "$novel_report"; do
    if [[ ! -s "$report" ]]; then
      printf '%s\n' "ERROR : $report is empty!"
      status=1
    fi
  done
  return "$status"
}

main "$@"


