#!/bin/bash
### baheti.saurabh@mayo.edu
### Saurabh Baheti
### last updated: NOV 07 2012

if [ $# != 6 ]
then
	# Usage text lists the six positional arguments in the order the script reads them ($1..$6).
	echo -e "script to preprocess BAM file\nUsage: ./ProcessBAM.sh <input directory> <input BAM file(s), comma separated> <sample name> <output directory> <configuration file> <flag for aligner type>"
	exit 1;
else
	set -x
	START=$(date +%s)

	# Positional arguments.
	input=$1	# directory that holds the input BAM file(s)
	bam=$2		# comma-separated list of BAM file names inside $input
	sample=$3	# sample name; used as prefix for every output file
	align=$4	# directory where all outputs (and logs/) are written
	config=$5	# KEY=VALUE configuration file
	flag=$6		# 1-based position of the aligner inside the colon-separated ALIGNER entry

	logs=$align/logs
	mkdir -p "$logs"

	# Print the value part of the "KEY=" line(s) of the configuration file.
	# $1 - key name; every occurrence of "KEY=" is stripped from matching lines.
	get_config_value() {
		grep "^$1=" "$config" | sed -e "s/$1=//g"
	}

	# ALIGNER is a colon-separated list; pick entry number $flag, lower-cased.
	aligner=$(get_config_value ALIGNER | tr "[A-Z]" "[a-z]" | tr ":" "\n" | head -n "$flag" | tail -n 1)
	# NOTE(review): picard and java are not referenced again in the visible script.
	picard=$(get_config_value PICARD)
	samtools=$(get_config_value SAMTOOLS)
	java=$(get_config_value JAVA)
	ref=$(get_config_value REF_GENOME)
	script_path=$(get_config_value SCRIPT_PATH)
	recal_flag=$(get_config_value RECALIBRATION | tr "[A-Z]" "[a-z]")
	# NOTE(review): the key is spelled REALIGNEMNT here; confirm against the
	# configuration file before "correcting" it.
	realign_flag=$(get_config_value REALIGNEMNT | tr "[A-Z]" "[a-z]")
	
	echo -e "\n******* Preprocessing $aligner BAM script started *******"
	# j numbers the per-input BAM files written as $align/$sample.<j>.bam, starting at 0.
	j=0
	# CHR_INDEX holds a colon-separated list of chromosome labels.
	chrindex=$(grep -w '^CHR_INDEX' "$config" | cut -d '=' -f2 | tr ":" " ")
	# Build the space-separated region list "chr<label> chr<label> ..." that is
	# later handed to samtools view.
	chr_str=""
	# $chrindex is intentionally unquoted: it must be split into single labels.
	for chr in $chrindex
	do
		chr_str+="chr$chr "
	done
	# For every input BAM: stage a coordinate-sorted copy in $align, restrict it
	# to the regions in $chr_str with an aligner-specific read filter, and write
	# the result to $align/$sample.<j>.bam.
	# ${bam//,/ } is intentionally unquoted so the comma-separated list is split.
	for i in ${bam//,/ }
	do
		case "$aligner" in
			tophat|bwa|mapsplice)
				echo "$aligner aligned bam"
				echo "Preprocessing the BAM file"
				;;
			*)
				echo "Aligner not supported"
				exit 1
				;;
		esac

		# checkBAMsorted.pl output of 1 is treated as "already sorted": the input is
		# only symlinked and indexed; anything else goes through SortBAM.sh.
		SORT_FLAG=$("$script_path/checkBAMsorted.pl" -i "$input/$i" -s "$samtools")
		if [ "$SORT_FLAG" == 1 ]
		then
			ln -s "$input/$i" "$align/$i"
			"$samtools/samtools" index "$align/$i"
		else
			"$script_path/SortBAM.sh" "$input/$i" "$align/$i" "$sample" "$align" "$config" "$logs" coordinate
		fi

		# $chr_str stays unquoted on purpose: each region must be its own argument.
		case "$aligner" in
			tophat)
				# 1st awk: keep records having a field that ends in NH:i:1
				# 2nd awk: drop records containing XF:Z
				# 3rd awk: rewrite a MAPQ (column 5) matching 255 to 50
				"$samtools/samtools" view "$align/$i" $chr_str \
					| awk -F '\t' '{ if($0 ~ /^@/) {print} else { for(k=1;k<=NF;k++){ if ($k ~ /NH:i:1$/){print}} } }' \
					| awk -F '\t' '{ if($0 ~ /^@/) {print} else if ($0 !~ /XF:Z/) {print} }' \
					| awk -F '\t' 'BEGIN{OFS="\t";}{if ($5~/255/){$5=50; print $0}else{print} }' \
					| "$samtools/samtools" view -bt "$ref.fai" - > "$align/$sample.$j.bam"
				;;
			bwa)
				# -q 1: skip alignments with mapping quality below 1
				"$samtools/samtools" view -q 1 "$align/$i" $chr_str \
					| "$samtools/samtools" view -bt "$ref.fai" - > "$align/$sample.$j.bam"
				;;
			mapsplice)
				# -F 256: skip records with flag bit 256 (secondary alignment) set
				"$samtools/samtools" view -F 256 "$align/$i" $chr_str \
					| "$samtools/samtools" view -bt "$ref.fai" - > "$align/$sample.$j.bam"
				;;
		esac

		# The staged copy/symlink and its index are no longer needed.
		rm "$align/$i" "$align/$i.bai"
		j=$((j+1))
	done
	# Turn the file count into the index of the last per-input BAM
	# ($sample.0.bam .. $sample.$j.bam).
	j=$((j-1))

	### merge the bam files
	if [ "$j" -gt 0 ]
	then
		## more than one per-input BAM: merge them with MergeBAMs.sh
		inbam=""
		# Per-input BAMs are numbered from 0, so the list must start at 0;
		# starting at 1 would leave $sample.0.bam out of the merged file.
		for ((i = 0; i <= j; i++))
		do
			inbam+="INPUT=$align/$sample.$i.bam "
		done
		# The whole INPUT=... list is passed as one single argument.
		"$script_path/MergeBAMs.sh" "$inbam" "$align/$sample.bam" "$sample" "$align" "$config" "$logs" yes
	else
		## single per-input BAM: rename it if already sorted, otherwise sort it
		SORT_FLAG=$("$script_path/checkBAMsorted.pl" -i "$align/$sample.$j.bam" -s "$samtools")
		if [ "$SORT_FLAG" == 1 ]
		then
			echo "BAM is already sorted"
			mv "$align/$sample.$j.bam" "$align/$sample.bam"
			"$samtools/samtools" index "$align/$sample.bam"
		else
			echo "Sorting the BAM file"
			"$script_path/SortBAM.sh" "$align/$sample.$j.bam" "$align/$sample.bam" "$sample" "$align" "$config" "$logs" coordinate
			rm "$align/$sample.$j.bam"
		fi
	fi
	
	### reorder the BAM file to make sure the header corresponds to the reference genome
	echo "Reording the BAM file with respect to the reference genome provided"
	"$script_path/ReorderBAM.sh" "$align/$sample.bam" "$sample" "$align" "$config" "$logs"

	#### add the read group if it is not available in the BAM file
	echo "Adding ReadGroup information to  the BAM file if required"
	# Extract the ID value(s) of the @RG header line(s); text after a second ':' is cut off.
	RG_ID=$("$samtools/samtools" view -H "$align/$sample.bam" | grep "^@RG" | tr '\t' '\n' | grep "^ID" | cut -f 2 -d ":")
	# Only an exact match with the sample name counts as "already present".
	if [ "$RG_ID" == "$sample" ]
	then
		echo "Read Group Information is already available"
	else
		"$script_path/ReadGroupBAM.sh" "$align/$sample.bam" "$sample" "$align" "$config" "$logs"
	fi
	
	#### mark duplicate
	# Count @CO header lines that mention MarkDuplicates; zero means RmdupBAM.sh
	# still has to be run on this BAM.
	DUP_STATUS=$("$samtools/samtools" view -H "$align/$sample.bam" | grep "^@CO" | grep -c "MarkDuplicates")
	if [ "$DUP_STATUS" -eq 0 ]
	then
		echo "Removing the duplicates from the BAM file"
		"$script_path/RmdupBAM.sh" "$align/$sample.bam" "$sample" "$align" "$config" "$logs"
	else
		echo "Duplicate Reads are already removed from the BAM file"
	fi

	
	### get the bam statistics
	echo "generating the BAM statistics for the $sample"
	"$samtools/samtools" flagstat "$align/$sample.bam" > "$align/$sample.flagstat"

	# For tophat alignments with realignment or recalibration switched on, split the
	# BAM by column 6 (CIGAR): records containing N go to $sample.junction.bam, all
	# others replace $sample.bam. Header lines are copied to both files.
	if [[ ( $realign_flag == "yes" || $recal_flag == "yes" ) && $aligner == "tophat" ]]
	then
		"$samtools/samtools" view -h "$align/$sample.bam" \
			| awk '{if ($0 ~/^@/){print} else if($6 ~ /N/) {print}}' \
			| "$samtools/samtools" view -bS - > "$align/$sample.junction.bam"
		"$samtools/samtools" view -h "$align/$sample.bam" \
			| awk '{if ($0 ~/^@/){print} else if($6 !~ /N/) {print}}' \
			| "$samtools/samtools" view -bS - > "$align/$sample.genome.bam"
		mv "$align/$sample.genome.bam" "$align/$sample.bam"
		"$samtools/samtools" index "$align/$sample.bam"
		"$samtools/samtools" index "$align/$sample.junction.bam"
	fi
	
	# Report the elapsed wall-clock time (whole seconds since START was taken)
	# and print the closing banner.
	END=$(date +%s)
	DIFF=$(( END - START ))
	echo "Processing of BAM file for $sample took $DIFF seconds"
	echo -e "\n******* Preprocessing $aligner BAM script completed *******\n"
fi	
