#!/bin/bash
### tang.xiaojia@mayo.edu
### Xiaojia Tang
### last updated: NOV 27 2012

############################################################
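#               $1              =       name of the sample
#               $2              =       /path/to/outputfolder
#               $3              =       /path/to/configuration file
#
#               NOTE: an earlier version of this header listed five arguments
#               (/path/to/input directory, name of the samples, bamfiles from
#               the same aligner "," separated and ":" for different aligners,
#               /path/to/outputfolder, /path/to/configuration file); the script
#               accepts exactly the three arguments listed above.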
#############################################################

#######################################
# Print the value of KEY from a KEY=VALUE configuration file.
# Arguments: $1 - key name (without the trailing "=")
#            $2 - path to the configuration file
# Outputs:   every matching value on stdout, one per line; nothing when the
#            key is absent
#######################################
config_value() {
	local key=$1 file=$2
	# Strip only the leading "KEY=" so a value that happens to contain the
	# text "KEY=" is returned intact.
	grep "^${key}=" "$file" | sed -e "s/^${key}=//"
}

# Main flow: merge the VCFs of the two configured aligners for one sample,
# annotate the merged calls through annovar.sh, run the strand-bias filter
# and optionally append the proteomics columns.
#   $1 = sample name, $2 = output folder, $3 = configuration file
if [ $# != 3 ]
then
	echo -e "script to merge the multiple vcf files and annotate uuisng annovar\nUsage: ./eSNV_merge.sh <sample name></path/to/outputfolder></path/to/configuration file>"
	exit 1;
else
	set -x
	START=$(date +%s)
	sample=$1
	output=$2
	config=$3
	echo -e "\n******* Merging, Filtering and annotating variants for $sample started *******\n"

	# Without a readable configuration every tool path below would be empty.
	if [ ! -r "$config" ]
	then
		echo " cannot find $config"
		exit 1
	fi

	perl=$(config_value PERL "$config")
	script_path=$(config_value SCRIPT_PATH "$config")
	ref=$(config_value REF_GENOME "$config")
	samtools=$(config_value SAMTOOLS "$config")
	refseq_uniport=$(config_value REFSEQ_UNIPORT "$config")
	human_domain_corr=$(config_value HUMAN_DOMAIN_CORR "$config")
	RRPS_high=$(config_value HIGH_ReadRankPosSum "$config")
	RRPS_low=$(config_value LOW_ReadRankPosSum "$config")
	r_cutoff=$(config_value R_CUTOFF "$config")
	r_cutoff2=$(config_value R_CUTOFF2 "$config")
	keep_all_snv=$(config_value KEEP_ALL_SNV "$config")

	## Merge VCF files from two aligners
	# ALIGNER is a ":" separated list; the first entry is the preferred aligner.
	# When only one aligner is listed, aligner2 ends up equal to aligner1.
	aligner1=$(config_value ALIGNER "$config" | tr '[:upper:]' '[:lower:]' | tr ":" "\n" | head -n 1)
	aligner2=$(config_value ALIGNER "$config" | tr '[:upper:]' '[:lower:]' | tr ":" "\n" | head -n 2 | tail -n 1)

	vcf1=$output/$aligner1/variant/$sample.vcf
	vcf2=$output/$aligner2/variant/$sample.vcf
	for vcf in "$vcf1" "$vcf2"
	do
		if [ ! -s "$vcf" ]
		then
			echo " cannot find $vcf"
			exit 1
		fi
	done

	# The strand-bias log and the final clean-up both need this directory.
	if ! mkdir -p "$output/logs"
	then
		echo " cannot create $output/logs"
		exit 1
	fi

	echo " Merge two samples!"
	echo "$perl/perl $script_path/merge_vcf.pl $sample $vcf1 $vcf2 $keep_all_snv $output $RRPS_high $RRPS_low"
	# ${var:+"$var"} keeps the historical behaviour of dropping an argument whose
	# configuration value is empty, while protecting non-empty values from word
	# splitting and globbing.
	"$perl/perl" "$script_path/merge_vcf.pl" "$sample" "$vcf1" "$vcf2" \
		${keep_all_snv:+"$keep_all_snv"} "$output" \
		${RRPS_high:+"$RRPS_high"} ${RRPS_low:+"$RRPS_low"}
	"$script_path/annovar.sh" "$sample" "$output/$sample.ANNOVAR.txt" "$config" "$output"

	genome_summary=$output/$sample.anno.genome_summary.txt
	if [ ! -f "$genome_summary" ]
	then
		echo " cannot find $genome_summary"
		exit 1
	fi

#   original code
#	cat $output/$sample.anno.exome_summary.txt | awk '{if (NR==1) print "ID\t"$0; else { if(/nonsynonymous/ || /stop/) print $(NF-5)"_"$(NF-4)"_"$(NF-2)"_"$(NF-1)"\t"$0} }' | sed -e 's/^chr//' -e 's/^X/23/' -e 's/^Y/24/' | (read -r; printf "%s\n" "$REPLY" ; sort -t"_" -k1,1n -k2,2n) | sed -e 's/^23/X/' -e 's/^24/Y/' > $output/$sample.anno.exome_summary.txt.temp
#	cut -f1 $output/$sample.anno.exome_summary.txt.temp | tail -n+2 > $output/$sample.nonsyn.list
#
	# Prepend an ID column built from four trailing columns of the summary
	# (NF-5, NF-4, NF-2, NF-1 joined by "_"), then sort the data rows numerically
	# on the first two "_" separated ID fields. X/Y are mapped to 23/24 for the
	# sort and restored afterwards; the subshell passes the header row through
	# untouched before sort sees the rest.
	summary_ids=$genome_summary.temp
	awk -F'\t' '{if (NR==1) print "ID\t"$0; else {  print $(NF-5)"_"$(NF-4)"_"$(NF-2)"_"$(NF-1)"\t"$0} }' <"$genome_summary" \
		| sed -e 's/^chr//' -e 's/^X/23/' -e 's/^Y/24/' \
		| (read -r; printf "%s\n" "$REPLY" ; sort -t"_" -k1,1n -k2,2n) \
		| sed -e 's/^23/X/' -e 's/^24/Y/' >"$summary_ids"

	genome_list=$output/$sample.genome.list
	cut -f1 "$summary_ids" | tail -n +2 >"$genome_list"

	# strand bias + filter
	echo " Start strand bias analysis...."
	bam=$output/$aligner1/variant/$sample.bam
	# Position list for "samtools mpileup -l": first two ID fields, "chr" restored.
	sed -e 's/_/\t/g' -e 's/^/chr/' <"$genome_list" | cut -f1-2 >"${genome_list}.2"
	"$samtools/samtools" mpileup -AB -l "${genome_list}.2" -f "$ref" "$bam" \
		| tee "${genome_list}.original.pileup" \
		| "$perl/perl" "$script_path/strandBias_extract.pl" "$genome_list" \
			${r_cutoff:+"$r_cutoff"} ${r_cutoff2:+"$r_cutoff2"} - \
		>"${genome_list}.strBia.summary.txt" 2>"$output/logs/$sample.genome.list.strBia.log"
	echo " Strand bias analysis done!"

	# Drop the first column of the strand-bias summary and glue the remaining
	# columns onto the ID-prefixed annotation table.
	cut -f2- "${genome_list}.strBia.summary.txt" >"${genome_list}.extract.2.temp"
	paste "$summary_ids" "${genome_list}.extract.2.temp" >"$output/$sample.anno.genome.list.strBia.txt"

	# Keep the header plus rows whose last column is "N"; from those, the
	# nonsynonymous/stop rows form the second report.
	nonsyn=$output/$sample.anno.nonsyn.SNVs.txt
	awk -F'\t' '{if (NR==1) print $0; else { if( $NF=="N") print;} }' <"$output/$sample.anno.genome.list.strBia.txt" \
		| tee "$output/$sample.anno.genome.SNVs.txt" \
		| awk -F'\t' '{if (NR==1) print $0; else {if( /nonsynonymous/ || /stop/) print $0 } } ' >"$nonsyn"
#	awk '{if(NR==1 || $NF=="N") print $0}' $output/$sample.anno.nonsy.strBia.temp > $output/$sample.anno.nonsyn.SNVs.txt

	# NOTE(review): the last two globs are not restricted to $sample, so they
	# also move matching files of other samples that share this output folder
	# -- confirm that samples are never processed concurrently.
	mv "$output/$sample".*.temp "$output/logs"
	mv "$output"/*summary.txt "$output/logs"
	mv "$output"/*genome.list* "$output/logs"

	# post-protein omics data #

	proteomics=$(grep '^PROTEOMICS=' "$config" | cut -d"=" -f2 | tr '[:lower:]' '[:upper:]')
	echo "running proteomics analysis..."
	echo "$proteomics"
	if [[ $proteomics == "YES"  ]]
	then
		prot_tmp=$output/$sample.proteomics.temp
		"$perl/perl" "$script_path/AAC2D_v2.pl" "$output" "$nonsyn" \
			${refseq_uniport:+"$refseq_uniport"} ${human_domain_corr:+"$human_domain_corr"} >"$prot_tmp"
		# Only append the proteomics columns when there is exactly one output
		# line per report line; otherwise paste would misalign the rows.
		snv_lines=$(wc -l <"$nonsyn")
		prot_lines=$(wc -l <"$prot_tmp")
		if [ "$snv_lines" -eq "$prot_lines" ]
		then
			# Sample-specific scratch names so parallel runs sharing $output
			# cannot overwrite each other's intermediate files.
			prot_cols=$output/$sample.proteomics.columns.temp
			merged=$output/$sample.anno.nonsyn.SNVs.merged.temp
			awk '{if(NR==1){print "AAChange\tProteinDomain"}else{print $0}}' <"$prot_tmp" >"$prot_cols"
			paste "$nonsyn" "$prot_cols" >"$merged"
			mv "$merged" "$nonsyn"
			rm "$prot_cols" "$prot_tmp"
		else
			echo " proteomics columns not added for $sample: line counts differ ($snv_lines vs $prot_lines)"
		fi
	fi

	temp_files=$(config_value TEMPORARY_FILES_REMOVE "$config" | tr '[:upper:]' '[:lower:]')
	if [ "$temp_files" == "yes" ]
	then
		# ${output:?} aborts instead of expanding the globs from "/" should the
		# output folder ever be empty.
		# NOTE(review): these globs remove the BAM files of every sample under
		# $output, not only those of $sample -- confirm this is intended.
		rm -f "${output:?}"/*/alignment/*.bam*
		rm -f "${output:?}"/*/realignment/*.bam*
		rm -f "${output:?}"/*/recalibration/*.bam*
	fi
	echo " You have got your SNV reports!"
	END=$(date +%s)
	DIFF=$(( END - START ))
	echo " Filtering of variants for $sample took $DIFF seconds"
	echo -e "\n******* Merging, Filtering and annotating variants for $sample completed *******\n"
fi
