#!/bin/bash

set -x

#######################################
# Print the value(s) stored under KEY in a KEY=VALUE settings file.
# Everything after the first '=' is printed, so a value that itself
# contains '=' (for example MACS2_ARGS=... --keep-dup=all) is not truncated.
# Arguments: $1 - key name, matched as a whole word at the start of a line
#            $2 - path to the settings file
# Outputs:   one line on stdout per matching entry
# Returns:   1 when the file is not readable (nothing is printed),
#            otherwise the status of the grep | cut pipeline
#######################################
get_setting() {
	[[ -r "$2" ]] || return 1
	grep -w -- "^$1" "$2" | cut -d '=' -f2-
}

#######################################
# Print the second '='-separated field of every line whose key starts with
# the given prefix. Callers word-split the output into an array.
# Arguments: $1 - key prefix (literal text)
#            $2 - path to the settings file
# Outputs:   one line on stdout per matching entry
# Returns:   1 when the file is not readable, otherwise awk's status
#######################################
get_setting_words() {
	[[ -r "$2" ]] || return 1
	awk -v prefix="$1" 'BEGIN {FS="="} index($1, prefix) == 1 {print $2}' "$2"
}

# ---- settings from the run_info file (first command-line argument) ----
if [[ ! -r "$1" ]]
then
	echo "WARNING: run_info file '$1' is missing or unreadable; its settings will be empty" >&2
fi

PROJECT_NAME=$(get_setting PROJECT_NAME "$1")
SEQ_DIR=$(get_setting SEQ_DIR "$1")
SEQ_TYPE=$(get_setting SEQ_TYPE "$1")
SEQ_SUFFIX=$(get_setting SEQ_SUFFIX "$1")
WORK_DIR=$(get_setting WORK_DIR "$1")
FILTER_TYPE=$(get_setting FILTER_TYPE "$1")
PEAK_CALLER=$(get_setting PEAK_CALLER "$1")
tool_info=$(get_setting TOOL_INFO "$1")

# The command substitutions below are deliberately unquoted: every
# whitespace-separated file name becomes its own array element.
# shellcheck disable=SC2207
END1_SEQ=( $(get_setting_words END1_SEQ "$1") )
# shellcheck disable=SC2207
END2_SEQ=( $(get_setting_words END2_SEQ "$1") )
# shellcheck disable=SC2207
IP_FILE=( $(get_setting_words IP_FILE "$1") )
# shellcheck disable=SC2207
INPUT_FILE=( $(get_setting_words INPUT_FILE "$1") )

# ---- settings from the tool_info file named by TOOL_INFO in run_info ----
if [[ ! -r "$tool_info" ]]
then
	echo "WARNING: tool_info file '$tool_info' is missing or unreadable; its settings will be empty" >&2
fi

FASTQC=$(get_setting FASTQC "$tool_info")
BWA_REF=$(get_setting BWA_REF "$tool_info")
BWA_PATH=$(get_setting BWA_PATH "$tool_info")
MACS_PATH=$(get_setting MACS_PATH "$tool_info")
SAMTOOLS=$(get_setting SAMTOOLS "$tool_info")
BEDTOOLS=$(get_setting BEDTOOLS "$tool_info")
PICARD=$(get_setting PICARD "$tool_info")
GENOME_TABLE=$(get_setting GENOME_TABLE "$tool_info")
TCLR_LIST=$(get_setting TCLR_LIST "$tool_info")
MACS2_ARGS=$(get_setting MACS2_ARGS "$tool_info")
FRAGMENT_SIZE=$(get_setting FRAGMENT_SIZE "$tool_info")
REMOVE_DUP=$(get_setting REMOVE_DUP "$tool_info")

# Constants used in this file: output directories below WORK_DIR.
MAP_OUTDIR="${WORK_DIR}/mapout"
MACS2_OUTDIR="${WORK_DIR}/macs2out"
IDR_OUTDIR="${WORK_DIR}/idrout"

# The macs2 and IDR output directories are only created for the macs2idr
# peak caller. Paths are quoted so a WORK_DIR containing whitespace or glob
# characters is passed to mkdir as a single, literal argument.
if [[ "$PEAK_CALLER" == "macs2idr" ]]
then
	mkdir -p -- "$MACS2_OUTDIR" "$IDR_OUTDIR"
fi

echo "$PEAK_CALLER is used to call peaks"

##################### helper used by option 3 below
#######################################
# Print the mapped BAM path for every read file given, sorted and
# de-duplicated, one path per line.
# A read file NAME<SEQ_SUFFIX> maps to
#   <MAP_OUTDIR>/NAME<SEQ_TYPE>.<FILTER_TYPE>.<REMOVE_DUP>.s1.bam
# Only a trailing SEQ_SUFFIX is replaced and it is matched literally, so a
# directory that happens to contain the suffix text (e.g. /data/fastq/) or a
# suffix with regex characters (e.g. "fastq.gz") cannot corrupt the path.
# Globals:   MAP_OUTDIR, SEQ_SUFFIX, SEQ_TYPE, FILTER_TYPE, REMOVE_DUP (read)
# Arguments: read file names as listed in run_info (relative to SEQ_DIR)
# Outputs:   unique BAM paths on stdout, one per line; nothing for no arguments
#######################################
mapped_bam_paths() {
	local reads
	for reads in "$@"
	do
		printf '%s\n' "${MAP_OUTDIR}/${reads%"${SEQ_SUFFIX}"}${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}.s1.bam"
	done | sort -u
}

##################### start option 2 for IDR
##################### elif only a single IP file, then do not use IDR analysis (option 2)
# TODO MTK: Does this block still make sense since we've eliminated IDR_ANALYSIS?
if [[ "$PEAK_CALLER" == "macs2idr" && ${#IP_FILE[@]} -eq 1 ]]
then
	echo "IDR analysis requires replicates, but this IP has no replicate, stop IDR analysis"
	echo "Please use SICER or MACS2 (without IDR analysis) to call peaks"
	echo " "

##################### start option 3 for IDR
##################### elif >=2 IP files (IP with replicates) in bam format  (option 3)
elif [[ "$PEAK_CALLER" == "macs2idr" && ${#IP_FILE[@]} -ge 2 ]]
then

####### start to merge input bam files
	echo " "
	echo "start to merge input bam files from replicates for IDR analysis, $(date)"

	# Unique mapped BAMs of the input (control) samples.
	input_bams=()
	while IFS= read -r bam_path
	do
		input_bams+=( "$bam_path" )
	done < <(mapped_bam_paths "${INPUT_FILE[@]}")
	INPUT_NUMBER=${#input_bams[@]}
	INPUT_LIST="${input_bams[*]}"

	echo "There are $INPUT_NUMBER input files, which are $INPUT_LIST"

	# Pooled control used for all three macs2 runs below.
	control_bam="${MACS2_OUTDIR}/${PROJECT_NAME}.Input.r0pr0.bam"
	if [[ $INPUT_NUMBER -eq 1 ]]
	then
		# A single control needs no merging; copy it under the pooled name.
		cp -- "${input_bams[0]}" "$control_bam"
	elif [[ $INPUT_NUMBER -ge 2 ]]
	then
		"${SAMTOOLS}/samtools" merge "$control_bam" "${input_bams[@]}"
	else
		echo "WARNING: no INPUT_FILE entries were found, so $control_bam was not created" >&2
	fi

########## start to merge bam files from IP replicates and split merged bam file into two pseudo replicates of equal size
	echo " "
	echo "start to merge IP bam files from replicates for IDR analysis, $(date)"

	ip_bams=()
	while IFS= read -r bam_path
	do
		ip_bams+=( "$bam_path" )
	done < <(mapped_bam_paths "${IP_FILE[@]}")
	IP_LIST="${ip_bams[*]}"

	echo "these are the IP files: $IP_LIST"

	pooled_ip_bam="${MACS2_OUTDIR}/${PROJECT_NAME}.IP.r0pr0.bam"
	"${SAMTOOLS}/samtools" merge "$pooled_ip_bam" "${ip_bams[@]}"

	echo " "
	echo "start to split merged bam file into two pseudo replicates for IDR analysis"

	# Temporary SAM pieces: 1 = header lines, 2 = odd shuffled records,
	# 3 = even shuffled records.
	header_sam="${pooled_ip_bam}.temp1.sam"
	pseudo1_sam="${pooled_ip_bam}.temp2.sam"
	pseudo2_sam="${pooled_ip_bam}.temp3.sam"

	# Start from empty files so a stale piece from an earlier run is never
	# reused and the cat commands below still work when there are no records.
	: > "$header_sam"
	: > "$pseudo1_sam"
	: > "$pseudo2_sam"

	# Header lines (first field starts with '@') are set aside; the remaining
	# records are shuffled (perl reads the whole stream into memory) and dealt
	# out alternately to the two pseudo replicates.
	# NOTE(review): records are shuffled line by line, so for paired-end data
	# the two mates of a pair can end up in different pseudo replicates -
	# confirm this is acceptable.
	"${SAMTOOLS}/samtools" view -h "$pooled_ip_bam" | \
	awk -v header_sam="$header_sam" 'BEGIN {FS="\t"; OFS="\t"} {if ($1 ~ /^@/) print $0 > header_sam; else print $0}' | \
	perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' | \
	awk -v odd_sam="$pseudo1_sam" -v even_sam="$pseudo2_sam" 'NR%2 { print > odd_sam; next } { print > even_sam }'

	# Re-attach the header to each half and convert back to BAM.
	cat "$header_sam" "$pseudo1_sam" | "${SAMTOOLS}/samtools" view -Sbh - > "${MACS2_OUTDIR}/${PROJECT_NAME}.IP.r0pr1.bam"
	cat "$header_sam" "$pseudo2_sam" | "${SAMTOOLS}/samtools" view -Sbh - > "${MACS2_OUTDIR}/${PROJECT_NAME}.IP.r0pr2.bam"

	rm -f -- "$header_sam" "$pseudo1_sam" "$pseudo2_sam"

########## start to call peaks from merged IP bam file and its two pseudo replicates (MACS2)
	echo " "
	echo "start to call peaks from merged IP and its two pseudo replicates, $(date)"

	# The size handed to macs2 does not depend on the replicate, so it is
	# determined once here instead of inside the loop.
	# NOTE(review): the size is passed as macs2 --bw; confirm that --bw
	# (rather than e.g. --extsize) is the intended option.
	run_macs2=1
	band_width=""
	size_note=""
	case "$SEQ_TYPE" in
		PE)
			# Take column 1 of the row with the largest numeric column 2 in the
			# size table written next to the first IP file's mapped BAM.
			SEQ1NAME=$( basename "${SEQ_DIR}/${IP_FILE[0]}" ".${SEQ_SUFFIX}" )
			INSERT_SIZE=$( sort -k2,2n "${MAP_OUTDIR}/${SEQ1NAME}.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}.s1.bam.size.txt" | tail -n 1 | cut -f 1 | awk '{print $1}' )
			band_width=$INSERT_SIZE
			size_note="These are paired-end data, use fragment size of $INSERT_SIZE bp as estimated from the BWA mapping results"
			;;
		SE)
			# Trim surrounding whitespace from the configured value.
			read -r band_width <<< "$FRAGMENT_SIZE"
			size_note="These are single-end data, fragment size is $FRAGMENT_SIZE bp as specified in the tool_info file"
			;;
		*)
			run_macs2=0
			echo "WARNING: SEQ_TYPE '$SEQ_TYPE' is neither PE nor SE, macs2 will not be run" >&2
			;;
	esac

	# i = 0 is the pooled IP, i = 1 and 2 are the two pseudo replicates.
	for ((i=0;i<=2;i=i+1))
	do
		run_name="${MACS2_OUTDIR}/${PROJECT_NAME}.${SEQ_TYPE}_r0pr${i}_macs2"

		if [[ $run_macs2 -eq 1 ]]
		then
			echo "$size_note"
			echo " "

			# MACS2_ARGS holds several command-line words and is split on purpose.
			# shellcheck disable=SC2086
			"${MACS_PATH}/macs2" callpeak -t "${MACS2_OUTDIR}/${PROJECT_NAME}.IP.r0pr${i}.bam" -c "$control_bam" -n "$run_name" $MACS2_ARGS "--bw=${band_width}"
		fi

		# For newer versions of macs2, the output file has the extension .narrowPeak instead of .encodePeak
		# Copy the new extension to the older naming convention so later functions work.
		for filename in "${MACS2_OUTDIR}"/*macs2_peaks.narrowPeak
		do
			# The -f test also skips the literal pattern left by an unmatched glob.
			if [[ -f "$filename" ]]
			then
				cp -- "$filename" "${filename%.narrowPeak}.encodePeak"
				# The .bed file keeps columns 1-4 and 9 of the narrowPeak file.
				cut -f 1-4,9 "$filename" > "${filename%.narrowPeak}.bed"
			fi
		done

		# Remove the "<output dir>/<run name>_" prefix from the text of the
		# peak and summit files. Only files that exist are edited, and the
		# prefix reaches perl through the environment and is quoted with
		# \Q...\E, so characters such as '.' or '#' in the path are literal.
		peak_files=()
		for filename in \
			"${MACS2_OUTDIR}"/*"${PROJECT_NAME}.${SEQ_TYPE}"_r0pr*_macs2_peaks.* \
			"${MACS2_OUTDIR}"/*"${PROJECT_NAME}.${SEQ_TYPE}"_r0pr*_macs2_summits.bed
		do
			if [[ -f "$filename" ]]
			then
				peak_files+=( "$filename" )
			fi
		done

		if [[ ${#peak_files[@]} -gt 0 ]]
		then
			STRIP_PREFIX="${run_name}_" perl -p -i -e 's/\Q$ENV{STRIP_PREFIX}\E//g' "${peak_files[@]}"
		fi

	done

	echo " "
	echo "Finish peak calling from merged IP and its two pseudo replicates, $(date)"
fi
##################### end of option 3 for IDR
