#!/bin/bash

set -x

########## split bam file from each IP replicate into 2 pseudo replicates of equal size
########## then call peaks from each IP replicate and its 2 pseudo replicates

# Parse run_info.txt file for variables

# Print the value of every "KEY=value" line in a run_info file.
# Arguments: $1 - key prefix (a line matches when the text before its first
#                 "=" starts with this string)
#            $2 - path to the run_info file
# Outputs:   one line per matching entry holding everything after the first
#            "=", so values that themselves contain "=" are kept whole; a
#            matching line without "=" yields an empty line
# Returns:   1 (with a message on stderr) when the file is not readable,
#            instead of letting awk fall back to reading stdin
run_info_value() {
	local key=$1 file=$2
	if [[ ! -r "$file" ]]; then
		echo "run_info file '${file}' is missing or not readable" >&2
		return 1
	fi
	awk -v key="$key" '
		BEGIN { FS = "=" }
		index($1, key) == 1 {
			eq = index($0, "=")
			print (eq ? substr($0, eq + 1) : "")
		}
	' "$file"
}

# Scalar settings
PROJECT_NAME=$(run_info_value PROJECT_NAME "$1")
SEQ_DIR=$(run_info_value SEQ_DIR "$1")
SEQ_TYPE=$(run_info_value SEQ_TYPE "$1")
SEQ_SUFFIX=$(run_info_value SEQ_SUFFIX "$1")
WORK_DIR=$(run_info_value WORK_DIR "$1")
FILTER_TYPE=$(run_info_value FILTER_TYPE "$1")
PEAK_CALLER=$(run_info_value PEAK_CALLER "$1")

# List settings: the value is split on whitespace into one array element per
# word, so the command substitutions are deliberately left unquoted.
# shellcheck disable=SC2207
END1_SEQ=( $(run_info_value END1_SEQ "$1") )
# shellcheck disable=SC2207
END2_SEQ=( $(run_info_value END2_SEQ "$1") )
# shellcheck disable=SC2207
IP_FILE=( $(run_info_value IP_FILE "$1") )
# shellcheck disable=SC2207
INPUT_FILE=( $(run_info_value INPUT_FILE "$1") )

# Parse tool_info.txt file for variables

# Print the value of a "KEY=value" entry in a tool_info file.
# Arguments: $1 - key; matched as a whole word at the start of a line
#            $2 - path to the tool_info file
# Outputs:   everything after the first "=" of each matching line (-f2-), so
#            values that contain "=" themselves, such as option strings like
#            "--keep-dup=all", are not cut short
# Returns:   1 (with a message on stderr) when the file is not readable
tool_info_value() {
	local key=$1 file=$2
	if [[ ! -r "$file" ]]; then
		echo "tool_info file '${file}' is missing or not readable" >&2
		return 1
	fi
	grep -w -- "^${key}" "$file" | cut -d '=' -f2-
}

# The tool_info path is itself an entry (TOOL_INFO) of the run_info file
# given as the first script argument.
if [[ -r "$1" ]]; then
	tool_info=$(awk 'BEGIN {FS="="} $1 ~ /^TOOL_INFO/ {eq = index($0, "="); print (eq ? substr($0, eq + 1) : "")}' "$1")
else
	echo "run_info file '$1' is missing or not readable" >&2
	tool_info=""
fi

FASTQC=$(tool_info_value FASTQC "$tool_info")
BWA_REF=$(tool_info_value BWA_REF "$tool_info")
BWA_PATH=$(tool_info_value BWA_PATH "$tool_info")
MACS_PATH=$(tool_info_value MACS_PATH "$tool_info")
SICER=$(tool_info_value SICER "$tool_info")
SAMTOOLS=$(tool_info_value SAMTOOLS "$tool_info")
BEDTOOLS=$(tool_info_value BEDTOOLS "$tool_info")
PICARD=$(tool_info_value PICARD "$tool_info")
GENOME_TABLE=$(tool_info_value GENOME_TABLE "$tool_info")
TCLR_LIST=$(tool_info_value TCLR_LIST "$tool_info")
MACS2_ARGS=$(tool_info_value MACS2_ARGS "$tool_info")
FRAGMENT_SIZE=$(tool_info_value FRAGMENT_SIZE "$tool_info")
REMOVE_DUP=$(tool_info_value REMOVE_DUP "$tool_info")

# Output locations, both directly below the working directory
MAP_OUTDIR="${WORK_DIR}/mapout"
MACS2_OUTDIR="${WORK_DIR}/macs2out"

# Task id of this array job minus one, for use as a zero-based array index
ARRAY_JOB_INDEX=$(( ${SGE_TASK_ID} - 1 ))

# Print the first column of the row whose second column is numerically the
# largest in the table $1.
# NOTE(review): presumably the table is "<insert size><TAB><count>" written by
# the mapping step -- confirm the layout against the script that produces it.
most_frequent_insert_size() {
	sort -k2,2n "$1" | tail -n 1 | cut -f 1 | awk '{print $1}'
}

# For the IP replicate selected by ARRAY_JOB_INDEX: copy its BAM as pr0, split
# its alignments at random into two pseudo replicates (pr1, pr2), then call
# MACS2 peaks on all three against the pooled input BAM. Runs only for the
# "macs2idr" peak caller with at least two IP replicates.
if [[ $PEAK_CALLER = "macs2idr" && ${#IP_FILE[@]} -ge 2 ]]
then

	# An out-of-range index would otherwise go unnoticed: -1 (task id unset)
	# selects the LAST IP file on bash >= 4.3, and a too-large index yields an
	# empty file name.
	if (( ARRAY_JOB_INDEX < 0 || ARRAY_JOB_INDEX >= ${#IP_FILE[@]} ))
	then
		echo "error: array task index ${ARRAY_JOB_INDEX} does not select one of the ${#IP_FILE[@]} IP files (SGE_TASK_ID=${SGE_TASK_ID})" >&2
		exit 1
	fi

	echo "start to split each IP bam file into 2 pseudo replicates for IDR analysis, $(date)"

	FileNum=$((ARRAY_JOB_INDEX + 1))
	SEQ1NAME=$(basename "${SEQ_DIR}/${IP_FILE[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}")

	# Shared name stems: the filtered IP BAM (without ".bam") and the
	# per-task temporary SAM pieces.
	ip_bam="${SEQ1NAME}.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}.s1"
	tmp_sam="${MACS2_OUTDIR}/${PROJECT_NAME}.IP.${FileNum}"

	# pr0 is the complete, unsplit replicate.
	cp "${MAP_OUTDIR}/${ip_bam}.bam" "${MACS2_OUTDIR}/${ip_bam}.r1pr0.bam"

	# temp1 = SAM header lines (they start with "@"); the remaining alignment
	# lines are shuffled and dealt alternately into temp2 (odd) and temp3 (even).
	# NOTE(review): lines are shuffled individually, so for paired-end data the
	# two mates of a pair can end up in different pseudo replicates -- confirm
	# this is acceptable for the MACS2 options in use.
	"${SAMTOOLS}/samtools" view -h "${MAP_OUTDIR}/${ip_bam}.bam" \
		| awk -v header="${tmp_sam}.temp1.sam" 'BEGIN {FS="\t"; OFS="\t"} {if ($1 ~ /^@/) print $0 > header; else print $0}' \
		| perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' \
		| awk -v odd="${tmp_sam}.temp2.sam" -v even="${tmp_sam}.temp3.sam" 'NR%2 { print > odd; next }; { print > even }'

	# Re-attach the header to each half and convert back to BAM:
	# temp2 -> pr1, temp3 -> pr2.
	for pseudo in 1 2
	do
		cat "${tmp_sam}.temp1.sam" "${tmp_sam}.temp$((pseudo + 1)).sam" \
			| "${SAMTOOLS}/samtools" view -Sbh - > "${MACS2_OUTDIR}/${ip_bam}.r1pr${pseudo}.bam"
	done

	rm -f -- "${tmp_sam}".temp[1-3].sam

	#### start peak calling
	echo " "
	echo "start to call peaks from ${ip_bam}.bam and its pseudo replicates, $(date)"
	echo " "

	# The --bw value and its log line depend only on SEQ_TYPE, so work them out
	# once rather than on every pass of the loop. Any SEQ_TYPE other than PE/SE
	# leaves size_note empty and no peaks are called.
	size_note=""
	bw_value=""
	case ${SEQ_TYPE} in
		PE)
			INSERT_SIZE=$(most_frequent_insert_size "${MAP_OUTDIR}/${ip_bam}.bam.size.txt")
			bw_value=${INSERT_SIZE}
			size_note="These are paired-end data, use fragment size of $INSERT_SIZE bp as estimated from the BWA mapping results"
			;;
		SE)
			bw_value=${FRAGMENT_SIZE}
			# FRAGMENT_SIZE is read from the tool_info file, not run_info.
			size_note="These are single-end data, use fragment size of $FRAGMENT_SIZE bp as specified in the tool_info file"
			;;
	esac

	# TODO potential for parallelization
	for ((j = 0; j <= 2; j++))
	do
		peak_prefix="${MACS2_OUTDIR}/${SEQ1NAME}.${SEQ_TYPE}_r1pr${j}_macs2"

		if [[ -n ${size_note} ]]
		then
			echo "${size_note}"
			echo " "

			# MACS2_ARGS holds several options in one string and must be
			# word-split, hence unquoted.
			# shellcheck disable=SC2086
			"${MACS_PATH}/macs2" callpeak \
				-t "${MACS2_OUTDIR}/${ip_bam}.r1pr${j}.bam" \
				-c "${MACS2_OUTDIR}/${PROJECT_NAME}.Input.r0pr0.bam" \
				-n "${peak_prefix}" \
				${MACS2_ARGS} "--bw=${bw_value}"
		fi

		# For newer versions of macs2, the output file has the extension .narrowPeak instead of .encodePeak
		# Copy the new extension to the older naming convention so later functions work.
		narrow_peak="${peak_prefix}_peaks.narrowPeak"
		if [[ -f "${narrow_peak}" ]]
		then
			cp "${narrow_peak}" "${narrow_peak%.narrowPeak}.encodePeak"
			cut -f 1-4,9 "${narrow_peak}" > "${narrow_peak%.narrowPeak}.bed"
		fi

		# Remove the "<outdir>/<label>" prefix from the peak and summit files.
		# The prefix travels through the environment and is matched literally
		# (\Q...\E), so "." and other special characters in paths or sample
		# names are neither regex operators nor interpolated by perl.
		Peak_Label="${SEQ1NAME}.${SEQ_TYPE}_r1pr${j}_macs2_"
		STRIP_PREFIX="${MACS2_OUTDIR}/${Peak_Label}" perl -p -i -e 's/\Q$ENV{STRIP_PREFIX}\E//g' \
			"${MACS2_OUTDIR}"/*"${SEQ1NAME}.${SEQ_TYPE}_r1pr${j}_macs2_peaks."* \
			"${MACS2_OUTDIR}"/*"${SEQ1NAME}.${SEQ_TYPE}_r1pr${j}_macs2_summits.bed"

		# Only the r1pr0 bed file gets a track header line. Insert it once, when
		# that file has just been processed; later passes would find the header
		# already on line 1 and change nothing.
		if (( j == 0 ))
		then
			Peak_Label2="${SEQ1NAME}.${SEQ_TYPE}_r1pr0_macs2_"
			TRACK_NAME="track name=\"${Peak_Label2}peaks.bed\" description=\"${Peak_Label2}peaks.bed\""
			TRACK_LINE="${TRACK_NAME}" perl -p -i -e 's/^chr/$ENV{TRACK_LINE}\nchr/ if $. == 1' \
				"${MACS2_OUTDIR}/${SEQ1NAME}.${SEQ_TYPE}_r1pr0_macs2_peaks.bed"
		fi

	done

fi

