#!/bin/bash
set -x

# Require exactly one argument: the run configuration file.
# Usage errors are written to stderr and reported with a non-zero exit status
# so that a calling wrapper or scheduler does not mistake them for success.
if [[ $# -ne 1 ]]
then
	echo "usage: pkcall.sh <config file>" >&2
	exit 1
fi

# _cfg_value KEY FILE
#   Print everything after the first '=' on the line(s) of FILE that start
#   with KEY as a whole word. Using "-f2-" (not "-f2") keeps values that
#   themselves contain '=' intact, e.g. MACS2_ARGS=--keep-dup=all.
#   Returns non-zero without reading stdin when FILE is not readable.
_cfg_value() {
	[[ -r "$2" ]] || return 1
	grep -w -- "^$1" "$2" | cut -d '=' -f2-
}

# _cfg_list KEY FILE
#   Print the second '='-separated field of every line of FILE whose first
#   field starts with KEY. Callers word-split the output into an array.
#   Returns non-zero without reading stdin when FILE is not readable.
_cfg_list() {
	[[ -r "$2" ]] || return 1
	awk -v key="$1" 'BEGIN {FS="="} {if ($1 ~ ("^" key)) print $2}' "$2"
}

# Parse run_info.txt file for variables
[[ -r "$1" ]] || echo "ERROR: cannot read config file '$1'" >&2
PROJECT_NAME=$(_cfg_value PROJECT_NAME "$1")
SEQ_DIR=$(_cfg_value SEQ_DIR "$1")
SEQ_TYPE=$(_cfg_value SEQ_TYPE "$1")
# The list-valued keys are intentionally left unquoted so that the
# whitespace-separated entries become individual array elements.
END1_SEQ=( $(_cfg_list END1_SEQ "$1") )
END2_SEQ=( $(_cfg_list END2_SEQ "$1") )
SEQ_SUFFIX=$(_cfg_value SEQ_SUFFIX "$1")
WORK_DIR=$(_cfg_value WORK_DIR "$1")
FILTER_TYPE=$(_cfg_value FILTER_TYPE "$1")
PEAK_CALLER=$(_cfg_value PEAK_CALLER "$1")
IP_FILE=( $(_cfg_list IP_FILE "$1") )
INPUT_FILE=( $(_cfg_list INPUT_FILE "$1") )
PKGENE_NEIGHDIST=$(_cfg_value PKGENE_NEIGHDIST "$1")
tool_info=$(_cfg_value TOOL_INFO "$1")

# Optional per-sample labels (the config key is spelled INPUT_LABLES)
LABLE=( $(_cfg_list INPUT_LABLES "$1") )

# Parse tool_info.txt file for variables
[[ -r "$tool_info" ]] || echo "ERROR: cannot read TOOL_INFO file '$tool_info'" >&2
FASTQC=$(_cfg_value FASTQC "$tool_info")
BWA_REF=$(_cfg_value BWA_REF "$tool_info")
BWA_PATH=$(_cfg_value BWA_PATH "$tool_info")
MACS_PATH=$(_cfg_value MACS_PATH "$tool_info")
MACS_PYTHON=$(_cfg_value MACS_PYTHON "$tool_info")
## On internal system, need to find correct modules: find /usr/local/biotools/python/ -name _sqlite3.so
export PATH=/usr/local/biotools/python/2.7.3/bin/:$PATH
export PYTHONPATH=$MACS_PYTHON
export LD_LIBRARY_PATH=/usr/local/biotools/python/2.7.3/lib:$LD_LIBRARY_PATH
# Log which python interpreter version is picked up after the PATH change
python -V
SICER=$(_cfg_value SICER "$tool_info")
SAMTOOLS=$(_cfg_value SAMTOOLS "$tool_info")
BEDTOOLS=$(_cfg_value BEDTOOLS "$tool_info")
PICARD=$(_cfg_value PICARD "$tool_info")
GENOME_TABLE=$(_cfg_value GENOME_TABLE "$tool_info")
TCLR_LIST=$(_cfg_value TCLR_LIST "$tool_info")
UCSC_REF_FLAT=$(_cfg_value UCSC_REF_FLAT "$tool_info")
MACS2_ARGS=$(_cfg_value MACS2_ARGS "$tool_info")
FRAGMENT_SIZE=$(_cfg_value FRAGMENT_SIZE "$tool_info")
REMOVE_DUP=$(_cfg_value REMOVE_DUP "$tool_info")
MAP_SE_ARGS=$(_cfg_value MAP_SE_ARGS "$tool_info")
MAP_PE_ARGS=$(_cfg_value MAP_PE_ARGS "$tool_info")
MAP_BOTH_ARGS=$(_cfg_value MAP_BOTH_ARGS "$tool_info")
# SOURCE_DIR is read from the CHIPSEQ_DIR key
SOURCE_DIR=$(_cfg_value CHIPSEQ_DIR "$tool_info")

# Constants used in this file
MAP_OUTDIR=$WORK_DIR/mapout
MACS2_OUTDIR=$WORK_DIR/macs2out

# _array_job_index
#   Print the zero-based sample index for this job: SGE_TASK_ID - 1 when
#   SGE_TASK_ID is a positive integer, otherwise 0. Grid Engine sets
#   SGE_TASK_ID to the literal string "undefined" for non-array jobs; fed
#   into shell arithmetic that would evaluate to -1, so anything that is not
#   a positive integer is treated as "not an array job".
_array_job_index() {
	if [[ "$SGE_TASK_ID" =~ ^[1-9][0-9]*$ ]]
	then
		echo $(( SGE_TASK_ID - 1 ))
	else
		echo 0
	fi
}

ARRAY_JOB_INDEX=$(_array_job_index)
echo "Job Index = $ARRAY_JOB_INDEX"

# Error checking:

# PKGENE_NEIGHDIST is passed on to find_nearby_genes.pl later in this script;
# warn up front when it is empty rather than aborting the run.
if [[ -z "$PKGENE_NEIGHDIST" ]]; then
	echo "WARNING: PKGENE_NEIGHDIST is null, will result in missing *peak_vs_gene.xls files."
fi

##################################### start peak calling ###################################
# Runs only when PEAK_CALLER is "macs2noidr". For the IP/input pair selected
# by ARRAY_JOB_INDEX this:
#   1. runs "macs2 callpeak" on the two BAM files found under MAP_OUTDIR,
#   2. copies a .narrowPeak result to the older .encodePeak name and writes a
#      .bed file from it,
#   3. strips the output-path prefix from the peak/summit files and adds a
#      track header line to the .bed file,
#   4. runs find_nearby_genes.pl to write the *_peak_vs_gene.xls table.
# The script exits with status 1 if SEQ_TYPE is neither PE nor SE or if macs2
# fails, instead of post-processing missing or stale output.
if [[ "$PEAK_CALLER" == "macs2noidr" ]]
then

	mkdir -p "$MACS2_OUTDIR"

	# Sample names: file name without directory and without ".SEQ_SUFFIX"
	SEQ1NAME=$( basename "${IP_FILE[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}" )
	SEQ2NAME=$( basename "${INPUT_FILE[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}" )

	#### Determine whether labels are needed, based on duplicated IP file names
	NUM_IP=${#IP_FILE[@]}
	UNIQ_IP=( $(printf '%s\n' "${IP_FILE[@]}" | sort -u) )
	NUM_UNIQ_IP=${#UNIQ_IP[@]}

	BASE_SUFFIX="${SEQ1NAME}.${SEQ_TYPE}"
	if (( NUM_UNIQ_IP < NUM_IP ))
	then
		# The same IP file is listed more than once: append the label so
		# that the output names of the individual jobs do not collide.
		BASE_SUFFIX="${SEQ1NAME}.${SEQ_TYPE}.${LABLE[ARRAY_JOB_INDEX]}"
	fi

	# File names shared by the messages and commands below
	BAM_SUFFIX="${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}.s1.bam"
	IP_BAM="${SEQ1NAME}.${BAM_SUFFIX}"
	INPUT_BAM="${SEQ2NAME}.${BAM_SUFFIX}"
	MACS2_PREFIX="${MACS2_OUTDIR}/${BASE_SUFFIX}_macs2"

	echo " "
	echo "$PEAK_CALLER will be used to call peaks from IP ${IP_BAM} and input ${INPUT_BAM}"

	# Choose the value passed to macs2 as --bw
	if [[ "$SEQ_TYPE" == "PE" ]]
	then
		echo " "
		echo "$PEAK_CALLER is used to call peaks from ${IP_BAM}"
		echo "versus input ${INPUT_BAM}, $(date)"
		echo " "

		# Column 1 of the row of the .size.txt file that has the largest
		# value in column 2.
		INSERT_SIZE=$( sort -k2,2n "${MAP_OUTDIR}/${IP_BAM}.size.txt" | tail -n 1 | cut -f 1 | awk '{print $1}' )

		echo " "
		echo "They are paired-end sequences"
		echo "Estimate median fragment size from ${IP_BAM}.size.txt"
		echo "Set $INSERT_SIZE bp as median of the estimated fragment sizes "
		echo " "

		BANDWIDTH=$INSERT_SIZE

	elif [[ "$SEQ_TYPE" == "SE" ]]
	then
		# The unquoted echo trims surrounding whitespace from the config value
		BANDWIDTH=$( echo $FRAGMENT_SIZE )

		echo " "
		echo "$PEAK_CALLER is used to call peaks from ${IP_BAM}"
		echo "versus input ${INPUT_BAM}, $(date)"
		echo "They are single-end sequences, set ${BANDWIDTH} as bandwidth, use 1/2 of the bandwidth as shift size "
		echo " "

	else
		echo "ERROR: unsupported SEQ_TYPE '${SEQ_TYPE}' (expected PE or SE)" >&2
		exit 1
	fi

	# MACS2_ARGS is intentionally unquoted: it holds several command-line
	# options that must be split into separate words.
	if ! "${MACS_PATH}/macs2" callpeak --bdg -t "${MAP_OUTDIR}/${IP_BAM}" -c "${MAP_OUTDIR}/${INPUT_BAM}" -n "$MACS2_PREFIX" $MACS2_ARGS --bw="$BANDWIDTH"
	then
		echo "ERROR: macs2 callpeak failed for ${BASE_SUFFIX}" >&2
		exit 1
	fi

	# For newer versions of macs2, the output file has the extension .narrowPeak instead of .encodePeak
	# Copy the new extension to the older naming convention so later functions work.
	filename="${MACS2_PREFIX}_peaks.narrowPeak"
	if [[ -f "$filename" ]]
	then
		cp "$filename" "${filename%.narrowPeak}.encodePeak"
		cut -f 1-4,9 "$filename" > "${filename%.narrowPeak}.bed"
	fi

	# Remove the "<MACS2_OUTDIR>/<BASE_SUFFIX>_macs2_" prefix wherever it
	# occurs inside the peak and summit files (edited in place).
	Peak_Label=${BASE_SUFFIX}_macs2_
	perl -p -i -e "s#${MACS2_OUTDIR}/${Peak_Label}##g" "${MACS2_PREFIX}_peaks."*
	perl -p -i -e "s#${MACS2_OUTDIR}/${Peak_Label}##g" "${MACS2_PREFIX}_summits.bed"

	# Insert a track header before the first line of the .bed file; the
	# substitution only applies when that line starts with "chr".
	TRACK_NAME="track name=\"${Peak_Label}peaks.bed\" description=\"${Peak_Label}peaks.bed\""
	perl -p -i -e "s#^chr#${TRACK_NAME}\nchr# if \$. == 1" "${MACS2_PREFIX}_peaks.bed"

	### for MACS2 output files (without IDR analysis), use columns 1-3 and column 8
	### of the .encodePeak file as input for find_nearby_genes.pl
	awk '{print $1"\t"$2"\t"$3"\t"$8}' "${MACS2_PREFIX}_peaks.encodePeak" > "${MACS2_OUTDIR}/${BASE_SUFFIX}_peaksBED.tmp"
	# PKGENE_NEIGHDIST is intentionally unquoted: when it is empty no argument
	# is passed at all, exactly as before.
	"${SOURCE_DIR}/find_nearby_genes.pl" "${MACS2_OUTDIR}/${BASE_SUFFIX}_peaksBED.tmp" "$UCSC_REF_FLAT" "${MACS2_OUTDIR}/${BASE_SUFFIX}_peak_vs_gene.xls" $PKGENE_NEIGHDIST
	rm -f "${MACS2_OUTDIR}/${BASE_SUFFIX}_peaksBED.tmp"
fi

# Closing banner: a line holding a single space, the completion message with
# the project name and current date, and another single-space line.
printf ' \n'
printf '%s\n' "Chipseq peak calling for project $PROJECT_NAME completed, $(date)"
printf ' \n'

