#!/bin/bash
# 12/30/2012, Huihuang Yan
### Need a better fix than this...(below)
# On internal system, need to find correct modules: find /usr/local/biotools/python/ -name _sqlite3.so
## export PYTHONPATH=/projects/bsi/bictools/apps/chipseq/ceas/1.0.2/CEAS/lib/python2.7/site-packages/:/usr/local/biotools/python/2.7.3-centos6/lib/python2.7/lib-dynload/
set -x

#### ARGS
# $1 is the run configuration file (KEY=VALUE lines). Its TOOL_INFO entry is
# the path of a second KEY=VALUE file from which tool-level settings are read.
config_file=${1:-}

#######################################
# Print the value stored under a key in a KEY=VALUE file.
# Arguments: $1 - key name; must match as a whole word at the start of a line
#            $2 - path of the file to search
# Outputs:   the second '='-separated field of every matching line on stdout;
#            a diagnostic on stderr when the file cannot be read
# Returns:   1 when the file is not readable, otherwise the pipeline status
#######################################
cfg_value() {
  local key=$1 file=$2
  # Without this guard an empty path would leave grep reading stdin.
  if [[ ! -r "$file" ]]; then
    echo "cfg_value: cannot read '${file}' (looking up ${key})" >&2
    return 1
  fi
  grep -w -- "^${key}" "$file" | cut -d '=' -f2
}

tool_info=$(cfg_value TOOL_INFO "$config_file")
SEQ_TYPE=$(cfg_value SEQ_TYPE "$config_file")
REMOVE_DUP=$(cfg_value REMOVE_DUP "$tool_info")
FILTER_TYPE=$(cfg_value FILTER_TYPE "$config_file")
STEP_SIZE=$(cfg_value STEP_SIZE "$tool_info")
PEAK_CALLER=$(cfg_value PEAK_CALLER "$config_file")
CEAS_PATH=$(cfg_value CEAS_PATH "$tool_info")
CEAS_PYTHON=$(cfg_value CEAS_PYTHON "$tool_info")
CEAS_REF=$(cfg_value CEAS_REF "$tool_info")
WORK_DIR=$(cfg_value WORK_DIR "$config_file")

IDR_CUTOFF=$(cfg_value IDR_CUTOFF "$tool_info")

## On internal system, need to find correct modules: find /usr/local/biotools/python/ -name _sqlite3.so
#export PATH=/usr/local/biotools/python/2.7.3/bin/:$PATH
#export PYTHONPATH=$CEAS_PYTHON:/usr/local/biotools/python/2.7.3-centos6/lib/python2.7/lib-dynload/
#export LD_LIBRARY_PATH=/usr/local/biotools/python/2.7.3/lib:$LD_LIBRARY_PATH

########### ceas args
RUN_CEAS=$(cfg_value RUN_CEAS "$config_file")

CEAS_ARGS=""
CEAS_GENE=()
# awk silently falls back to stdin when given an empty file operand, so the
# two awk-based lookups only run against a readable configuration file.
if [[ -r "$config_file" ]]; then
  # The extra CEAS options sit on a line starting with ">CEAS_ARG"; the line is
  # split on the literal "S=" rather than on '=' (presumably so that options
  # of the form --opt=value are not cut at their own '=').
  CEAS_ARGS=$(awk 'BEGIN {FS="S="} {if ($1 ~ /^>CEAS_ARG/) print $2}' "$config_file")
  # Word splitting is intentional: the whitespace-separated gene-list names
  # become the elements of the array.
  # shellcheck disable=SC2207
  CEAS_GENE=( $(awk 'BEGIN {FS="="} {if ($1 ~ /^CEAS_GENE_LIST/) print $2}' "$config_file") )
fi
echo "CEAS ARGS = $CEAS_ARGS"
CEAS_PEAK_CUTOFF=$(cfg_value CEAS_PEAK_CUTOFF "$config_file")
CEAS_GENE_DIR=$(cfg_value CEAS_GENE_DIR "$config_file")

# Input (mapping) and output (CEAS) directories below the working directory.
MAP_OUTDIR=$WORK_DIR/mapout
CEAS_OUTDIR=$WORK_DIR/ceasout

### CEAS requires its own modules + core modules that include _sqlite3.so
#export PYTHONPATH=$CEAS_PATH:$SYSTEMSPECIFIC_PYMOD:$PYTHONPATH

#echo "P-PATH: $PYTHONPATH"
#echo "SGE-ID: $SGE_TASK_ID"
# The SGE task id is used further down to pick which sorted peak file this
# run processes, so refuse to continue without it.
if [[ -z "${SGE_TASK_ID:-}" ]]; then
  echo "BAD ARRAY INDEX, no SGE_TASK_ID"
  exit 1
fi


echo "Start CEAS analysis, $(date)"
# -p makes this a no-op when the directory already exists, including when
# another task creates it between a test and the mkdir; everything below
# writes into this directory, so a failure here is fatal.
if ! mkdir -p -- "${CEAS_OUTDIR}"; then
  echo "Cannot create output directory '${CEAS_OUTDIR}'" >&2
  exit 1
fi

## check user-provided gene list

#######################################
# Build the comma-separated gene-group strings from the configured lists.
# Globals:   CEAS_GENE (read)     - array of gene-list file names
#            CEAS_GENE_DIR (read) - directory prefixed to each file name
#            GENE_LIST (written)  - "DIR/file1,DIR/file2,..."
#            GENE_LABEL (written) - "file1,file2,...", one label per list so
#                                   the two strings always have equal length
# Arguments: none
#######################################
build_gene_groups() {
  local IFS=,            # "${array[*]}" joins on the first character of IFS
  local gene_file
  local -a gene_paths=()
  for gene_file in "${CEAS_GENE[@]}"; do
    gene_paths+=( "${CEAS_GENE_DIR}/${gene_file}" )
  done
  GENE_LIST="${gene_paths[*]}"
  GENE_LABEL="${CEAS_GENE[*]}"
}

if [[ ${#CEAS_GENE[@]} -ge 1 ]]; then
  echo -e "\nThere are ${#CEAS_GENE[@]} gene lists, which are:\n${CEAS_GENE[*]}"
  build_gene_groups
fi

## sort peaks
## consider macs2 with or without idr analysis

# Each helper below prints a five-column, tab-separated peak table on stdout,
# ordered by its last column (numeric, descending).

# macs2 encodePeak file ($1): order rows by column 8, keep columns 1-3,7,8.
# NOTE(review): column 8 is presumably the p-value score, going by the
# ".peak.pvalue.sorted" output name - confirm against the macs2 output format.
macs2_peak_table() {
  sort -k8,8nr -- "$1" | cut -f 1-3,7,8
}

# macs2 + IDR encodePeak file ($1): same as macs2_peak_table, but only rows
# whose column 4 is <= the cutoff given as $2 are kept.
macs2_idr_peak_table() {
  awk -v idr="$2" 'BEGIN {FS="\t"; OFS="\t"} {if ($4 <= idr) print $0}' "$1" \
    | sort -k8,8nr | cut -f 1-3,7,8
}

# SICER islands-summary file ($1): keep rows with column 7 >= 1.95, emit
# columns 1-3,7 plus -log10(column 8); a column-8 value of 0 is replaced by
# the fixed score 2000 because its logarithm is undefined.
sicer_peak_table() {
  awk 'BEGIN {FS="\t"; OFS="\t"} {if ($7 >=1.95) print $1,$2,$3,$7,$8}' "$1" \
    | awk 'BEGIN {FS="\t"; OFS="\t"} {if ($5 == 0) print $1,$2,$3,$4,"2000"; else print $1,$2,$3,$4,-log($5)/log(10)}' \
    | sort -k5,5nr
}

# For every peak file of the configured caller: write its sorted table to
# CEAS_OUTDIR and copy the matching normalised wig track next to it.
# An unmatched glob stays as the literal pattern, so each loop skips entries
# that do not exist instead of creating a bogus "*...peak.pvalue.sorted" file.
case "$PEAK_CALLER" in
  macs2noidr)
    MACS2_OUTDIR=$WORK_DIR/macs2out
    echo -e "\nThese are macs2 peaks:"

    for peak_file in "${MACS2_OUTDIR}"/*."${SEQ_TYPE}"_macs2_peaks.encodePeak; do
      [[ -e "$peak_file" ]] || continue
      basename "$peak_file"
      FILENAME=$( basename "$peak_file" ".${SEQ_TYPE}_macs2_peaks.encodePeak" )
      sample_prefix=${FILENAME}.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}
      macs2_peak_table "$peak_file" > "${CEAS_OUTDIR}/${sample_prefix}.peak.pvalue.sorted"
      cp "${MAP_OUTDIR}/${sample_prefix}.s1.bam.s${STEP_SIZE}_total_based_norm.wig.gz" "${CEAS_OUTDIR}"
    done
    ;;

  macs2idr)
    IDR_OUTDIR=$WORK_DIR/idrout
    echo -e "\nThese are macs2 peaks:"
    for peak_file in "${IDR_OUTDIR}"/*."${SEQ_TYPE}"_macs2_idr_peaks.encodePeak; do
      [[ -e "$peak_file" ]] || continue
      basename "$peak_file"
      FILENAME=$( basename "$peak_file" ".${SEQ_TYPE}_macs2_idr_peaks.encodePeak" )
      sample_prefix=${FILENAME}.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}
      macs2_idr_peak_table "$peak_file" "$IDR_CUTOFF" > "${CEAS_OUTDIR}/${sample_prefix}.peak.pvalue.sorted"
      # Trailing glob: picks up the track whether or not it is compressed.
      cp "${MAP_OUTDIR}/${sample_prefix}.s1.bam.s${STEP_SIZE}_total_based_norm.wig"* "${CEAS_OUTDIR}"
    done
    ;;

  sicer)
    SICER_OUTDIR=$WORK_DIR/sicerout
    echo " "
    echo "These are sicer peaks:"
    for peak_file in "${SICER_OUTDIR}"/*"${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}".s1-W*-G*-islands-summary-FDR*[0-9]; do
      [[ -e "$peak_file" ]] || continue
      basename "$peak_file"
      # Sample name: blank the last '.'-separated field of the file name, then
      # delete the ".SEQ.FILTER.DUP." infix (the dots in the sed pattern are
      # regex wildcards, so they match any single character).
      FILENAME=$( basename "$peak_file" | awk 'BEGIN {FS="\."; OFS="\."} {$NF=""}1' | sed "s#.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}.##g" )
      sample_prefix=${FILENAME}.${SEQ_TYPE}.${FILTER_TYPE}.${REMOVE_DUP}
      sicer_peak_table "$peak_file" > "${CEAS_OUTDIR}/${sample_prefix}.peak.pvalue.sorted"
      cp "${MAP_OUTDIR}/${sample_prefix}.s1.bam.s${STEP_SIZE}_total_based_norm.wig"* "${CEAS_OUTDIR}"
    done
    ;;
esac

#### Parallel Index [SGE]
# Turn the task id into a 0-based index into the peak-file array (assumes
# task ids start at 1 - confirm against the qsub -t range used to submit).
# Arithmetic expansion stores the number itself rather than the text "N-1".
i=$(( SGE_TASK_ID - 1 ))

#### gunzip wig files
echo -e "\nThese are normalized wig files:"
# An unmatched glob stays literal; only call gunzip when something matched.
gz_files=( "${CEAS_OUTDIR}"/*gz )
if [[ -e "${gz_files[0]}" ]]; then
  gunzip -f -- "${gz_files[@]}"
fi

#### run CEAS
# Collect the sorted peak tables by glob expansion (sorted by name) instead
# of parsing ls output; an unmatched glob stays literal, so reset to empty.
PEAK_FILE=( "${CEAS_OUTDIR}"/*.peak.pvalue.sorted )
if [[ ! -e "${PEAK_FILE[0]}" ]]; then
  PEAK_FILE=()
fi

echo -e "\nList of sorted peak files:\n${PEAK_FILE[*]}"

# $i may hold either a number or the unevaluated text "N-1"; arithmetic
# expansion evaluates both to the same index.
peak_index=$(( i ))
selected_peak=""
if (( ${#PEAK_FILE[@]} > 0 )); then
  selected_peak=${PEAK_FILE[peak_index]:-}
fi
# Without a file operand awk would read stdin, so stop when the index does not
# select a file (no peak files at all, or a task id beyond their number).
if [[ -z "$selected_peak" ]]; then
  echo "No sorted peak file for task index ${peak_index} (${#PEAK_FILE[@]} file(s) in ${CEAS_OUTDIR})" >&2
  exit 1
fi

FILENAME=$( basename "$selected_peak" .peak.pvalue.sorted )
# Number of rows whose column 5 reaches the cutoff; it becomes part of the
# BED file name below.
NUMBER_PEAK=$( awk -v e4="$CEAS_PEAK_CUTOFF" 'BEGIN {FS="\t"} $5 >= e4 {n++} END {print n+0}' "$selected_peak" )

echo -e "\nSelected Peaks from $FILENAME: $NUMBER_PEAK\n"

# Write the selected rows as a 3-column BED, ordered by column 1 and then
# numerically by column 2.
awk -v e4="$CEAS_PEAK_CUTOFF" 'BEGIN {FS="\t"; OFS="\t"} {if ($5 >=e4) print $1,$2,$3}' "$selected_peak" \
  | sort -k1,1 -k2,2n > "${CEAS_OUTDIR}/${FILENAME}.${NUMBER_PEAK}peak.bed"

# Switch off the command tracing enabled at the top of the script.
set +x

# ceas is started from inside the output directory (presumably so that the
# files it names after --name land there); do not run it from anywhere else.
if ! cd "$CEAS_OUTDIR"; then
  echo "Cannot change into output directory '${CEAS_OUTDIR}'" >&2
  exit 1
fi

# Build the command line once as an array. The wig glob and CEAS_ARGS are
# deliberately left unquoted: the glob must expand to the sample's wig track
# and CEAS_ARGS must split into individual options.
# shellcheck disable=SC2206
ceas_cmd=(
  "${CEAS_PATH}/ceas"
  -b "${CEAS_OUTDIR}/${FILENAME}.${NUMBER_PEAK}peak.bed"
  -w "${CEAS_OUTDIR}/${FILENAME}".*.wig
  -g "$CEAS_REF"
  --name="${FILENAME}_ceas"
  ${CEAS_ARGS}
)

if [[ ${#CEAS_GENE[@]} -lt 1 ]]; then
  echo "Single Gene List"
else
  echo "Multiple Gene Lists"
  # User-supplied gene groups and their labels (comma-separated strings).
  ceas_cmd+=( --gn-groups="$GENE_LIST" --gn-group-names="$GENE_LABEL" )
fi

"${ceas_cmd[@]}"
ceas_status=$?
if (( ceas_status != 0 )); then
  echo "ceas exited with status ${ceas_status}" >&2
fi


echo "Finish CEAS analysis, $(date)"
# Hand the ceas result to the scheduler instead of always reporting success.
exit "$ceas_status"

