#!/bin/bash
# tested for human and mouse data -> use SGE
# 08/13/2013, 08/16/2013, 08/17/2013, 08/18/2013, 08/20/2013, 08/21/2013 
# it generated the same results as v2
# Only "createRegulatoryDomains" is used from GREAT; R is used for the binomial test and FDR estimation.
# Can take one or multiple annotation source files and works on one or multiple peak files.
# Trace every command as it is executed.
set -x

# Exactly one argument is required: the run configuration file.
# A wrong invocation is reported on stderr and ends with a non-zero status
# so that the caller can detect the failure.
if [[ $# -ne 1 ]]; then
  echo "usage: pkcall.sh <config file>" >&2
  exit 1
fi

# Resolve the tool_info file named by the TOOL_INFO key of the run config ($1),
# then read the tool locations from it. Every entry is expected in KEY=value
# form; the text between the first and second '=' is taken as the value.
TOOL_INFO=$(grep -w '^TOOL_INFO' "$1" | cut -d '=' -f2)

# Without this guard an empty TOOL_INFO would leave the greps below without a
# usable file operand.
if [[ -z "${TOOL_INFO}" ]]; then
  echo "error: TOOL_INFO is not defined in config file: $1" >&2
  exit 1
fi

SOURCE_DIR=$(grep -w '^CHIPSEQ_DIR' "${TOOL_INFO}" | cut -d '=' -f2)
BEDTOOLS=$(grep -w '^BEDTOOLS' "${TOOL_INFO}" | cut -d '=' -f2)
R_PATH=$(grep -w '^R_PATH' "${TOOL_INFO}" | cut -d '=' -f2)
GENOME_TABLE=$(grep -w '^GENOME_TABLE' "${TOOL_INFO}" | cut -d '=' -f2)

## Helper scripts invoked by this pipeline
ADDCOLUMNS="${SOURCE_DIR}/addCols.pl"


################ tool_info (new)
#### "REF_DIR" still needs to be replaced with "/data2/bsi/reference/chipseq/ref_files"
GREAT_PATH="/projects/bsi/bictools/apps/chipseq/great/2.0.2"
# Arguments handed to createRegulatoryDomains; they are also folded into the
# names of the generated files.
GREAT_ARGS=(
  basalPlusExtension
  -maxExtension=200000
  -basalUpstream=5000
  -basalDownstream=1000
)

REF_DIR="/data2/bsi/staff_analysis/m105265/packages/ChIPseq_v2_SOURCE"
ANNO_METHOD="qvalue"
ANNO_R="${REF_DIR}/Anno.SigTest.R"

# --- human data (active) ---
GENE_TSS="${REF_DIR}/Hg19.great2.0.genes.v2.txt"
TEST_TERM=(hs.GO_annotation.txt)
ANTI_GAP="${REF_DIR}/hg19.non_gap.bed"

# --- mouse data (uncomment to use instead of the human settings) ---
#GENE_TSS="${REF_DIR}/Mm10.great2.0.genes.v2.txt"
#TEST_TERM=(Mm.GO_annotation.txt)
#ANTI_GAP="${REF_DIR}/mm10.non_gap.bed"


################ run_info (copy from ChIP-Seq v2)
PEAK_CALLER="macs2noidr"
WORK_DIR="/data2/bsi/staff_analysis/m105265/test_GREAT"
MACS2_OUTDIR="${WORK_DIR}/macs2out"


################ run_info (new)
RUN_GREAT="Yes"
GREAT_OUTDIR="${WORK_DIR}/greatout7"
# One entry per annotation source; entry k pairs with TEST_TERM[k].
ANNO_TYPE=(GO)

# --- human data (active): peak files located in MACS2_OUTDIR ---
INFILE=(
  NS-1-mi.FCD19BEACXX_L8_IGCCAAT.PE_macs2_peaks.encodePeak
  si-1-mi.FCD19BEACXX_L8_ICAGATC.PE_macs2_peaks.encodePeak
)

# --- mouse data ---
#INFILE=(36b_H3K4me3.FCD1NDUACXX_L5_R1_ICAGATC.SE_macs2_peaks.encodePeak 36b_H3K4me3.FCD1NDUACXX_L5_R1_ICGATGT.SE_macs2_peaks.encodePeak)

echo " "
echo "Start functional analysis for ChIP-seq peaks, $(date)"


################ check outdir
# -p also creates missing parent directories (e.g. a WORK_DIR that does not
# exist yet) and does not fail if the directory appears in the meantime.
if [[ ! -d "${GREAT_OUTDIR}" ]]; then
  mkdir -p "${GREAT_OUTDIR}"
fi


# Zero-based index derived from the SGE array task id; it becomes part of the
# names of the regulatory-domain files generated below.
# When SGE_TASK_ID is unset or empty the result is -1.
i=$(( SGE_TASK_ID - 1 ))


#############################################################################
################ create gene regulatory domains
#
# Runs GREAT's createRegulatoryDomains on GENE_TSS (only when RUN_GREAT is
# "Yes" and the output does not exist yet) and writes
#   ${GREAT_OUTDIR}/${FNAME}.${GREAT_VAR}.$i.RegDoms
# where column 2 values of exactly 0 are replaced by 1.

# Gene TSS file name without directory and ".txt" suffix.
FNAME=$( basename "${GENE_TSS}" .txt )

# Compact tag built from GREAT_ARGS: option names are dropped and the remaining
# words are joined with "_" (e.g. basalPlusExtension_200000_5000_1000).
GREAT_VAR=$( printf '%s\n' "${GREAT_ARGS[*]}" \
  | sed -e 's#-maxExtension=##' -e 's#-basalUpstream=##' -e 's#-basalDownstream=##' \
  | tr -s " " "_" )

REGDOMS_FILE="${GREAT_OUTDIR}/${FNAME}.${GREAT_VAR}.$i.RegDoms"

if [[ "${RUN_GREAT}" = "Yes" ]]; then
  if [[ ! -f "${REGDOMS_FILE}" ]]; then

    echo " "
    echo "start to generate gene regulatory domains file:"
    echo "${FNAME}.${GREAT_VAR}.$i.RegDoms"

    # Stop here if the tool fails or produces nothing; otherwise an empty
    # RegDoms file would be written and treated as valid by later runs.
    if ! "${GREAT_PATH}/createRegulatoryDomains" "${GENE_TSS}" "${GENOME_TABLE}" \
        "${GREAT_ARGS[@]}" "${REGDOMS_FILE}.temp" \
        || [[ ! -s "${REGDOMS_FILE}.temp" ]]; then
      echo "error: createRegulatoryDomains failed for ${GENE_TSS}" >&2
      rm -f -- "${REGDOMS_FILE}.temp"
      exit 1
    fi

    # Replace a column-2 value of exactly 0 with 1.
    awk 'BEGIN {FS=OFS="\t"} {gsub(/^0$/, "1",$2); print $0}' \
      "${REGDOMS_FILE}.temp" > "${REGDOMS_FILE}"

    rm -f -- "${REGDOMS_FILE}.temp"

  else
    echo "File ${FNAME}.${GREAT_VAR}.$i.RegDoms already existed!"
  fi
fi



#############################################################################
################ assign one or multiple sets of ontology terms to regulatory domains
#### output: ${FNAME}.${GREAT_VAR}.$i.RegDoms.${ANNO_TYPE[k]}.txt
#
# For every annotation type k, addCols.pl combines the RegDoms file with the
# matching term file TEST_TERM[k]. Existing outputs are reused.
# NOTE(review): "-ic 3" / "-rc 3" presumably select the key column of the input
# and of the reference file -- confirm against addCols.pl.

for ((k=0; k<${#ANNO_TYPE[@]}; k=k+1)); do

  REGDOMS_ANNO_NAME="${FNAME}.${GREAT_VAR}.$i.RegDoms.${ANNO_TYPE[k]}.txt"

  if [[ ! -f "${GREAT_OUTDIR}/${REGDOMS_ANNO_NAME}" ]]; then

    # Announce the step before running it so the log reflects what is in progress.
    echo " "
    echo "start to assign ${ANNO_TYPE[k]} terms to gene regulatory domains, output file:"
    echo "${REGDOMS_ANNO_NAME}"

    perl "${ADDCOLUMNS}" -i "${GREAT_OUTDIR}/${FNAME}.${GREAT_VAR}.$i.RegDoms" \
      -ic 3 -r "${REF_DIR}/${TEST_TERM[k]}" -rc 3 \
      -o "${GREAT_OUTDIR}/${REGDOMS_ANNO_NAME}"

  else
    echo "file ${REGDOMS_ANNO_NAME} already existed!"
  fi

done


#############################################################################
################ assign peaks to gene ontology/gene regulatory domains
#
# For every peak file listed in INFILE:
#   1. reduce each peak to a short window (-4/+5 bp) in a 4-column BED file,
#      centred on start + column 10 for 10-column encodePeak input
#      (presumably the summit offset) or on the interval midpoint for BED
#      input with fewer than 10 columns;
#   2. intersect the windows with the annotated regulatory domains;
#   3. per ontology term, count associated peaks and the merged size of its
#      regulatory domains, then join both tables;
#   4. run the R script ANNO_R (binomial test) on the joined table.

# Keep the SGE task index in its own variable. The annotated RegDoms files
# carry that index in their names, so the per-file loop counter below must not
# overwrite it.
TASK_IDX=$i
TAB=$'\t'

################ Total size of non-gap regions
# Independent of the peak file, so it is computed once for all files.
ANTI_GAP_SIZE=$( awk 'BEGIN {FS="\t"} {Total += ($3-$2+1)} END {print Total+0}' "${ANTI_GAP}" )

for ((f=0; f<${#INFILE[@]}; f=f+1)); do

  echo " "
  echo "Use the following input files:"
  echo "peak file: ${INFILE[f]}"
  echo "Annotated ontology term file: ${TEST_TERM[*]}"
  echo "Annotated gene TSS file: $( basename "${GENE_TSS}" )"
  echo "Gap-free region file: $( basename "${ANTI_GAP}" )"


  ################ define the peak center positions
  PEAK_PATH="${MACS2_OUTDIR}/${INFILE[f]}"
  PEAK_FILE=$( basename "${PEAK_PATH}" )
  PEAK_BED="${GREAT_OUTDIR}/${PEAK_FILE}.4col.bed"

  if [[ ! -f "${PEAK_PATH}" ]]; then
    echo "error: peak file not found, skipped: ${PEAK_PATH}" >&2
    continue
  fi

  # Number of tab-separated columns in the first line (0 for an empty file).
  NUM_COLUMN=$( awk 'BEGIN {FS="\t"} NR==1 {n=NF; exit} END {print n+0}' "${PEAK_PATH}" )

  echo " "
  echo "start to reformat peak file into 4-column bed file:"
  echo "${INFILE[f]}"

  if [[ ${NUM_COLUMN} -eq 10 ]]; then

    echo "Original peaks were in encodePeak format from MACS2 output"
    awk 'BEGIN {FS=OFS="\t"} {print $1,($2+$10-4),($2+$10+5),"peak_"NR}' "${PEAK_PATH}" \
      > "${PEAK_BED}"

  elif [[ ${NUM_COLUMN} -lt 10 ]]; then

    echo "peaks are in ${NUM_COLUMN}_column bed format"
    awk 'BEGIN {FS=OFS="\t"} {print $1,int(($2+$3)/2)}' "${PEAK_PATH}" \
      | awk 'BEGIN {FS=OFS="\t"} {print $1,($2-4),($2+5),"peak_"NR}' \
      > "${PEAK_BED}"

  else
    # No conversion rule exists for wider files; skipping avoids analysing a
    # stale or missing .4col.bed file.
    echo "error: unsupported peak format (${NUM_COLUMN} columns), skipped: ${INFILE[f]}" >&2
    continue
  fi


  ################ assign peaks to gene regulatory domains

  for ((k=0; k<${#ANNO_TYPE[@]}; k=k+1)); do

    REGDOMS_ANNO="${GREAT_OUTDIR}/${FNAME}.${GREAT_VAR}.${TASK_IDX}.RegDoms.${ANNO_TYPE[k]}.txt"
    PEAK_VS_REGDOMS="${GREAT_OUTDIR}/${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

    echo " "
    echo "start to assign peaks to gene regulatory domains and ${ANNO_TYPE[k]}, output file:"
    echo "${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

    # Domains without any peak: the overlap count (column 11) is 0; three more
    # "0" columns pad the line to the width of the -wa -wb output below.
    "${BEDTOOLS}/intersectBed" -a "${REGDOMS_ANNO}" -b "${PEAK_BED}" -wa -c \
      | awk 'BEGIN {FS=OFS="\t"} {if ($11 == 0) print $0,"0","0","0"}' \
      > "${PEAK_BED}.nohit"

    # Domains with peaks (one line per domain/peak pair) plus the no-hit domains.
    "${BEDTOOLS}/intersectBed" -a "${REGDOMS_ANNO}" -b "${PEAK_BED}" -wa -wb \
      | cat - "${PEAK_BED}.nohit" \
      | sort -k1,1 -k2,2n -k3,3n -k4,4 \
      > "${PEAK_VS_REGDOMS}"

  done


  #############################################################################
  ################ extract info for binomial test of each ontology term
  #### size of non-gap regions (ANTI_GAP_SIZE)
  #### number of peaks assigned to RegDoms (PEAK_HIT_ALL)

  echo " "
  echo "Total size of non-gap genomic regions is : ${ANTI_GAP_SIZE} bp"


  ################ Total number of peaks assigned to gene regulatory domains
  ################ Number of associated peaks for each ontology term
  ################ total size of regulatory domains per ontology term
  ################ merge data for binomial test

  for ((k=0; k<${#ANNO_TYPE[@]}; k=k+1)); do

    PEAK_VS_REGDOMS="${GREAT_OUTDIR}/${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"
    PEAK_COUNT_FILE="${GREAT_OUTDIR}/${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.peakCount"
    REG_SIZE_FILE="${GREAT_OUTDIR}/${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.regSize"

    ####### Total number of peaks assigned to gene regulatory domains
    # Distinct peak names (column 14) on lines that carry a real peak (column 12 >= 1).
    PEAK_HIT_ALL=$( awk 'BEGIN {FS=OFS="\t"} {if ($12 >=1) print $14}' "${PEAK_VS_REGDOMS}" \
      | sort -u | awk 'END {print NR}' )

    echo " "
    echo "start to extract information for binomial test from file:"
    echo "${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"
    echo "The total number of peaks assigned to gene regulatory domains: ${PEAK_HIT_ALL}"


    ####### Number of associated peaks for each ontology term
    ####### exclude ontology terms with no associated peak
    # De-duplicate (term, peak) pairs first so that a peak is counted once per
    # term, then count lines per term (columns 7-9 describe the term).
    awk 'BEGIN {FS=OFS="\t"} {if (($8 !~ /^0$/) && ($14 !~ /^0$/)) print $7,$8,$9,$14}' \
      "${PEAK_VS_REGDOMS}" \
      | sort -t "${TAB}" -k2,2 -k4,4 | uniq \
      | awk 'BEGIN {FS="\t";OFS="\t"} {a[$1"\t"$2"\t"$3]++}END{for(i in a){print i,a[i]}}' \
      | sort -t "${TAB}" -k2,2 \
      > "${PEAK_COUNT_FILE}"

    ANNO_TERM_COUNT=$( awk 'END {print NR}' "${PEAK_COUNT_FILE}" )

    echo "The total number of ${ANNO_TYPE[k]} with at least one peak per term: ${ANNO_TERM_COUNT}"


    ####### total size of regulatory domains per ontology term
    ####### exclude ontology terms with no associated peak

    echo " "
    echo "start to estimate total size for regulatory domains associated with each ontology term"
    echo "output file: ${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.regSize"

    # Domains are merged per (chromosome, term) by using "<chrom>_<term>" as
    # the BED chromosome name. The term is recovered as the text after the
    # LAST underscore, so chromosome names containing "_" are handled; term
    # ids themselves are assumed to contain no underscore.
    awk 'BEGIN {FS=OFS="\t"} {if ($8 !~ /^0$/) print $1,$2,$3,$8}' "${PEAK_VS_REGDOMS}" \
      | sort -k4,4 -k1,1 -k2,2n -k3,3n | uniq \
      | awk 'BEGIN {FS=OFS="\t"} {print $1"_"$4,$2,$3}' \
      | "${BEDTOOLS}/mergeBed" -i stdin \
      | awk 'BEGIN {FS=OFS="\t"} {n=split($1,part,"_"); a[part[n]]+=($3-$2+1)}END{for(i in a){print i,a[i]}}' \
      | sort -k1,1 \
      > "${REG_SIZE_FILE}"


    ####### merge data
    ## col1-3: ontology_term_id/type/description
    ## col4: total peaks assigned to RegDoms
    ## col5: No_peak for each ontology term
    ## col6-7: total size of RegDoms for each ontology term and its genome fraction

    echo " "
    echo "Start to merge data for binomial test"

    ANNO_IN="${GREAT_OUTDIR}/${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.${ANNO_TERM_COUNT}term.tab"

    join -t "${TAB}" -1 2 -2 1 "${PEAK_COUNT_FILE}" "${REG_SIZE_FILE}" \
      | awk -v peak="${PEAK_HIT_ALL}" -v size="${ANTI_GAP_SIZE}" 'BEGIN {FS=OFS="\t"} {printf("%s\t%s\t%s\t%d\t%d\t%d\t%.6f\n",$1,$2,$3,peak,$4,$5,($5/size))}' \
      > "${ANNO_IN}"


    #rm -rf ${GREAT_OUTDIR}"/"${PEAK_FILE}.4col.bed
    #rm -rf ${GREAT_OUTDIR}"/"${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.txt
    #rm -rf ${GREAT_OUTDIR}"/"${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.peakCount
    #rm -rf ${GREAT_OUTDIR}"/"${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.regSize


    ####### binomial test

    echo " "
    echo "start binomial test for ${PEAK_FILE}.vs.${GREAT_VAR}.RegDoms.${ANNO_TYPE[k]}.${ANNO_TERM_COUNT}term.tab"

    # One log per annotation type and peak-file index.
    ANNO_LOG="${GREAT_OUTDIR}/$( basename "${ANNO_R}" ).${ANNO_TYPE[k]}.log${f}.txt"

    Rscript --slave "${ANNO_R}" infile="${ANNO_IN}" method="${ANNO_METHOD}" > "${ANNO_LOG}"

  done

  echo " "
  echo "Finish functional analysis for ${PEAK_FILE}, $(date)"

done


