#!/bin/bash
# tested for human and mouse data
# 08/13/2013-08/20/2013, 10/14/2013 
# can take one or multiple annotation source files
# work on one or multiple peak files
# for SGE

# Require exactly one argument: the run configuration file.
# A usage error is reported on stderr and ends the script with a non-zero
# status, so the scheduler or a calling script can tell the job did not run.
if [[ $# -ne 1 ]]; then
  echo "usage: gom_annotation.sh <config file>" >&2
  exit 1
fi

#set -x

# Run configuration file given on the command line.
CONFIG_FILE=$1

# get_param KEY FILE
# Print the value(s) of KEY from a "KEY=value" style FILE: every line that
# starts with KEY as a whole word is selected and the text between the first
# and second '=' is printed, with surrounding whitespace removed.
# Returns 1 (message on stderr) when FILE is not a readable regular file.
get_param() {
  local key=$1 file=$2
  if [[ ! -f "${file}" || ! -r "${file}" ]]; then
    echo "ERROR: cannot read config file '${file}' (looking up ${key})" >&2
    return 1
  fi
  grep -w -- "^${key}" "${file}" | cut -d '=' -f2 |
    sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# get_param_list KEY FILE
# Print the second '='-separated field of every line of FILE whose first
# field starts with KEY. The caller word-splits the output into an array.
# Returns 1 (message on stderr) when FILE is not a readable regular file;
# without this guard awk would fall back to reading stdin for an empty name.
get_param_list() {
  local key=$1 file=$2
  if [[ ! -f "${file}" || ! -r "${file}" ]]; then
    echo "ERROR: cannot read config file '${file}' (looking up ${key})" >&2
    return 1
  fi
  awk -v pat="^${key}" 'BEGIN {FS="="} {if ($1 ~ pat) print $2}' "${file}"
}

TOOL_INFO=$(get_param TOOL_INFO "${CONFIG_FILE}")
SOURCE_DIR=$(get_param CHIPSEQ_DIR "${TOOL_INFO}")
BEDTOOLS=$(get_param BEDTOOLS "${TOOL_INFO}")
R_PATH=$(get_param R_PATH "${TOOL_INFO}")
GENOME_TABLE=$(get_param GENOME_TABLE "${TOOL_INFO}")
ORGANISM=$(get_param ORGANISM "${TOOL_INFO}")
WORK_DIR=$(get_param WORK_DIR "${CONFIG_FILE}")
PEAK_CALLER=$(get_param PEAK_CALLER "${CONFIG_FILE}")
IDR_CUTOFF=$(get_param IDR_CUTOFF "${TOOL_INFO}")

## Specific annot params
REF_DIR=$(get_param ANNOTATION_DIR "${TOOL_INFO}")
GENE_TSS=${REF_DIR}/$(get_param GENE_TSS "${TOOL_INFO}")
# Word splitting is intended here: each whitespace-separated value becomes
# one array element.
TEST_TERM=( $(get_param_list TEST_TERM "${TOOL_INFO}") )
ANNO_TYPE=( $(get_param_list ANNO_TYPE "${TOOL_INFO}") )
#TEST_TERM=(hs.GO_annotation.txt)
ANTI_GAP=$(get_param ANTI_GAP "${TOOL_INFO}")
### Run Info Params
#REG_ARGS=`grep -w '^REG_ARGS' $1 | cut -d '=' -f2`
REG_ARGS=( $(get_param_list REG_ARGS "${CONFIG_FILE}") )
ANNO_METHOD=$(get_param ANNO_METHOD "${CONFIG_FILE}")
# case insensitive; the character classes are quoted so the shell cannot
# glob-expand them against file names in the current directory
RUN_ANNO=$(get_param RUN_GOM "${CONFIG_FILE}" | tr '[:upper:]' '[:lower:]')


## Helper scripts from the pipeline source directory, and the output folder
## for this step (a sub-directory of the work directory)
ADDCOLUMNS="${SOURCE_DIR}/addCols.pl"
ANNO_R="${SOURCE_DIR}/anntation-sig-test.r"
ANNO_OUTDIR="${WORK_DIR}/geneOntology"

## consider macs2 with or without idr analysis
# Pick the peak directory and file-name suffix for the configured peak
# caller. Any other value is rejected here: the rest of the script needs
# PEAK_OUTDIR and INFILE and cannot do anything useful without them.
case "${PEAK_CALLER}" in
  macs2noidr)
    PEAK_OUTDIR=${WORK_DIR}/macs2out
    PEAK_SUFFIX=_macs2_peaks.encodePeak
    ;;
  macs2idr)
    PEAK_OUTDIR=${WORK_DIR}/idrout
    PEAK_SUFFIX=_macs2_idr_peaks.encodePeak
    ;;
  *)
    echo "ERROR: unsupported PEAK_CALLER '${PEAK_CALLER}' (expected macs2noidr or macs2idr)" >&2
    exit 1
    ;;
esac

# Collect the base names of all matching peak files. The shell glob is used
# instead of parsing `ls`; its results are sorted, so the array index that an
# SGE task uses later selects the same file in every task.
INFILE=()
for peak_path in "${PEAK_OUTDIR}"/*"${PEAK_SUFFIX}"; do
  # an unmatched glob stays literal; skip it
  [[ -e "${peak_path}" ]] || continue
  INFILE+=( "${peak_path##*/}" )
done

if [[ ${#INFILE[@]} -eq 0 ]]; then
  echo "ERROR: no *${PEAK_SUFFIX} files found in '${PEAK_OUTDIR}'" >&2
  exit 1
fi


echo -e "\nStart functional analysis for ChIP-seq peaks, $(date)"

# check outdir (mkdir -p is a no-op when the directory already exists)
if ! mkdir -p "${ANNO_OUTDIR}"; then
  echo "ERROR: cannot create output directory '${ANNO_OUTDIR}'" >&2
  exit 1
fi

# The peak file handled by this job is chosen by the SGE array task id.
if [[ -z ${SGE_TASK_ID} ]]; then
  echo "BAD ARRAY INDEX, no SGE_TASK_ID" >&2
  exit 1
fi

# Grid Engine sets SGE_TASK_ID to the string "undefined" for non-array jobs;
# arithmetic on such a value would silently give index -1.
if [[ ! ${SGE_TASK_ID} =~ ^[1-9][0-9]*$ ]]; then
  echo "BAD ARRAY INDEX, SGE_TASK_ID='${SGE_TASK_ID}' is not a positive integer" >&2
  exit 1
fi

# task ids start at 1, INFILE is indexed from 0
i=$(( SGE_TASK_ID - 1 ))

if (( i >= ${#INFILE[@]} )); then
  echo "BAD ARRAY INDEX, SGE_TASK_ID=${SGE_TASK_ID} but only ${#INFILE[@]} peak file(s) found" >&2
  exit 1
fi


######################################
## create gene regulatory domains
## this is the new code
# NOTE(review): tracing is switched on here and never switched off, so every
# later command of the script is echoed to stderr. Looks like a debugging
# leftover - confirm before removing.
set -x

FNAME=$( basename "${GENE_TSS}" .txt )
# Tag built from the four REG_ARGS values, e.g. U<a>_D<b>_UE<c>_DE<d>
REG_VAR=$( echo "${REG_ARGS[*]}" |awk 'BEGIN {FS=" "; OFS="_"} {print "U"$1,"D"$2,"UE"$3,"DE"$4}' )

# Output file of this section; the .temp1 ... .temp6 intermediates share the
# prefix and are left in place afterwards.
REGDOMS="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.${i}.RegDoms"

# grep pattern selecting the GENOME_TABLE rows (chromosomes) for which
# regulatory domains are built. It used to be the literal "chr17" repeated in
# three pipelines; it can now be overridden from the environment, and the
# default keeps the previous selection.
# NOTE(review): the commented-out pipeline below used the whole genome table,
# so the chr17 restriction is presumably a test setting - confirm.
REGDOM_CHROM_PATTERN=${REGDOM_CHROM_PATTERN:-chr17}

# Read tab-separated records on stdin and append the chromosome length
# (column 2 of GENOME_TABLE, looked up by column 1) as a trailing column.
# Records whose chromosome is not among the selected GENOME_TABLE rows are
# dropped. This replaces `join`, which needs both inputs sorted lexically on
# the join field, whereas the stream here is sorted numerically by
# chromosome; the lookup does not depend on input order.
append_chrom_size() {
  awk 'BEGIN {FS=OFS="\t"} NR==FNR {len[$1]=$2; next} ($1 in len) {print $0,len[$1]}' \
    <(grep -- "${REGDOM_CHROM_PATTERN}" "${GENOME_TABLE}") -
}

if [[ ${RUN_ANNO} = "yes" && ! -f "${REGDOMS}" ]]
then

  echo " "
  echo "start to generate gene regulatory domains file:"
  echo "${FNAME}.${REG_VAR}.$i.RegDoms"


  #### define basal regulatory domain for each TSS
  # U/D are REG_ARGS[0]/[1]; strand in column 3 decides which side each is
  # applied to. Coordinates are then clipped to 1 .. chromosome length.
  # Output columns: chrom, start, end, column 2, column 4, strand
  # (presumably column 2 is the TSS position and column 4 the gene id).

#awk -v U=${REG_ARGS[0]} -v D=${REG_ARGS[1]} 'BEGIN {FS=OFS="\t"} {if ($3 ~ /\+/) print $0,($2-U),($2+D),$1; else if ($3 ~ /\-/) print $0,($2-D),($2+U),$1}' \
#${GENE_TSS} | \
#awk 'BEGIN {FS="\t"; OFS="\t"} {gsub(/chrX/, "23",$7); gsub(/chrY/, "24",$7); gsub(/chrM/, "25",$7); gsub(/chr/, "",$7); print $0}' | \
#sort -k7,7n -k2,2n -k4,4 | \
#cut -f 1-6 | \
#join -1 1 -2 1 -t $'\t' - ${GENOME_TABLE} | \
#awk 'BEGIN {FS=OFS="\t"} {if ($5 <=0) print $1,"1",$6,$2,$4,$3; else if ($6 >=$7) print $1,$5,$7,$2,$4,$3; else print $1,$5,$6,$2,$4,$3}' > \
#${ANNO_OUTDIR}"/"${FNAME}.$REG_VAR.$i.RegDoms.temp1

  awk -v U="${REG_ARGS[0]}" -v D="${REG_ARGS[1]}" 'BEGIN {FS=OFS="\t"} {if ($3 ~ /\+/) print $0,($2-U),($2+D),$1; else if ($3 ~ /\-/) print $0,($2-D),($2+U),$1}' "${GENE_TSS}" |
  awk 'BEGIN {FS="\t"; OFS="\t"} {gsub(/chrX/, "23",$7); gsub(/chrY/, "24",$7); gsub(/chrM/, "25",$7); gsub(/chr/, "",$7); print $0}' |
  sort -k7,7n -k2,2n -k4,4 |
  cut -f 1-6 |
  append_chrom_size |
  awk 'BEGIN {FS=OFS="\t"} {if ($5 <=0) print $1,"1",$6,$2,$4,$3; else if ($6 >=$7) print $1,$5,$7,$2,$4,$3; else print $1,$5,$6,$2,$4,$3}' > \
  "${REGDOMS}.temp1"


  #### regions for extending basal regulatory domain
  # whole chromosomes (all GENOME_TABLE rows) minus the basal domains
  awk '{print $1"\t1\t"$2}' "${GENOME_TABLE}" |
  "${BEDTOOLS}/subtractBed" -a stdin -b "${REGDOMS}.temp1" > \
  "${REGDOMS}.temp2"


  #### upstream basal regulatory domain
  # UE is REG_ARGS[2]; the candidate extension window goes into columns 7-8,
  # is clipped to the chromosome, and is paired with the closest domain-free
  # region from temp2.
  awk -v UE="${REG_ARGS[2]}" 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$2,$4,$4,$5,$6,($2-UE),$2; else if ($6 ~ /\-/) print $1,$4,$3,$4,$5,$6,$3,($3+UE)}' "${REGDOMS}.temp1" |
  append_chrom_size |
  awk 'BEGIN {FS=OFS="\t"} {if ($7 <=0) print $1,$2,$3,$4,$5,$6,"1",$8; else if ($8 >=$9) print $1,$2,$3,$4,$5,$6,$7,$9; else print $1,$2,$3,$4,$5,$6,$7,$8}' |
  "${BEDTOOLS}/closestBed" -a - -b "${REGDOMS}.temp2" > \
  "${REGDOMS}.temp3"


  #### extended upstream basal regulatory domain
  awk 'BEGIN {FS=OFS="\t"} {if (($8<=$10) || ($7>=$11)) print $0,"0","0"; else if (($7<=$10) && ($8<=$11)) print $0,$10,$8; else if (($7<=$10) && ($8>$11)) print $0,$10,$11; else if (($7>$10) && ($8<=$11)) print $0,$7,$8; else if (($7>$10) && ($8>$11)) print $0,$7,$11}' \
  "${REGDOMS}.temp3" |
  awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $0,($8-$13),($12-$7); else print $0,($7-$12),($8-$13)}' |
  awk 'BEGIN {FS=OFS="\t"} {if (($12 ==0) || (($12 >0) && ($14 >=100)) || (($12 >0) && ($14 <=-100))) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$3; else if (($6 ~ /\+/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$12,$3; else if (($6 ~ /\-/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$13}' > \
  "${REGDOMS}.temp4"


  #### downstream basal regulatory domain
  # DE is REG_ARGS[3]; mirror image of the upstream step
  awk -v DE="${REG_ARGS[3]}" 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$4,$3,$4,$5,$6,$3,($3+DE); else if ($6 ~ /\-/) print $1,$2,$4,$4,$5,$6,($2-DE),$2}' \
  "${REGDOMS}.temp1" |
  append_chrom_size |
  awk 'BEGIN {FS=OFS="\t"} {if ($7 <=0) print $1,$2,$3,$4,$5,$6,"1",$8; else if ($8 >=$9) print $1,$2,$3,$4,$5,$6,$7,$9; else print $1,$2,$3,$4,$5,$6,$7,$8}' |
  "${BEDTOOLS}/closestBed" -a - -b "${REGDOMS}.temp2" > \
  "${REGDOMS}.temp5"


  #### extended downstream basal regulatory domain
  awk 'BEGIN {FS=OFS="\t"} {if (($8<=$10) || ($7>=$11)) print $0,"0","0"; else if (($7<=$10) && ($8<=$11)) print $0,$10,$8; else if (($7<=$10) && ($8>$11)) print $0,$10,$11; else if (($7>$10) && ($8<=$11)) print $0,$7,$8; else if (($7>$10) && ($8>$11)) print $0,$7,$11}' \
  "${REGDOMS}.temp5" |
  awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $0,($7-$12),($8-$13); else print $0,($8-$13),($12-$7)}' |
  awk 'BEGIN {FS=OFS="\t"} {if (($12 ==0) || (($12 >0) && ($14 >=100)) || (($12 >0) && ($14 <=-100))) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$3; else if (($6 ~ /\+/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$13; else if (($6 ~ /\-/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$12,$3}' > \
  "${REGDOMS}.temp6"


  #### final extended regulatory domain
  # temp4 (upstream) and temp6 (downstream) are combined line by line, so
  # both must hold the same genes in the same order.
  # NOTE(review): closestBed may presumably emit more than one line per query
  # when several regions tie, which would shift the pairing - confirm.
  paste "${REGDOMS}.temp4" \
  "${REGDOMS}.temp6" |
  awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$9,$20,$5,$4,$6; else if ($6 ~ /\-/) print $1,$19,$10,$5,$4,$6}' > \
  "${REGDOMS}"


elif [[ ${RUN_ANNO} = "yes" && -f "${REGDOMS}" ]]
then
  echo "File ${FNAME}.${REG_VAR}.$i.RegDoms already existed"

fi



#############################################################################
################ assign ontology terms to regulatory domains
#### output: ${FNAME}.${REG_VAR}.$i.RegDoms.${ANNO_TYPE[k]}.txt
#### ANNO_TYPE[k] and TEST_TERM[k] are used as parallel arrays: the k-th
#### annotation type is built from the k-th term file in REF_DIR.

for ((k=0;k<${#ANNO_TYPE[@]};k=k+1))
do

  REGDOMS_IN="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.${i}.RegDoms"
  ANNO_REGDOMS="${REGDOMS_IN}.${ANNO_TYPE[k]}.txt"

  if [[ ! -f "${ANNO_REGDOMS}" ]]
  then

    # announce the step before running it, so the log shows what was being
    # worked on if the helper script fails
    echo " "
    echo "start to assign ${ANNO_TYPE[k]} terms to gene regulatory domains, output file:"
    echo "${FNAME}.${REG_VAR}.$i.RegDoms.${ANNO_TYPE[k]}.txt"

    perl "${ADDCOLUMNS}" -i "${REGDOMS_IN}" \
      -ic 3 -r "${REF_DIR}/${TEST_TERM[k]}" -rc 3 -o "${ANNO_REGDOMS}" ||
      echo "WARNING: $( basename "${ADDCOLUMNS}" ) returned a non-zero status for ${ANNO_TYPE[k]}" >&2

    # every later step reads this file; stop here instead of running the
    # remaining pipeline on a missing or empty table
    if [[ ! -s "${ANNO_REGDOMS}" ]]
    then
      echo "ERROR: annotated regulatory domain file was not created: ${ANNO_REGDOMS}" >&2
      exit 1
    fi

  else
    echo "file ${FNAME}.${REG_VAR}.$i.RegDoms.${ANNO_TYPE[k]}.txt already existed"

  fi

done



#############################################################################
################ assign peaks to gene ontology/gene regulatory domains

################ start analysis for each peak file

# Peak file handled by this task (index i comes from the SGE task id)
PEAK_IN="${PEAK_OUTDIR}/${INFILE[i]}"

echo " "
echo "Use the following input files:"
echo "peak file: ${INFILE[i]}"
echo "Annotated ontology term file: ${TEST_TERM[*]}"
echo "Annotated gene TSS file: $( basename "${GENE_TSS}" )"
echo "Gap-free region file: $( basename "${ANTI_GAP}" )"

if [[ ! -f "${PEAK_IN}" || ! -r "${PEAK_IN}" ]]
then
  echo "ERROR: cannot read peak file '${PEAK_IN}'" >&2
  exit 1
fi


################ define the peak center
#### consider macs2 with or without idr analysis
PEAK_FILE=$( basename "${PEAK_IN}" )
PEAK_BED="${ANNO_OUTDIR}/${PEAK_FILE}.4col.bed"

# number of tab-separated columns on the first line (0 for an empty file)
NUM_COLUMN=$( awk 'BEGIN {FS="\t"} NR==1 {n=NF; exit} END {print n+0}' "${PEAK_IN}" )

echo " "
echo "start to reformat peak file into 4-column bed file:"
echo "${INFILE[i]}"

# Each peak is reduced to a 9 bp window: (centre-4) .. (centre+5).
if [[ ${NUM_COLUMN} -eq 10 ]]
then

  echo "Original peaks were in encodePeak format from MACS2 output"

  # the centre is column 2 plus the offset in column 10
  case "${PEAK_CALLER}" in
    macs2noidr)
      awk 'BEGIN {FS=OFS="\t"} {print $1,($2+$10-4),($2+$10+5),"peak_"NR}' "${PEAK_IN}" > \
      "${PEAK_BED}"
      ;;
    macs2idr)
      # keep only peaks whose column 4 does not exceed IDR_CUTOFF; peak
      # names keep the line number of the unfiltered file
      awk -v idr="${IDR_CUTOFF}" 'BEGIN {FS=OFS="\t"} {if ($4 <= idr) print $1,($2+$10-4),($2+$10+5),"peak_"NR}' "${PEAK_IN}" > \
      "${PEAK_BED}"
      ;;
    *)
      echo "ERROR: unsupported PEAK_CALLER '${PEAK_CALLER}' for encodePeak input" >&2
      exit 1
      ;;
  esac

elif [[ ${NUM_COLUMN} -ge 3 && ${NUM_COLUMN} -lt 10 ]]
then

  # plain bed: the centre is the midpoint of columns 2 and 3
  echo "peaks are in ${NUM_COLUMN}_column bed format"
  awk 'BEGIN {FS=OFS="\t"} {print $1,int(($2+$3)/2)}' "${PEAK_IN}" |
  awk 'BEGIN {FS=OFS="\t"} {print $1,($2-4),($2+5),"peak_"NR}' > \
  "${PEAK_BED}"

else

  # fewer than 3 or more than 10 columns: no known layout. Stop rather than
  # continue with a missing (or stale, from an earlier run) 4-column file.
  echo "ERROR: unsupported peak file format (${NUM_COLUMN} columns): ${PEAK_IN}" >&2
  exit 1

fi



################ assign peaks to gene regulatory domains
#### For every annotation type the output lists each annotated domain once
#### per overlapping peak (domain columns followed by the 4 peak columns);
#### domains without any peak are kept with "0" in the peak columns.

PEAK_BED="${ANNO_OUTDIR}/${PEAK_FILE}.4col.bed"
# scratch file, rewritten for every annotation type
NOHIT_BED="${PEAK_BED}.nohit"

for ((k=0;k<${#ANNO_TYPE[@]};k=k+1))
do

  ANNO_REGDOMS="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.${i}.RegDoms.${ANNO_TYPE[k]}.txt"
  HIT_TABLE="${ANNO_OUTDIR}/${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

  echo " "
  echo "start to assign peaks to gene regulatory domains and ${ANNO_TYPE[k]}, output file:"
  echo "${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

  # domains with an overlap count of 0 (column 11 is the count added by -c),
  # padded with three "0" columns
  "${BEDTOOLS}/intersectBed" -a "${ANNO_REGDOMS}" -b \
  "${PEAK_BED}" -wa -c |
  awk 'BEGIN {FS=OFS="\t"} {if ($11 == 0) print $0,"0","0","0"}' > \
  "${NOHIT_BED}"

  # domain/peak pairs plus the no-hit domains, sorted by position
  "${BEDTOOLS}/intersectBed" -a "${ANNO_REGDOMS}" -b \
  "${PEAK_BED}" -wa -wb |
  cat - "${NOHIT_BED}" |
  sort -k1,1 -k2,2n -k3,3n -k4,4 > \
  "${HIT_TABLE}"


done


#############################################################################
################ extract info for significant test
#### size of non-gap regions (ANTI_GAP_SIZE)
#### number of peaks assigned to RegDoms (PEAK_HIT_ALL)

################ Total size of non-gap regions
#### sum of (column 3 - column 2 + 1) over all lines of the gap-free region
#### file; printed with %.0f so a large total is never shown in exponent form

ANTI_GAP_SIZE=$( awk 'BEGIN {FS="\t"; total = 0} {total += ($3-$2+1)} END {printf "%.0f\n", total}' "${REF_DIR}/${ANTI_GAP}" )

# The total is used later as a divisor (genome fraction per ontology term).
if ! awk -v size="${ANTI_GAP_SIZE}" 'BEGIN {exit !(size+0 > 0)}'
then
  echo "ERROR: total size of non-gap regions is not positive ('${ANTI_GAP_SIZE}'), check ${REF_DIR}/${ANTI_GAP}" >&2
  exit 1
fi

echo " "
echo "Total size of non-gap genomic regions is : ${ANTI_GAP_SIZE} bp"



################ Total number of peaks assigned to gene regulatory domains
################ Number of associated peaks for each ontology term
################ total size of regulatory domains per ontology term
################ merge data significant test

TAB=$'\t'

# ANTI_GAP_SIZE is the divisor of the genome fraction computed below
if ! awk -v size="${ANTI_GAP_SIZE}" 'BEGIN {exit !(size+0 > 0)}'
then
  echo "ERROR: ANTI_GAP_SIZE is not a positive number ('${ANTI_GAP_SIZE}')" >&2
  exit 1
fi

for ((k=0;k<${#ANNO_TYPE[@]};k=k+1))
do

  # common prefix of the per-annotation-type files used in this iteration
  HIT_PREFIX="${ANNO_OUTDIR}/${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}"

  ####### Total number of peaks assigned to gene regulatory domains
  ####### distinct values of column 14 on lines whose column 12 is >= 1
  ####### (no-hit lines carry 0 in those columns)

  PEAK_HIT_ALL=$( awk 'BEGIN {FS=OFS="\t"} {if ($12 >=1) print $14}' \
  "${HIT_PREFIX}.txt" |
  sort |uniq | awk 'BEGIN {FS=OFS="\t"} END {print NR}' )

  echo " "
  echo "start to extract information for binomial test from file:"
  echo "${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"
  echo "The total number of peaks assigned to gene regulatory domains: ${PEAK_HIT_ALL}"


  ####### Number of associated peaks for each ontology term
  ####### exclude ontology terms with no associated peak
  ####### distinct (columns 7-9, peak) combinations are counted per
  ####### columns 7-9; the result is sorted on its 2nd column, the join key

  awk 'BEGIN {FS=OFS="\t"} {if (($8 !~ /^0$/) && ($14 !~ /^0$/)) print $7,$8,$9,$14}' \
  "${HIT_PREFIX}.txt" |
  sort -t "$TAB" -k2,2 -k4,4 |uniq |
  awk 'BEGIN {FS="\t";OFS="\t"} {a[$1"\t"$2"\t"$3]++}END{for(i in a){print i,a[i]}}' |
  sort -t "$TAB" -k2,2 > \
  "${HIT_PREFIX}.peakCount"

  ANNO_TERM_COUNT=$( awk 'BEGIN {FS=OFS="\t"} END {print NR}' \
  "${HIT_PREFIX}.peakCount" )

  echo "The total number of ${ANNO_TYPE[k]} with at least one peak per term: ${ANNO_TERM_COUNT}"


  ####### total size of regulatory domains per ontology term
  ####### exclude ontology terms with no associated peak

  echo " "
  echo "start to estimate total size for regulatory domains associated with each ontology term"
  echo "output file: ${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.regSize"

  # Domains are merged per (chromosome, term) by giving mergeBed a composite
  # "chromosome|term" name. The term is recovered as everything after the
  # FIRST "|", so underscores in chromosome names (chr17_random) or in term
  # ids no longer shift the columns, as they did when the key was joined with
  # "_" and split again with tr.
  # assumes chromosome names never contain "|" - TODO confirm
  awk 'BEGIN {FS=OFS="\t"} {if ($8 !~ /^0$/) print $1,$2,$3,$8}' \
  "${HIT_PREFIX}.txt" |
  sort -t "$TAB" -k4,4 -k1,1 -k2,2n -k3,3n | uniq |
  awk 'BEGIN {FS=OFS="\t"} {print $1"|"$4,$2,$3}' |
  "${BEDTOOLS}/mergeBed" -i stdin |
  awk 'BEGIN {FS="\t";OFS="\t"} {term=substr($1, index($1,"|")+1); a[term]+=($3-$2+1)}END{for(t in a){print t,a[t]}}' |
  sort -t "$TAB" -k1,1 > \
  "${HIT_PREFIX}.regSize"



  ####### merge data
  ## col1-3: ontology_term_id/type/description
  ## col4: total peaks assigned to RegDoms
  ## col5: No_peak for each ontology term
  ## col6-7: total size of RegDoms for each ontology term and its genome fraction

  echo " "
  echo "Start to merge data for binomial test"

  ANNO_IN="${HIT_PREFIX}.${ANNO_TERM_COUNT}term.tab"

  # join on column 2 of peakCount and column 1 of regSize; both files were
  # sorted on that key above
  join -t "$TAB" -1 2 -2 1 "${HIT_PREFIX}.peakCount" \
  "${HIT_PREFIX}.regSize" |
  awk -v peak="${PEAK_HIT_ALL}" -v size="${ANTI_GAP_SIZE}" 'BEGIN {FS=OFS="\t"} {printf("%s\t%s\t%s\t%d\t%d\t%d\t%.6f\n",$1,$2,$3,peak,$4,$5,($5/size))}' > \
  "${ANNO_IN}"


  ####### significant test

  echo " "
  echo "start test for ${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.${ANNO_TERM_COUNT}term.tab"

  ANNO_LOG="${ANNO_OUTDIR}/$( basename "${ANNO_R}" ).${ANNO_TYPE[k]}.log${i}.txt"

  # NOTE(review): Rscript is taken from PATH; R_PATH is read from the tool
  # info file at the top of the script but is not used here - confirm intent.
  Rscript --slave "${ANNO_R}" infile="${ANNO_IN}" method="${ANNO_METHOD}" >"${ANNO_LOG}"


  echo " "
  echo "Finish functional analysis for ${PEAK_FILE}, $(date)"

done

echo " "
echo "Finish functional analysis for ${PEAK_FILE}, $(date)"

################ end










