#!/bin/bash
# Tested on human and mouse data.
# 08/13/2013-08/20/2013, 10/14/2013
# Accepts one or more annotation source files (TEST_TERM / ANNO_TYPE)
# and one or more peak files (INFILE).
# Each list is processed with a for loop.

# Trace every command as it executes so each pipeline step shows up in the log.
set -x

################ tool_info (copied from ChIP-Seq v2)
# Directory that holds the BEDTools executables (subtractBed, closestBed, ...).
BEDTOOLS="/projects/bsi/bictools/apps/misc/BEDTools/2.16.2/bin/"
# Location of the R executable.
R_PATH="/usr/local/biotools/r/R-2.14.0/bin/R"

# human data
# Two-column table: chromosome name, chromosome size.
GENOME_TABLE="/data2/bsi/reference/chipseq/ref_files/genome_table.human.hg19.txt"
ORGANISM="human"

# mouse data
#GENOME_TABLE=/data2/bsi/reference/chipseq/ref_files/genome_table.mm10.txt
#ORGANISM=mouse


################ tool_info (new)
#### TODO: should REF_DIR point to /data2/bsi/reference/chipseq/ref_files instead?

REF_DIR="/data2/bsi/staff_analysis/m105265/packages/ChIPseq_v2_SOURCE"

# Regulatory-domain sizes in bp, in this order:
#   [0] U  - basal extension upstream of the TSS
#   [1] D  - basal extension downstream of the TSS
#   [2] UE - maximum additional upstream extension
#   [3] DE - maximum additional downstream extension
REG_ARGS=(
  5000
  1000
  100000
  5000
)

# Value handed to the R script as "method=".
ANNO_METHOD="qvalue"
#ANNO_R=${REF_DIR}"/"Anno.SigTest.R
ANNO_R="${REF_DIR}/anntation-sig-test.r"

# human data
GENE_TSS="${REF_DIR}/Hg19.great2.0.genes.v2.txt"
# Ontology annotation files, looked up inside REF_DIR.
TEST_TERM=("hs.GO_annotation.txt")
ANTI_GAP="${REF_DIR}/hg19.non_gap.bed"

# mouse data
#GENE_TSS=${REF_DIR}"/"Mm10.great2.0.genes.v2.txt
#TEST_TERM=(Mm.GO_annotation.txt) 
#ANTI_GAP=${REF_DIR}"/"mm10.non_gap.bed


################ run_info (copied from ChIP-Seq v2)
# Root of this analysis; the peak input and annotation output live below it.
WORK_DIR="/data2/bsi/staff_analysis/m105265/test_anno"
# Directory the peak files listed in INFILE are read from.
MACS2_OUTDIR="${WORK_DIR}/macs2out"


################ run_info (new)
# The regulatory-domain file is only built when this is exactly "Yes".
RUN_ANNO="Yes"
# Every file produced by this script is written here.
ANNO_OUTDIR="${WORK_DIR}/annotation"
# Annotation labels; entry k is paired with TEST_TERM[k].
ANNO_TYPE=("GO")

# human peak data
INFILE=(
  "NS-1-mi.FCD19BEACXX_L8_IGCCAAT.PE_macs2_peaks.encodePeak"
  "si-1-mi.FCD19BEACXX_L8_ICAGATC.PE_macs2_peaks.encodePeak"
)

# mouse peak data
#INFILE=(36b_H3K4me3.FCD1NDUACXX_L5_R1_ICAGATC.SE_macs2_peaks.encodePeak 36b_H3K4me3.FCD1NDUACXX_L5_R1_ICGATGT.SE_macs2_peaks.encodePeak)

echo " "
echo "Start functional analysis for ChIP-seq peaks, $(date)"


################ check outdir
# Create the annotation output directory, including any missing parent.
# Every later step writes into it, so stop immediately when it cannot be
# created instead of letting each pipeline fail on its own.
if [[ ! -d "${ANNO_OUTDIR}" ]]; then
  if ! mkdir -p -- "${ANNO_OUTDIR}"; then
    echo "ERROR: cannot create output directory: ${ANNO_OUTDIR}" >&2
    exit 1
  fi
fi



#############################################################################
################ create gene regulatory domains
################ this is the new code


# Base name of the gene TSS file without its ".txt" suffix; it prefixes the
# name of every regulatory-domain file derived from it.
FNAME=$(basename "${GENE_TSS}" .txt)

# Tag that encodes the four domain sizes, e.g. U5000_D1000_UE100000_DE5000.
# Built with plain parameter expansion: the previous awk call set OFS="\_",
# an escape sequence awk does not define (gawk warns about it and other awk
# implementations are free to keep the backslash in the output).
REG_VAR="U${REG_ARGS[0]}_D${REG_ARGS[1]}_UE${REG_ARGS[2]}_DE${REG_ARGS[3]}"

# Build the gene regulatory-domain file (<FNAME>.<REG_VAR>.RegDoms) unless a
# copy already exists in ANNO_OUTDIR.
#
# Steps (all intermediate files are kept in ANNO_OUTDIR as *.temp1 ... *.temp4):
#   temp1  basal domain of every TSS, clipped to [1, chromosome size]
#   temp2  genome intervals not covered by any basal domain
#   temp3  upstream/downstream extension window paired with its closest temp2 interval
#   temp4  basal domain with one side extended into that interval when allowed
#   final  upstream and downstream results pasted together and merged per gene
#
# NOTE(review): temp1 is ordered numerically by chromosome (sort -k7,7n) and then
# passed to `join`, which expects both inputs ordered on the join field in the
# current collation. That only pairs every line when GENOME_TABLE lists the same
# chromosomes in the same order -- confirm against the reference files.
regdoms_file="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.RegDoms"
basal_stem="${ANNO_OUTDIR}/${FNAME}.U${REG_ARGS[0]}_D${REG_ARGS[1]}"
basal_domains="${basal_stem}.RegDoms.temp1"
free_regions="${basal_stem}.RegDoms.temp2"
up_stem="${basal_stem}_UE${REG_ARGS[2]}.RegDoms"
down_stem="${basal_stem}_DE${REG_ARGS[3]}.RegDoms"

if [[ "${RUN_ANNO}" == "Yes" && ! -f "${regdoms_file}" ]]; then

  echo " "
  echo "start to generate gene regulatory domains file:"
  echo "${FNAME}.${REG_VAR}.RegDoms"

  # awk program used for both directions: clip the extension window (columns 7-8)
  # to [1, chromosome size]; the size is column 9, appended by `join`.
  clip_window_awk='BEGIN {FS=OFS="\t"} {if ($7 <=0) print $1,$2,$3,$4,$5,$6,"1",$8; else if ($8 >=$9) print $1,$2,$3,$4,$5,$6,$7,$9; else print $1,$2,$3,$4,$5,$6,$7,$8}'

  # awk program used for both directions: append the overlap (columns 12-13)
  # between the extension window (7-8) and the closest free interval (10-11),
  # or "0","0" when they do not overlap.
  overlap_awk='BEGIN {FS=OFS="\t"} {if (($8<=$10) || ($7>=$11)) print $0,"0","0"; else if (($7<=$10) && ($8<=$11)) print $0,$10,$8; else if (($7<=$10) && ($8>$11)) print $0,$10,$11; else if (($7>$10) && ($8<=$11)) print $0,$7,$8; else if (($7>$10) && ($8>$11)) print $0,$7,$11}'


  #### define basal regulatory domain for each TSS
  # Input columns used: 1 = chromosome, 2 = TSS position, 3 = strand, 4 = kept
  # as is (presumably the gene identifier -- verify against GENE_TSS).
  # "+" strand: [TSS-U, TSS+D]; "-" strand: [TSS-D, TSS+U].
  # Column 7 is a copy of the chromosome turned into a number (X=23, Y=24, M=25)
  # that only serves as sort key and is dropped again by `cut`.
  # Output columns: chromosome, start, end, TSS, column 4, strand.
  awk -v U="${REG_ARGS[0]}" -v D="${REG_ARGS[1]}" 'BEGIN {FS=OFS="\t"} {if ($3 ~ /\+/) print $0,($2-U),($2+D),$1; else if ($3 ~ /\-/) print $0,($2-D),($2+U),$1}' \
    "${GENE_TSS}" \
    | awk 'BEGIN {FS="\t"; OFS="\t"} {gsub(/chrX/, "23",$7); gsub(/chrY/, "24",$7); gsub(/chrM/, "25",$7); gsub(/chr/, "",$7); print $0}' \
    | sort -k7,7n -k2,2n -k4,4 \
    | cut -f 1-6 \
    | join -1 1 -2 1 -t $'\t' - "${GENOME_TABLE}" \
    | awk 'BEGIN {FS=OFS="\t"} {if ($5 <=0) print $1,"1",$6,$2,$4,$3; else if ($6 >=$7) print $1,$5,$7,$2,$4,$3; else print $1,$5,$6,$2,$4,$3}' \
    > "${basal_domains}"


  #### regions for extending basal regulatory domain
  # Whole chromosomes (1 .. size) minus every basal domain.
  awk '{print $1"\t1\t"$2}' "${GENOME_TABLE}" \
    | "${BEDTOOLS}/subtractBed" -a stdin -b "${basal_domains}" \
    > "${free_regions}"


  #### upstream basal regulatory domain
  # Columns 2-3 become the upstream half of the basal domain (domain edge to
  # TSS); columns 7-8 are the window reaching UE bp beyond that edge. closestBed
  # then appends the nearest free interval as columns 9-11.
  awk -v UE="${REG_ARGS[2]}" 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$2,$4,$4,$5,$6,($2-UE),$2; else if ($6 ~ /\-/) print $1,$4,$3,$4,$5,$6,$3,($3+UE)}' \
    "${basal_domains}" \
    | join -1 1 -2 1 -t $'\t' - "${GENOME_TABLE}" \
    | awk "${clip_window_awk}" \
    | "${BEDTOOLS}/closestBed" -a - -b "${free_regions}" \
    > "${up_stem}.temp3"


  #### extended upstream basal regulatory domain
  # Column 14 is the offset between the window edge that touches the basal
  # domain and the matching edge of the overlap. The domain is extended
  # (output columns 9-10) only when an overlap exists and that offset is
  # smaller than 100 in absolute value; otherwise columns 9-10 repeat 2-3.
  awk "${overlap_awk}" "${up_stem}.temp3" \
    | awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $0,($8-$13),($12-$7); else print $0,($7-$12),($8-$13)}' \
    | awk 'BEGIN {FS=OFS="\t"} {if (($12 ==0) || (($12 >0) && ($14 >=100)) || (($12 >0) && ($14 <=-100))) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$3; else if (($6 ~ /\+/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$12,$3; else if (($6 ~ /\-/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$13}' \
    > "${up_stem}.temp4"


  #### downstream basal regulatory domain
  # Mirror image of the upstream step, using DE and the other domain edge.
  awk -v DE="${REG_ARGS[3]}" 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$4,$3,$4,$5,$6,$3,($3+DE); else if ($6 ~ /\-/) print $1,$2,$4,$4,$5,$6,($2-DE),$2}' \
    "${basal_domains}" \
    | join -1 1 -2 1 -t $'\t' - "${GENOME_TABLE}" \
    | awk "${clip_window_awk}" \
    | "${BEDTOOLS}/closestBed" -a - -b "${free_regions}" \
    > "${down_stem}.temp3"


  #### extended downstream basal regulatory domain
  awk "${overlap_awk}" "${down_stem}.temp3" \
    | awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $0,($7-$12),($8-$13); else print $0,($8-$13),($12-$7)}' \
    | awk 'BEGIN {FS=OFS="\t"} {if (($12 ==0) || (($12 >0) && ($14 >=100)) || (($12 >0) && ($14 <=-100))) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$3; else if (($6 ~ /\+/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$2,$13; else if (($6 ~ /\-/) && ($12 >0) && ($14 >-100) && ($14 <100)) print $1,$2,$3,$4,$5,$6,$7,$8,$12,$3}' \
    > "${down_stem}.temp4"


  #### final extended regulatory domain
  # Both temp4 files hold the genes in the same order, so they are pasted side
  # by side (upstream = columns 1-10, downstream = columns 11-20).
  # Output columns: chromosome, start, end, column 5, TSS, strand.
  paste "${up_stem}.temp4" "${down_stem}.temp4" \
    | awk 'BEGIN {FS=OFS="\t"} {if ($6 ~ /\+/) print $1,$9,$20,$5,$4,$6; else if ($6 ~ /\-/) print $1,$19,$10,$5,$4,$6}' \
    > "${regdoms_file}"

  # An empty result means one of the steps above failed. Remove it so the next
  # run rebuilds the file instead of reporting it as "already existed".
  if [[ ! -s "${regdoms_file}" ]]; then
    echo "ERROR: no regulatory domains were written to ${regdoms_file}" >&2
    rm -f -- "${regdoms_file}"
    exit 1
  fi

elif [[ "${RUN_ANNO}" == "Yes" && -f "${regdoms_file}" ]]; then
  echo "File ${FNAME}.${REG_VAR}.RegDoms already existed"

fi



#############################################################################
################ assign ontology terms to regulatory domains
#### output: ${FNAME}.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt

# For every annotation type, attach its ontology terms to the regulatory
# domains with addCols.pl.txt, unless the annotated file already exists.
# ANNO_TYPE[k] names the output, TEST_TERM[k] is the matching term file.
for ((k = 0; k < ${#ANNO_TYPE[@]}; k = k + 1)); do

  term_regdoms="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

  if [[ -f "${term_regdoms}" ]]; then
    echo "file ${FNAME}.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt already existed"
    continue
  fi

  # Announce the step before it runs so the log reads in execution order.
  echo " "
  echo "start to assign ${ANNO_TYPE[k]} terms to gene regulatory domains, output file:"
  echo "${FNAME}.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

  # -ic / -rc are passed through unchanged (presumably the key column of the
  # input and of the reference file -- verify against addCols.pl.txt).
  if ! perl "${REF_DIR}/addCols.pl.txt" -i "${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.RegDoms" \
    -ic 3 -r "${REF_DIR}/${TEST_TERM[k]}" -rc 3 -o "${term_regdoms}"; then
    echo "WARNING: addCols.pl.txt returned an error while writing ${term_regdoms}" >&2
  fi

done



#############################################################################
################ assign peaks to gene ontology/gene regulatory domains

################ start for loop over peak files
####### This for loop ends at the end of this script

for ((i=0;i<${#INFILE[@]};i=i+1))
do


echo " "
echo "Use the following input files:"
echo "peak file: ${INFILE[i]}"
echo "Annotated ontology term file: ${TEST_TERM[*]}"
echo "Annotated gene TSS file: $(basename "${GENE_TSS}")"
echo "Gap-free region file: $(basename "${ANTI_GAP}")"



################ define the peak center
# Reduce every peak to a short window around its center and write it as a
# 4-column BED file (<PEAK_FILE>.4col.bed) in ANNO_OUTDIR.
peak_path="${MACS2_OUTDIR}/${INFILE[i]}"
PEAK_FILE=$(basename "${peak_path}")

# Without this check a missing file would still yield an empty .4col.bed and
# every later step would run on it.
if [[ ! -r "${peak_path}" ]]; then
  echo "ERROR: peak file not found or not readable, skipped: ${peak_path}" >&2
  continue
fi

# Number of tab-separated columns on the first line (0 for an empty file).
NUM_COLUMN=$(head -n 1 "${peak_path}" | awk 'BEGIN {FS="\t"} {n = NF} END {print n + 0}')

echo " "
echo "start to reformat peak file into 4-column bed file:"
echo "${INFILE[i]}"

if ((NUM_COLUMN == 10)); then

  # 10 columns: column 10 is the summit offset relative to the start (column 2).
  echo "Original peaks were in encodePeak format from MACS2 output"
  awk 'BEGIN {FS=OFS="\t"} {print $1,($2+$10-4),($2+$10+5),"peak_"NR}' "${peak_path}" \
    > "${ANNO_OUTDIR}/${PEAK_FILE}.4col.bed"

elif ((NUM_COLUMN >= 3 && NUM_COLUMN < 10)); then

  # Plain BED: use the midpoint of start and end as the center.
  echo "peaks are in ${NUM_COLUMN}_column bed format"
  awk 'BEGIN {FS=OFS="\t"} {mid = int(($2+$3)/2); print $1,(mid-4),(mid+5),"peak_"NR}' "${peak_path}" \
    > "${ANNO_OUTDIR}/${PEAK_FILE}.4col.bed"

else

  # Fewer than 3 or more than 10 columns: neither layout applies. Skip the file
  # rather than continue with a missing or stale .4col.bed.
  echo "ERROR: unsupported peak format (${NUM_COLUMN} columns), skipped: ${peak_path}" >&2
  continue

fi



################ assign peaks to gene regulatory domains

# For every annotation type, list each term-annotated regulatory domain next
# to the peaks that fall into it; domains without a peak are kept as well.
for ((k = 0; k < ${#ANNO_TYPE[@]}; k = k + 1)); do

  term_regdoms="${ANNO_OUTDIR}/${FNAME}.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"
  peak_bed="${ANNO_OUTDIR}/${PEAK_FILE}.4col.bed"

  echo " "
  echo "start to assign peaks to gene regulatory domains and ${ANNO_TYPE[k]}, output file:"
  echo "${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

  # Pass 1: domains hit by no peak. -c appends the number of overlapping peaks,
  # which is tested as column 11 (i.e. the annotated domain file is expected to
  # have 10 columns). Three "0" placeholders are appended in its place so these
  # rows line up with the rows produced by pass 2.
  "${BEDTOOLS}/intersectBed" -a "${term_regdoms}" -b "${peak_bed}" -wa -c \
    | awk 'BEGIN {FS=OFS="\t"} {if ($11 == 0) print $0,"0","0","0"}' \
    > "${peak_bed}.nohit"

  # Pass 2: one row per (domain, overlapping peak) pair, followed by the
  # no-hit rows, sorted by chromosome, start, end and column 4.
  "${BEDTOOLS}/intersectBed" -a "${term_regdoms}" -b "${peak_bed}" -wa -wb \
    | cat - "${peak_bed}.nohit" \
    | sort -k1,1 -k2,2n -k3,3n -k4,4 \
    > "${ANNO_OUTDIR}/${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"

done



#############################################################################
################ extract info for significant test
#### size of non-gap regions (ANTI_GAP_SIZE)
#### number of peaks assigned to RegDoms (PEAK_HIT_ALL)

################ Total size of non-gap regions

# Sum of (end - start + 1) over every interval of the gap-free region file.
# printf "%.0f" keeps the total in plain digits: a genome-sized sum is larger
# than 2^31 and some awk implementations print such values in exponent
# notation when plain `print` is used.
# NOTE(review): the "+1" counts both interval ends; BED intervals are normally
# half-open -- confirm the coordinate convention of ANTI_GAP.
ANTI_GAP_SIZE=$(awk 'BEGIN {FS="\t"} {total += ($3-$2+1)} END {printf "%.0f\n", total}' "${ANTI_GAP}")

echo " "
echo "Total size of non-gap genomic regions is : ${ANTI_GAP_SIZE} bp"



################ Total number of peaks assigned to gene regulatory domains
################ Number of associated peaks for each ontology term
################ total size of regulatory domains per ontology term
################ merge data for significant test

# For every annotation type: collect the numbers needed by the significance
# test from <PEAK_FILE>.vs.<REG_VAR>.RegDoms.<type>.txt, merge them into one
# table and run the R script on it.
# Columns of that input file as used below: 1-3 domain coordinates, 7-9
# ontology term columns (8 is the grouping/join key), 12 peak start, 14 peak name.
for ((k = 0; k < ${#ANNO_TYPE[@]}; k = k + 1)); do

  # Common path stem of all files read and written in this iteration.
  result_stem="${ANNO_OUTDIR}/${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}"

  ####### Total number of peaks assigned to gene regulatory domains
  # Distinct peak names over all rows that carry a peak (column 12 >= 1).

  PEAK_HIT_ALL=$(awk 'BEGIN {FS=OFS="\t"} {if ($12 >=1) print $14}' "${result_stem}.txt" \
    | sort -u \
    | awk 'END {print NR}')

  echo " "
  echo "start to extract information for binomial test from file:"
  echo "${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.txt"
  echo "The total number of peaks assigned to gene regulatory domains: ${PEAK_HIT_ALL}"


  ####### Number of associated peaks for each ontology term
  ####### exclude ontology terms with no associated peak
  # Each (term, peak) pair is counted once, then pairs are tallied per term.
  # Output columns: 7, 8, 9, number of distinct peaks; sorted on column 8.

  TAB=$'\t'
  awk 'BEGIN {FS=OFS="\t"} {if (($8 !~ /^0$/) && ($14 !~ /^0$/)) print $7,$8,$9,$14}' "${result_stem}.txt" \
    | sort -t "$TAB" -k2,2 -k4,4 \
    | uniq \
    | awk 'BEGIN {FS="\t";OFS="\t"} {print $1,$2,$3}' \
    | awk 'BEGIN {FS="\t";OFS="\t"} {a[$1"\t"$2"\t"$3]++}END{for(i in a){print i,a[i]}}' \
    | sort -t "$TAB" -k2,2 \
    > "${result_stem}.peakCount"

  ANNO_TERM_COUNT=$(awk 'END {print NR}' "${result_stem}.peakCount")

  echo "The total number of ${ANNO_TYPE[k]} with at least one peak per term: ${ANNO_TERM_COUNT}"


  ####### total size of regulatory domains per ontology term
  ####### exclude ontology terms with no associated peak

  echo " "
  echo "start to estimate total size for regulatory domains associated with each ontology term"
  echo "output file: ${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.regSize"

  # Domains are merged per (chromosome, term) by handing mergeBed the combined
  # key "<chromosome>_<term>" as chromosome name. Afterwards the term is taken
  # as the text behind the LAST underscore, so chromosome names that contain
  # underscores no longer shift the columns (terms must not contain "_").
  # mergeBed is called from BEDTOOLS like the other BEDTools programs.
  awk 'BEGIN {FS=OFS="\t"} {if ($8 !~ /^0$/) print $1,$2,$3,$8}' "${result_stem}.txt" \
    | sort -k4,4 -k1,1 -k2,2n -k3,3n \
    | uniq \
    | awk 'BEGIN {FS=OFS="\t"} {print $1"_"$4,$2,$3}' \
    | "${BEDTOOLS}/mergeBed" -i stdin \
    | awk 'BEGIN {FS=OFS="\t"} {term = $1; sub(/^.*_/, "", term); a[term] += ($3-$2+1)} END {for (t in a) {print t,a[t]}}' \
    | sort -k1,1 \
    > "${result_stem}.regSize"



  ####### merge data
  ## col1-3: ontology_term_id/type/description
  ## col4: total peaks assigned to RegDoms
  ## col5: No_peak for each ontology term
  ## col6-7: total size of RegDoms for each ontology term and its genome fraction

  echo " "
  echo "Start to merge data for binomial test"

  # Join on the term key (column 2 of .peakCount, column 1 of .regSize).
  join -t "$TAB" -1 2 -2 1 "${result_stem}.peakCount" "${result_stem}.regSize" \
    | awk -v peak="${PEAK_HIT_ALL}" -v size="${ANTI_GAP_SIZE}" 'BEGIN {FS=OFS="\t"} {printf("%s\t%s\t%s\t%d\t%d\t%d\t%.6f\n",$1,$2,$3,peak,$4,$5,($5/size))}' \
    > "${result_stem}.${ANNO_TERM_COUNT}term.tab"



  ####### significant test

  echo " "
  echo "start test for ${PEAK_FILE}.vs.${REG_VAR}.RegDoms.${ANNO_TYPE[k]}.${ANNO_TERM_COUNT}term.tab"

  ANNO_IN="${result_stem}.${ANNO_TERM_COUNT}term.tab"
  # One log per annotation type and peak file (i is the peak-file index).
  ANNO_LOG="${ANNO_OUTDIR}/$(basename "${ANNO_R}").${ANNO_TYPE[k]}.log${i}.txt"

  # NOTE(review): Rscript is resolved through PATH; R_PATH from the tool
  # settings is not used here -- confirm which R installation is intended.
  Rscript --slave "${ANNO_R}" infile="${ANNO_IN}" method="${ANNO_METHOD}" > "${ANNO_LOG}"

done

done


# Final log line. It runs once, after the loop over all peak files, so it
# reports every processed file instead of only the last PEAK_FILE value.
echo " "
echo "Finish functional analysis for ${#INFILE[@]} peak file(s): ${INFILE[*]}, $(date)"


################ end of script (the for loop over peak files is closed by the last "done" above)

