#!/bin/bash
################################################
######################## run_MEME_qsub.sh
# run_MEME.sh
# 12/31/2012, Huihuang Yan
# All MEME_PEAK_CUTOFF settings were tested, 1/1/2013, 06/17/2013

# Exactly one positional argument is expected: the run configuration file.
# Anything else prints the usage line and stops with status 1.
if [[ $# -ne 1 ]]; then
    printf '%s\n' "usage: meme_annotation.sh <config file>"
    exit 1
fi

set -x

######################## ARGS

#######################################
# Print the value stored under a key in a KEY=VALUE settings file.
# Everything after the first '=' is returned, so a value that itself
# contains '=' is not truncated.
# Arguments: $1 - key name (matched as a whole word at the start of a line)
#            $2 - path of the settings file
# Outputs:   the matching value(s) on stdout; nothing when the key is absent
#######################################
get_config_value() {
    grep -w "^$1" -- "$2" | cut -d '=' -f2-
}

########### copy from original run_info
# The run config ($1) names a second settings file (TOOL_INFO) that holds the
# tool locations and tool-level parameters read below.
tool_info=$(get_config_value TOOL_INFO "$1")

SEQ_TYPE=$(get_config_value SEQ_TYPE "$1")
PEAK_CALLER=$(get_config_value PEAK_CALLER "$1")
IDR_CUTOFF=$(get_config_value IDR_CUTOFF "$tool_info")

WORK_DIR=$(get_config_value WORK_DIR "$1")
MACS2_OUTDIR=$WORK_DIR/macs2out
IDR_OUTDIR=$WORK_DIR/idrout

BWA_REF=$(get_config_value BWA_REF "$tool_info")
BEDTOOLS=$(get_config_value BEDTOOLS "$tool_info")

echo "seq_type: $SEQ_TYPE; PEAK_CALLER: $PEAK_CALLER; IDR_CUTOFF: $IDR_CUTOFF"
echo "WORK_DIR: $WORK_DIR"


########### new args
MEME_OUTDIR=$WORK_DIR/memeout

MEME=$(get_config_value MEME_PATH "$tool_info")

RUN_MEME=$(get_config_value RUN_MEME "$1")
MEME_ARGS=$(get_config_value MEME_ARGS "$1")
MEME_PEAK_SIZE=$(get_config_value MEME_PEAK_SIZE "$1")
MEME_PEAK_CUTOFF=$(get_config_value MEME_PEAK_CUTOFF "$1")

echo "MEME_OUTDIR: $MEME_OUTDIR"
echo "MEME_ARGS: $MEME_ARGS"
echo "MEME_PEAK_SIZE: $MEME_PEAK_SIZE; MEME_PEAK_CUTOFF: $MEME_PEAK_CUTOFF"

# SGE_TASK_ID picks which sorted peak file this array task works on, so it has
# to be a positive integer. Grid Engine sets it to the literal string
# "undefined" for non-array jobs, which an emptiness test alone lets through.
if [[ -z $SGE_TASK_ID ]]; then
    echo "BAD ARRAY INDEX, no SGE_TASK_ID"
    exit 1
elif [[ ! $SGE_TASK_ID =~ ^[1-9][0-9]*$ ]]; then
    echo "BAD ARRAY INDEX, SGE_TASK_ID is not a positive integer: $SGE_TASK_ID"
    exit 1
fi


######################## check outdir

# Create the MEME output directory when MEME is enabled. The path is quoted so
# it survives whitespace, and a failed mkdir stops the job here instead of
# letting every later step fail on a missing directory.
if [[ $RUN_MEME = "Yes" && ! -d ${MEME_OUTDIR} ]]; then
    if ! mkdir -p -- "${MEME_OUTDIR}"; then
        echo "Error: cannot create MEME output directory ${MEME_OUTDIR}" >&2
        exit 1
    fi
fi


######################## sort peak files (*_macs2_peaks.encodePeak)

#######################################
# Sort the peak records read from stdin by column 8 (numeric, descending)
# and publish the result as DEST.
# The data is written to a task-private temporary name first and renamed
# afterwards: every array task runs this step for every peak file, so writing
# straight into DEST would let another task read a half-written file. The
# temporary name does not end in ".pvalue.sorted", so it is never picked up
# as a finished file.
# Arguments: $1 - destination path
# Returns:   non-zero when sorting or the final rename fails
#######################################
write_pvalue_sorted() {
    local dest=$1
    local tmp="${dest}.tmp.${SGE_TASK_ID:-0}.$$"

    if sort -k8,8nr > "$tmp"; then
        mv -f -- "$tmp" "$dest"
    else
        rm -f -- "$tmp"
        return 1
    fi
}

echo " "
echo "Start MEME motif finding, $(date)"
echo "Start to sort macs2 encodePeak files"

if [[ $RUN_MEME = "Yes" && $PEAK_CALLER = "macs2noidr" ]]; then

    # Peaks straight from macs2: sort each file as-is.
    for files in "${MACS2_OUTDIR}"/*."${SEQ_TYPE}"_macs2_peaks.encodePeak; do
        # An unmatched glob is left as the literal pattern; skip it.
        if [[ ! -e $files ]]; then
            echo "Warning: no peak file matches $files" >&2
            continue
        fi

        echo "$files"
        if ! write_pvalue_sorted "${MEME_OUTDIR}/$(basename "$files").pvalue.sorted" < "$files"; then
            echo "Error: failed to sort $files" >&2
            exit 1
        fi
    done

elif [[ $RUN_MEME = "Yes" && $PEAK_CALLER = "macs2idr" ]]; then

    # IDR peaks: keep only records whose column 4 is <= IDR_CUTOFF, then sort.
    # The output is named like a plain macs2 peak file (no "_idr") so the
    # later steps handle both peak callers the same way.
    for files in "${IDR_OUTDIR}"/*."${SEQ_TYPE}"_macs2_idr_peaks.encodePeak; do
        if [[ ! -e $files ]]; then
            echo "Warning: no peak file matches $files" >&2
            continue
        fi

        echo "$files"
        FNAME=$(basename "$files" ".${SEQ_TYPE}_macs2_idr_peaks.encodePeak")
        if ! awk -v idr="${IDR_CUTOFF}" 'BEGIN {FS="\t"; OFS="\t"} {if ($4 <= idr) print $0}' "$files" \
            | write_pvalue_sorted "${MEME_OUTDIR}/${FNAME}.${SEQ_TYPE}_macs2_peaks.encodePeak.pvalue.sorted"; then
            echo "Error: failed to filter and sort $files" >&2
            exit 1
        fi
    done
fi


######################## parse a subset of peaks based on cutoff
######################## do MEME motif finding

#######################################
# Print how many top-ranked peaks to use for a MEME_PEAK_CUTOFF setting.
#   cutoff > 1      : an absolute number of peaks (any fraction is dropped)
#   0 < cutoff <= 1 : a fraction of TOTAL, rounded down
# The decision is made in awk because the cutoff may be fractional and bash
# integer tests abort on values such as 62.5.
# Arguments: $1 - cutoff, $2 - total number of peaks in the file
# Outputs:   the peak count on stdout
# Returns:   non-zero when the cutoff is not a positive number
#######################################
resolve_number_peak() {
    awk -v cutoff="$1" -v total="$2" 'BEGIN {
        c = cutoff + 0
        if (c <= 0) exit 1
        if (c > 1) printf "%d\n", int(c)
        else       printf "%d\n", int(total * c)
    }'
}

#######################################
# Turn tab-separated peak records on stdin into 3-column BED windows.
# The centre is column 2 + column 10 - 1; the window spans SPAN below it to
# SPAN - 1 above it, and a start that would be <= 0 is written as 1.
# Arguments: $1 - SPAN, half of the requested window size
# Outputs:   chrom, start, end (tab-separated) on stdout
#######################################
summit_window_bed() {
    awk -v span="$1" 'BEGIN {FS="\t"; OFS="\t"}
    {
        summit = $2 + $10 - 1
        start  = summit - span
        stop   = summit - 1 + span
        if (start <= 0) start = 1
        print $1, start, stop
    }'
}

if [[ $RUN_MEME = "Yes" ]]; then

    # Collect the sorted peak files with a glob (expanded in sorted order)
    # rather than by parsing ls output.
    PEAK_FILE=( "${MEME_OUTDIR}"/*.pvalue.sorted )
    if [[ ! -e ${PEAK_FILE[0]} ]]; then
        echo "Error: no *.pvalue.sorted files found in ${MEME_OUTDIR}" >&2
        exit 1
    fi

    echo "List of sorted peak files: ${PEAK_FILE[*]}"

    # SGE task ids are 1-based, the array is 0-based. Reject ids that do not
    # select a file; an empty file name would make the steps below read stdin.
    if [[ ! $SGE_TASK_ID =~ ^[1-9][0-9]*$ ]] || (( SGE_TASK_ID > ${#PEAK_FILE[@]} )); then
        echo "Error: SGE_TASK_ID '$SGE_TASK_ID' does not select one of the ${#PEAK_FILE[@]} sorted peak files" >&2
        exit 1
    fi
    i=$(( SGE_TASK_ID - 1 ))

    HALF_SIZE=$( awk -v e6="$MEME_PEAK_SIZE" 'BEGIN {print int(e6/2)}' )
    TOTAL_PEAK=$( awk 'END {print NR}' "${PEAK_FILE[i]}" )

    echo "There are a total of $TOTAL_PEAK peaks from $( basename "${PEAK_FILE[i]}" .pvalue.sorted)"
    echo "$HALF_SIZE bp will be extracted fro both sides around peak summit"
    echo "peak cutoff is: $MEME_PEAK_CUTOFF"

    if ! NUMBER_PEAK=$( resolve_number_peak "$MEME_PEAK_CUTOFF" "$TOTAL_PEAK" ); then
        echo "Error: MEME_PEAK_CUTOFF must be a positive number, got '$MEME_PEAK_CUTOFF'" >&2
        exit 1
    fi

    if (( NUMBER_PEAK == 0 )); then
        # Nothing to search: stop instead of running MEME on an empty FASTA.
        echo "Warning: no peak was selected, please increase the ratio of selected peaks/total peaks"
        exit 1
    elif (( NUMBER_PEAK > TOTAL_PEAK )); then
        # NUMBER_PEAK is left as requested so the output names stay the same;
        # head simply returns every line of the file.
        echo "Warning: the selected number of peaks are more than the actual number of peaks, will use all peaks"
    else
        echo "number of selected peaks for motif finding: $NUMBER_PEAK"
    fi

    FILENAME=$( basename "${PEAK_FILE[i]}" ".${SEQ_TYPE}_macs2_peaks.encodePeak.pvalue.sorted" )
    OUTDIR=${FILENAME}_${SEQ_TYPE}_${NUMBER_PEAK}peak_meme
    TEMP_OUT=${MEME_OUTDIR}/${OUTDIR}
    BED_FILE=${MEME_OUTDIR}/${FILENAME}.${SEQ_TYPE}.${NUMBER_PEAK}peak.bed
    FASTA_FILE=${MEME_OUTDIR}/${FILENAME}.${SEQ_TYPE}.${NUMBER_PEAK}peak.${MEME_PEAK_SIZE}bp.fasta

    echo " "
    echo "Top $NUMBER_PEAK peaks will be used to identify motifs from $( basename "${PEAK_FILE[i]}" .pvalue.sorted) using MEME"
    echo " "

    # Windows around the top-ranked peaks -> BED -> FASTA.
    head -n "$NUMBER_PEAK" "${PEAK_FILE[i]}" | summit_window_bed "$HALF_SIZE" > "$BED_FILE"

    if ! "${BEDTOOLS}/bedtools" getfasta -fi "$BWA_REF" -bed "$BED_FILE" -fo "$FASTA_FILE"; then
        echo "Error: bedtools getfasta failed for $BED_FILE" >&2
        exit 1
    fi

    # MEME and MEME_ARGS are expanded unquoted on purpose: MEME_ARGS holds
    # several options in one string and has to be split into words.
    # shellcheck disable=SC2206
    meme_cmd=( ${MEME} "$FASTA_FILE" ${MEME_ARGS} -oc "$TEMP_OUT" )
    "${meme_cmd[@]}"
    meme_status=$?

    echo "${meme_cmd[*]}"

    if (( meme_status != 0 )); then
        echo "Error: MEME exited with status $meme_status for ${PEAK_FILE[i]}" >&2
        exit 1
    fi

    # Prefix the result files with the output directory name. Files that
    # already carry the prefix (left by an earlier run) are not renamed again.
    for meme_file in "$TEMP_OUT"/*.gif "$TEMP_OUT"/*.eps "$TEMP_OUT"/*.html \
                     "$TEMP_OUT"/*.xml "$TEMP_OUT"/*.txt; do
        [[ -e $meme_file ]] || continue
        meme_base=${meme_file##*/}
        [[ $meme_base == "${OUTDIR}_"* ]] && continue
        mv -- "$meme_file" "${TEMP_OUT}/${OUTDIR}_${meme_base}"
    done

    echo "meme is done for ${PEAK_FILE[i]}, $(date)"

fi

