#!/bin/bash
# Usage: /projects/bsi/bictools/scripts/ChIPSeq/tags/v2.0/chipseq.sh -r /home/m110443/projects/chipseq2_svn/CONFIG/run_info.txt [--keepintermediatefiles ]
# Usage note: Make sure mayobiotools -> python v 2.7.3 is on 
# Usage note: Make sure mayobiotools -> R-2.14.0 is on 

# Trace every command as it is executed so the workflow log shows each step.
set -o xtrace

# Log the start time of the run. The substitution is left unquoted on purpose,
# so the timestamp is printed with single spaces between its fields.
echo $(date)

# Print the command-line help for chipseq.sh to stdout.
# Takes no arguments and does not exit; callers choose the exit status.
# Every option accepted by the getopt specification is listed, including
# -h/--help and the alternative spellings of --skip-align.
usage() {
	# Quoted delimiter: the help text is literal, nothing in it is expanded.
	cat <<'EOF'

Usage: chipseq.sh -r /full/path/to/run_info.txt [--keepintermediatefiles] [--skip-align]
Run ChIP-Seq workflow using specified run_info.txt file.

Mandatory arguments to long options are mandatory for short options too.

  -h, --help                  Display this help text and exit.
  --keepintermediatefiles     Keep intermediate files instead of cleaning up after
                                workflow execution.
  -r, --runinfo               Specify the file containing the options that need to be run.
  --skip-align                Skip the alignment step (also accepted: --skipalign,
                                --skipalignment).

Report problems with chipseq.sh using FogBugz (Mayo internal site: http://bsu-bugs)
EOF
}

# Normalise the command line with getopt. getopt reports unknown options or a
# missing option argument through a non-zero exit status; in that case show the
# help text and stop with a failure status instead of continuing.
if ! TEMP=$(getopt -o hr: --long help,runinfo:,keepintermediatefiles,skip-align,skipalign,skipalignment -n 'chipseq.sh' -- "$@")
then
	usage
	exit 1
fi

# Replace the positional parameters with getopt's quoted, reordered output.
eval set -- "$TEMP"

# Defaults: no run_info yet, clean up intermediate files, run the alignment.
RUN_INFO=
CLEAN_UP_FILES=true
PERFORM_ALIGNMENT=true

while true; do
	case "$1" in
		-r | --runinfo ) RUN_INFO=$2; shift 2 ;;
		--keepintermediatefiles ) CLEAN_UP_FILES=false; shift ;;
		-h | --help ) usage; exit 0 ;;
		--skip-align | --skipalign | --skipalignment ) PERFORM_ALIGNMENT=false; shift ;;
		-- ) break ;;
		# Anything else is unexpected here: report it as a failure.
		* ) usage; exit 1 ;;
	esac
done

# perform initial check to ensure run info file exists
# (quoted test: a value containing spaces no longer breaks the check)
if [[ -z "$RUN_INFO" ]]
then
	usage
	echo -e "\n\nERROR : run_info file was not specified"
	exit 1
fi

# The run_info file has to be given as a complete (absolute) path. Testing for
# a leading "/" also rejects relative paths such as "sub/run_info.txt", which a
# comparison of the directory part against "." alone lets through.
dir_info=$(dirname "$RUN_INFO")
if [[ "$RUN_INFO" != /* ]]
then
	echo -e "ERROR : run_info=$RUN_INFO should be specified as a complete path\n"
	exit 1
fi

# -s: the file must exist and must not be empty.
if [[ ! -s "$RUN_INFO" ]]
then
	echo -e "ERROR : run_info=$RUN_INFO does not exist\n"
	exit 1
fi

# Gather variables from configuration files
# dos2unix rewrites the run_info file in place with Unix line endings.
dos2unix "$RUN_INFO"

# Check TOOL_INFO
# The value is the text between the first and second "=" of the TOOL_INFO line.
TOOL_INFO=$(grep -w '^TOOL_INFO' "$RUN_INFO" | cut -d '=' -f2)

# An empty value fails this test as well, because "" never exists.
if [[ ! -e "$TOOL_INFO" ]]
then
	echo "The tool_info file specified at location $TOOL_INFO does not exist. Please verify that your run_info file contains the correct tool_info file location."
	exit 1
fi

SOURCE_DIR=$(grep -w '^CHIPSEQ_DIR' "$TOOL_INFO" | cut -d '=' -f2)

##### Check/Validate All Parameters #####
# Without this guard a missing or wrong CHIPSEQ_DIR makes perl fail on stderr,
# VARCHECK stays empty and the configuration would be treated as valid.
if [[ ! -f "$SOURCE_DIR/checkCONFIG.pl" ]]
then
	echo "ERROR : checkCONFIG.pl was not found in CHIPSEQ_DIR=$SOURCE_DIR. Please verify the CHIPSEQ_DIR entry in $TOOL_INFO."
	exit 1
fi

# Any text printed on stdout by checkCONFIG.pl is treated as a validation failure.
VARCHECK=$(perl "$SOURCE_DIR/checkCONFIG.pl" "$RUN_INFO")
if [[ -n "$VARCHECK" ]]
then
	# Quoted so that a multi-line report keeps its line breaks.
	echo "$VARCHECK"
	exit 1
fi

# Parse run_info file

# run_info_value KEY FILE
# Print the second "="-separated field of every line in FILE that starts with
# the whole word KEY. Prints nothing and returns 1 when FILE is not readable.
run_info_value() {
	[[ -r "$2" ]] || return 1
	grep -w "^$1" "$2" | cut -d '=' -f2
}

# run_info_values PREFIX FILE
# Print, one per line, the second "="-separated field of every line in FILE
# whose first field starts with PREFIX. Prints nothing and returns 1 when FILE
# is not readable.
run_info_values() {
	[[ -r "$2" ]] || return 1
	awk -F '=' -v prefix="$1" 'index($1, prefix) == 1 { print $2 }' "$2"
}

# load_run_info FILE
# Set the global workflow settings from the run_info file FILE.
# END1_SEQ, IP_FILE and INPUT_FILE become arrays with one element per
# whitespace-separated word of the matching values (so paths must not contain
# whitespace); the RUN_* switches are lower-cased so that later comparisons
# against "yes" are case insensitive.
load_run_info() {
	local run_info=$1

	PEAK_CALLER=$(run_info_value PEAK_CALLER "$run_info")
	WORK_DIR=$(run_info_value WORK_DIR "$run_info")
	RUNID=$(run_info_value RUNID "$run_info")
	SAMPLENAME=$(run_info_value SAMPLENAME "$run_info")
	SEQ_TYPE=$(run_info_value SEQ_TYPE "$run_info")
	END1_SEQ=( $(run_info_values END1_SEQ "$run_info") )
	FILTER_TYPE=$(run_info_value FILTER_TYPE "$run_info")
	IP_FILE=( $(run_info_values IP_FILE "$run_info") )
	# INPUT_FILE is read from its own INPUT_FILE entries, not from IP_FILE.
	# NOTE(review): assumes the run_info key is spelled INPUT_FILE - confirm
	# against the run_info template.
	INPUT_FILE=( $(run_info_values INPUT_FILE "$run_info") )
	MEMORY_INFO=$(run_info_value MEMORY_INFO "$run_info")
	# Character classes are quoted so the shell cannot glob-expand them.
	BOOL_CEAS=$(run_info_value RUN_CEAS "$run_info" | tr '[:upper:]' '[:lower:]')   # case insensitive
	BOOL_MEME=$(run_info_value RUN_MEME "$run_info" | tr '[:upper:]' '[:lower:]')   # case insensitive
	BOOL_GOM=$(run_info_value RUN_GOM "$run_info" | tr '[:upper:]' '[:lower:]')     # case insensitive
}

load_run_info "$RUN_INFO"

# Parse tool_info.txt file

# Settings whose variable name is identical to their key in the tool_info file.
# Each value is the second "="-separated field of the matching line.
for tool_key in JAVA TOOL_VERSION NGS_PORTAL_PATH QUEUE
do
	printf -v "$tool_key" '%s' "$(grep -w "^$tool_key" $TOOL_INFO | cut -d '=' -f2)"
done

# The workflow script directory is stored under the CHIPSEQ_DIR key.
SOURCE_DIR=$(grep -w '^CHIPSEQ_DIR' $TOOL_INFO | cut -d '=' -f2)

# The memory_info file named in run_info must exist before it is parsed.
if ! [[ -e $MEMORY_INFO ]]
then
	echo "The memory_info file specified at location $MEMORY_INFO does not exist. Please verify that your run_info file contains the correct memory_info file location."
	exit 1
fi

# Parse memory_info.txt file
# Same lookup as above, applied to the memory_info file.
for memory_key in ALIGN_VMEM CLEANUP_VMEM DASHBOARD_VMEM DEFAULT_VMEM DELIVERY_VMEM PORTAL_JVM
do
	printf -v "$memory_key" '%s' "$(grep -w "^$memory_key" $MEMORY_INFO | cut -d '=' -f2)"
done

# Referenced scripts
# Each workflow step is a separate script located directly in SOURCE_DIR.
ALIGN="${SOURCE_DIR}/align.sh"
MAPQC="${SOURCE_DIR}/mapqc.sh"
MAPVIS="${SOURCE_DIR}/mapvis.sh"
PKCALL="${SOURCE_DIR}/pkcall.sh"
NOIDR="${SOURCE_DIR}/noidr.sh"
IDRPREP="${SOURCE_DIR}/idrprep.sh"
REPCALL="${SOURCE_DIR}/repcall.sh"
IDR="${SOURCE_DIR}/idr.sh"
NEARGENES="${SOURCE_DIR}/neargenes.sh"
ANNOT_CEAS="${SOURCE_DIR}/ceas_annotation.sh"
ANNOT_MEME="${SOURCE_DIR}/meme_annotation.sh"
ANNOT_GOM="${SOURCE_DIR}/gom_annotation.sh"
DELIVERY="${SOURCE_DIR}/delivery.sh"
CLEANUP="${SOURCE_DIR}/cleanup.sh"
DASHBOARD="${SOURCE_DIR}/dashboard.sh"

# Files located in NGS_PORTAL_PATH.
SEC_ANLYS="${NGS_PORTAL_PATH}/AddSecondaryAnalysis.jar"
PROPS="${NGS_PORTAL_PATH}/AddSecondaryAnalysis.properties"

# Verify configuration file
# TODO: implement additional checks

# An empty WORK_DIR would turn the paths below into /delivery and /logs at the
# filesystem root, so refuse to continue without one.
if [[ -z "$WORK_DIR" ]]
then
	echo -e "ERROR : WORK_DIR is not set in run_info=$RUN_INFO\n"
	exit 1
fi

# Never overwrite the results of an earlier run.
if [[ -e "$WORK_DIR/delivery" ]]
then
	echo -e "ERROR : delivery directory=$WORK_DIR/delivery exists. Stopping to preserve data.\n"
	exit 1
fi

# Make certain that output directory exists
if ! mkdir -p "$WORK_DIR/logs"
then
	echo -e "ERROR : unable to create log directory=$WORK_DIR/logs\n"
	exit 1
fi

# require_fastqc_dirs FASTQ...
# Exit with status 1 unless, for every FASTQ path given, the directory
# $WORK_DIR/fastqc/<basename without .fastq.gz>_fastqc exists.
require_fastqc_dirs() {
	local fastq fastqc_dir
	for fastq in "$@"
	do
		fastqc_dir=$WORK_DIR/fastqc/$( basename "$fastq" .fastq.gz )_fastqc
		if [[ ! -e "$fastqc_dir" ]]
		then
			echo "ERROR: Unable to find $fastqc_dir, but $fastq is specified in run_info.txt."
			exit 1
		fi
	done
}

# require_mapout_files FASTQ...
# Exit with status 1 unless, for every FASTQ path given, all expected files
# exist under $WORK_DIR/mapout. Three files are always required; four more are
# required when SEQ_TYPE is "PE". The list is rebuilt for every FASTQ and
# checked over its actual length, so no fixed entry count is assumed.
require_mapout_files() {
	local fastq base expected_file
	local -a expected
	for fastq in "$@"
	do
		base=$WORK_DIR/mapout/$( basename "$fastq" .fastq.gz ).$SEQ_TYPE

		expected=(
			"$base.sorted.bam"
			"$base.$FILTER_TYPE.dedup.s1.bam"
			"$base.$FILTER_TYPE.dedup_metrics.txt"
		)

		if [[ $SEQ_TYPE = "PE" ]]
		then
			expected+=(
				"$base.bam"
				"$base.bam.$FILTER_TYPE.bam"
				"$base.$FILTER_TYPE.dedup.bam"
				"$base.$FILTER_TYPE.sorted.bam"
			)
		fi

		for expected_file in "${expected[@]}"
		do
			if [[ ! -f "$expected_file" ]]
			then
				echo "ERROR: Unable to find $expected_file, but $fastq is specified in run_info.txt."
				exit 1
			fi
		done
	done
}

if [[ $PERFORM_ALIGNMENT = true ]]
then
	# One alignment run per END1_SEQ entry; SGE_TASK_ID carries the 1-based
	# entry number in the environment of that run only.
	for ((i = 1; i <= ${#END1_SEQ[@]}; i = i + 1))
	do
		SGE_TASK_ID=$i "$ALIGN" "$RUN_INFO"
	done
else
	# Alignment is skipped: verify that the fastqc directories and the
	# aligned & sorted bams from an earlier run exist.
	# TODO: Move verification to separate file to make this cleaner
	if [[ ! -e "$WORK_DIR/fastqc" ]]
	then
		echo "ERROR: Could not find $WORK_DIR/fastqc files. Please ensure fastqc files are available in $WORK_DIR/fastqc."
		exit 1
	fi

	# If any of these are missing, exit with an error.
	require_fastqc_dirs "${INPUT_FILE[@]}"
	require_fastqc_dirs "${IP_FILE[@]}"

	if [[ ! -e "$WORK_DIR/mapout" ]]
	then
		# The message names the mapout directory, which is the one checked here.
		echo "ERROR: Could not find $WORK_DIR/mapout files. Please ensure all BAM/SAM/SAI files are available in $WORK_DIR/mapout."
		exit 1
	fi

	require_mapout_files "${INPUT_FILE[@]}"
	require_mapout_files "${IP_FILE[@]}"
fi

# Post-alignment steps: MAPQC followed by MAPVIS, once for every END1_SEQ
# entry. SGE_TASK_ID is set to the 1-based entry number for each invocation.
sample_total=${#END1_SEQ[@]}
task_id=1
while (( task_id <= sample_total ))
do
	### 1.2. Generate mapping summary and QC files
	SGE_TASK_ID=$task_id $MAPQC $RUN_INFO

	### 1.3. Generate visualization files
	SGE_TASK_ID=$task_id $MAPVIS $RUN_INFO

	(( task_id += 1 ))
done

### 1.4. Run Peak Caller
# One peak-calling run for every IP_FILE entry, numbered from 1.
ip_total=${#IP_FILE[@]}
task_id=1
while (( task_id <= ip_total ))
do
	SGE_TASK_ID=$task_id $PKCALL $RUN_INFO
	(( task_id += 1 ))
done

#### 2. Collect statistics

# The follow-up steps depend on the configured peak caller.
case "$PEAK_CALLER" in
	sicer )
		### 2.1 noidr.sh
		$NOIDR $RUN_INFO
		;;

	macs2idr )
		### 2.2 idrprep.sh
		$IDRPREP $RUN_INFO

		### 2.3 repcall.sh
		### split bam file from each IP replicate into 2 same-sized pseudo reps, call peaks from each IP rep and also its two pseudo reps
		rep_total=${#IP_FILE[@]}
		rep_id=1
		while (( rep_id <= rep_total ))
		do
			SGE_TASK_ID=$rep_id $REPCALL $RUN_INFO
			(( rep_id += 1 ))
		done

		### 2.4 idr.sh
		$IDR $RUN_INFO
		;;

	* )
		# IDR analysis is not necessary
		echo "IDR Analysis is not performed"
		;;
esac

### 3.1 annot.sh

# run_annotations TASK_COUNT
# For task ids 1..TASK_COUNT run each annotation script whose switch
# (BOOL_MEME, BOOL_CEAS, BOOL_GOM) equals "yes", in that order, passing
# RUN_INFO as the only argument. SGE_TASK_ID is set per command, as in the
# other workflow steps, so no exported task id is left behind for the
# delivery and cleanup scripts that run afterwards.
run_annotations() {
	local task_count=$1
	local task_id

	for ((task_id = 1; task_id <= task_count; task_id++))
	do
		if [[ $BOOL_MEME == "yes" ]]; then SGE_TASK_ID=$task_id "$ANNOT_MEME" "$RUN_INFO"; fi
		if [[ $BOOL_CEAS == "yes" ]]; then SGE_TASK_ID=$task_id "$ANNOT_CEAS" "$RUN_INFO"; fi
		if [[ $BOOL_GOM == "yes" ]]; then SGE_TASK_ID=$task_id "$ANNOT_GOM" "$RUN_INFO"; fi
	done
}

if [[ $BOOL_MEME == "yes" || $BOOL_CEAS == "yes" || $BOOL_GOM == "yes" ]]
then
	if [[ $PEAK_CALLER == "macs2idr" ]]
	then
		# With macs2idr only task ids 1 and 2 are annotated.
		MAX_LOOP_COUNT=2
		echo "Warning: only the first two samples listed in runinfo.txt:IP_FILES will be used for IDR analysis and annotation."
	else
		MAX_LOOP_COUNT=${#IP_FILE[@]}
	fi

	run_annotations "$MAX_LOOP_COUNT"
fi
	
### 4.1 delivery.sh
# TODO MTK: Note: hold_jid needs to be changed when annotation is implemented
$DELIVERY $RUN_INFO

### 5.1 cleanup.sh
# Runs unless --keepintermediatefiles switched CLEAN_UP_FILES to false.
case $CLEAN_UP_FILES in
	true ) $CLEANUP $RUN_INFO ;;
esac

# Stop command tracing before the closing messages.
set +o xtrace
echo "The workflow is complete. The results are available in $WORK_DIR"
# Open the main delivery document in a browser without blocking the script.
firefox "file://$WORK_DIR/delivery/MainDocument.html" &

# Log the end time of the run (unquoted on purpose, as for the start time).
echo $(date)
