#!/bin/bash
# This script maps both paired-end (PE) and single-end (SE) ChIP-Seq data.
# For PE data, it can parse pairs with both ends uniquely mapped, pairs with one or both ends uniquely mapped, or primary alignments.
# For SE data, it can parse uniquely mapped reads or primary alignments, with or without mapping-quality-based filtering.
# For both PE and SE data, duplicate removal is optional.
# The original BAM file is sorted,
# and the sorted BAM file is kept instead of the unsorted BAM file.

# Trace every command as it runs so the job log shows exactly what was executed.
set -x

# Exactly one argument is required: the run configuration file.
# A usage error is reported on stderr and with a non-zero exit status so that
# a scheduler or calling script does not mistake a mis-invoked job for success.
if [[ $# -ne 1 ]]; then
	echo "usage: align.sh <config file>" >&2
	exit 1
fi

# Parse run_info.txt file for variables

#######################################
# Print the value of a KEY=VALUE entry from a config file.
# Everything after the first '=' is returned, so values that themselves
# contain '=' (e.g. a command with -Dname=value options) are not truncated.
# Arguments: $1 - config file, $2 - key (matched as a whole word at line start)
# Outputs:   the value on stdout, one line per matching entry
# Returns:   1 without reading anything when the file is missing or unreadable
#            (an empty file name would otherwise make grep wait on stdin)
#######################################
config_value() {
	local file=$1 key=$2
	[[ -r "$file" && ! -d "$file" ]] || return 1
	grep -w -- "^${key}" "$file" | cut -d '=' -f2-
}

#######################################
# Print the second '='-separated field of every entry whose key starts with
# the given prefix; callers word-split the output into an array.
# Arguments: $1 - config file, $2 - key prefix
# Outputs:   one value line per matching entry on stdout
# Returns:   1 when the file is missing or unreadable
#######################################
config_words() {
	local file=$1 key=$2
	[[ -r "$file" && ! -d "$file" ]] || return 1
	awk -v pat="^${key}" 'BEGIN {FS="="} {if ($1 ~ pat) print $2}' "$file"
}

if [[ ! -r "$1" || -d "$1" ]]; then
	echo "align.sh: cannot read run configuration file '$1'" >&2
fi

# Scalar settings of the run configuration.
for cfg_key in PROJECT_NAME SEQ_DIR SEQ_TYPE SEQ_SUFFIX WORK_DIR FILTER_TYPE PEAK_CALLER; do
	printf -v "$cfg_key" '%s' "$(config_value "$1" "$cfg_key")"
done

# List settings: intentionally unquoted so each whitespace-separated file name
# becomes its own array element.
END1_SEQ=( $(config_words "$1" END1_SEQ) )
END2_SEQ=( $(config_words "$1" END2_SEQ) )
IP_FILE=( $(config_words "$1" IP_FILE) )
INPUT_FILE=( $(config_words "$1" INPUT_FILE) )

tool_info=$(config_value "$1" TOOL_INFO)

# Parse tool_info.txt file for variables
if [[ ! -r "$tool_info" || -d "$tool_info" ]]; then
	echo "align.sh: cannot read tool configuration file '$tool_info' (TOOL_INFO)" >&2
fi

for cfg_key in FASTQC BWA_REF BWA_PATH MACS_PATH SICER SAMTOOLS BEDTOOLS PICARD \
	GENOME_TABLE TCLR_LIST UCSC_REF_FLAT JAVA MAP_QUALITY REMOVE_DUP \
	MAP_SE_ARGS MAP_PE_ARGS MAP_BOTH_ARGS; do
	printf -v "$cfg_key" '%s' "$(config_value "$tool_info" "$cfg_key")"
done
unset cfg_key

# Constants used in this file
MAP_OUTDIR=$WORK_DIR/mapout
FASTQC_DIR=$WORK_DIR/fastqc

# Create the output directories. With an empty WORK_DIR the paths above would
# point at the filesystem root, so nothing is created in that case.
if [[ -n "$WORK_DIR" ]]; then
	mkdir -p "$FASTQC_DIR"
	mkdir -p "$MAP_OUTDIR"
else
	echo "WORK_DIR is empty; not creating $FASTQC_DIR and $MAP_OUTDIR" >&2
fi

# Used to index into variables below
# Array-job task ids are 1-based while bash arrays are 0-based. Outside an
# array job SGE_TASK_ID is unset or non-numeric (Grid Engine documents the
# value "undefined"), which would otherwise produce index -1; fall back to the
# first sample instead.
task_id=${SGE_TASK_ID:-}
if [[ ! $task_id =~ ^[1-9][0-9]*$ ]]; then
	echo "SGE_TASK_ID='${task_id}' is not a positive integer; assuming task 1" >&2
	task_id=1
fi
ARRAY_JOB_INDEX=$(( task_id - 1 ))

##################################### check file info ######################################

##################################### start mapping PE reads ###############################

############################################################################################
# Mapping pipeline for the sample selected by ARRAY_JOB_INDEX.
#
# PE: fastqc -> bwa aln/sampe -> BAM -> SortSam -> pair filtering by FILTER_TYPE
#     (U22, U12, U02) -> SortSam -> optional MarkDuplicates -> first mates only (*.s1.bam)
# SE: fastqc -> bwa aln/samse -> BAM -> SortSam -> read filtering by FILTER_TYPE
#     (U1, U0) -> optional MarkDuplicates (*.s1.bam)
#
# All helpers read the configuration globals parsed earlier in this script
# (JAVA, PICARD, SAMTOOLS, BWA_PATH, BWA_REF, FASTQC, MAP_*_ARGS, MAP_QUALITY,
# FILTER_TYPE, REMOVE_DUP, SEQ_*, MAP_OUTDIR, FASTQC_DIR, END1_SEQ, END2_SEQ).
# $JAVA, $FASTQC and the $MAP_*_ARGS values are expanded unquoted on purpose so
# that a configured value may consist of several words.
# Pipelines end each line with '|' instead of a continuation backslash, so a
# stray trailing blank cannot silently cut a pipeline in two.
############################################################################################

# Coordinate-sort a BAM file with Picard SortSam.
# Arguments: $1 - input BAM, $2 - output BAM
sort_bam() {
	$JAVA -Xmx6g -Xms512m \
		-jar "$PICARD/SortSam.jar" \
		INPUT="$1" \
		OUTPUT="$2" \
		MAX_RECORDS_IN_RAM=2000000 \
		SO=coordinate \
		TMP_DIR="$MAP_OUTDIR" \
		VALIDATION_STRINGENCY=SILENT
}

# Remove duplicate reads with Picard MarkDuplicates.
# Arguments: $1 - JVM maximum heap (e.g. 6g), $2 - sorted input BAM,
#            $3 - output BAM, $4 - metrics file, $5 - scratch directory
remove_duplicates() {
	$JAVA "-Xmx$1" -Xms512m -jar "$PICARD/MarkDuplicates.jar" \
		INPUT="$2" \
		OUTPUT="$3" \
		METRICS_FILE="$4" \
		REMOVE_DUPLICATES=TRUE ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT \
		MAX_FILE_HANDLES=100 TMP_DIR="$5"
}

# Keep header lines plus the records whose FLAG is 83 or 99 (per the SAM flag
# bits: the first mate of a properly paired read pair).
# Arguments: $1 - input BAM, $2 - output BAM
keep_first_mates() {
	"${SAMTOOLS}/samtools" view -h "$1" |
		awk '($1 ~ /@/) || ($2==83) || ($2==99)' |
		"${SAMTOOLS}/samtools" view -Sbh - > "$2"
}

# Filter mapped pairs of an unsorted bwa sampe BAM according to FILTER_TYPE.
# Writes <bam>.<FILTER_TYPE>.sam (header), <bam>.sam (one pair per line) and,
# for a supported FILTER_TYPE, <bam>.<FILTER_TYPE>.bam.
# Arguments: $1 - unsorted PE BAM, $2 - name of that BAM for log messages
# Returns:   1 when FILTER_TYPE is not U22, U12 or U02 (no filtered BAM written)
filter_pe_pairs() {
	local raw_bam=$1 label=$2
	local header_sam="$raw_bam.${FILTER_TYPE}.sam"
	local pairs_sam="$raw_bam.sam"
	local filtered_bam="$raw_bam.${FILTER_TYPE}.bam"
	local what

	######### extract header lines and properly mapped pairs #########
	"${SAMTOOLS}/samtools" view -H "$raw_bam" > "$header_sam"

	# 1) cut every record to 12 fields, forcing field 12 to XT:A:N when it is
	#    neither XT:A:U nor XT:A:R
	# 2) join each two consecutive records into one 24-field line; the record is
	#    passed as data to printf because quality strings may contain '%'
	# 3) keep lines whose first record has FLAG 83 or 99 and whose two records
	#    are both on chr1-chr29 or chr[M-Y]
	"${SAMTOOLS}/samtools" view "$raw_bam" |
		awk 'BEGIN {FS="\t"; OFS="\t"} {if (($12 !~ /^XT:A:U/) && ($12 !~ /^XT:A:R/)) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,"XT:A:N"; else print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' |
		awk 'NR%2 {printf "%s\t", $0; next} 1' |
		awk 'BEGIN {FS="\t"; OFS="\t"} {if ((($2==83) || ($2==99)) && (($3 ~ /^chr[1-9]$/) || ($3 ~ /^chr[1-2][0-9]$/) || ($3 ~ /^chr[M-Y]$/)) && (($15 ~ /^chr[1-9]$/) || ($15 ~ /^chr[1-2][0-9]$/) || ($15 ~ /^chr[M-Y]$/))) print $0}' > "$pairs_sam"

	case "$FILTER_TYPE" in
	U22) what="uniquely mapped pairs" ;;
	U12) what="pairs with one or both ends being uniquely mapped" ;;
	U02) what="primary mapped pairs" ;;
	*)
		echo "Unsupported PE FILTER_TYPE '${FILTER_TYPE}' (expected U22, U12 or U02); no filtered BAM written" >&2
		return 1
		;;
	esac

	echo -e "\nStart to parse ${what} from ${label}"

	# U22: both ends XT:A:U, both MAPQ >= mq
	# U12: U22 plus one end XT:A:U and the other XT:A:R, both MAPQ >= mq
	# U02: U12 plus both ends XT:A:R (no MAPQ test)
	# Each kept 24-field line is split back into its two 12-field records.
	awk -v mq="${MAP_QUALITY}" -v mode="${FILTER_TYPE}" '
		BEGIN {FS="\t"; OFS="\t"}
		{
			uniq1 = ($12 ~ /^XT:A:U/); rep1 = ($12 ~ /^XT:A:R/)
			uniq2 = ($24 ~ /^XT:A:U/); rep2 = ($24 ~ /^XT:A:R/)
			mq_ok = (($5 >= mq) && ($17 >= mq))
			keep = (uniq1 && uniq2 && mq_ok)
			if (mode != "U22") keep = keep || (((uniq1 && rep2) || (rep1 && uniq2)) && mq_ok)
			if (mode == "U02") keep = keep || (rep1 && rep2)
			if (keep) print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12"\n"$13,$14,$15,$16,$17,$18,$19,$20,$21,$22,$23,$24
		}' "$pairs_sam" |
		cat "$header_sam" - |
		"${SAMTOOLS}/samtools" view -Sbh - > "$filtered_bam"
}

# Filter single-end reads of a sorted BAM according to FILTER_TYPE and keep
# only reads on chr1-chr29 or chr[M-Y] (header lines are always kept).
#   U1: MAPQ >= MAP_QUALITY and field 12 is XT:A:U (unique alignments)
#   U0: mapped (-F 4) and MAPQ >= MAP_QUALITY (primary alignments)
# Any other FILTER_TYPE writes nothing.
# Arguments: $1 - sorted input BAM, $2 - output BAM
filter_se_reads() {
	local main_chroms='BEGIN {FS="\t"; OFS="\t"} {if (($1 ~ /^@/) || ($3 ~ /^chr[1-9]$/) || ($3 ~ /^chr[1-2][0-9]$/) || ($3 ~ /^chr[M-Y]$/)) print $0}'

	case "$FILTER_TYPE" in
	U1)
		######### parse unique alignments #########
		"${SAMTOOLS}/samtools" view -h -q "${MAP_QUALITY}" "$1" |
			awk 'BEGIN {FS="\t";OFS="\t"} {if (($1 ~ /^@/) || ($12 ~ /^XT:A:U/)) print $0}' |
			awk "$main_chroms" |
			"${SAMTOOLS}/samtools" view -Sbh - > "$2"
		;;
	U0)
		######### parse primary alignments #########
		"${SAMTOOLS}/samtools" view -h -F 4 -q "${MAP_QUALITY}" "$1" |
			awk "$main_chroms" |
			"${SAMTOOLS}/samtools" view -Sbh - > "$2"
		;;
	esac
}

##################################### PE: map, filter pairs, dedup #########################
run_paired_end() {
	local fq1="$SEQ_DIR/${END1_SEQ[ARRAY_JOB_INDEX]}"
	local fq2="$SEQ_DIR/${END2_SEQ[ARRAY_JOB_INDEX]}"

	echo " "
	echo "These are paired-end data, start fastqc then BWA mapping of ${END1_SEQ[ARRAY_JOB_INDEX]} and ${END2_SEQ[ARRAY_JOB_INDEX]}, $(date)"

	$FASTQC -Dfastqc.output_dir="$FASTQC_DIR" "$fq1"
	$FASTQC -Dfastqc.output_dir="$FASTQC_DIR" "$fq2"

	echo "Done running PE fastqc on $fq1 into $FASTQC_DIR"

	SEQ1NAME=$(basename "${END1_SEQ[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}")
	SEQ2NAME=$(basename "${END2_SEQ[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}")

	local sai1="$MAP_OUTDIR/$SEQ1NAME.sai"
	local sai2="$MAP_OUTDIR/$SEQ2NAME.sai"
	# All PE outputs are named after the first mate file.
	local stem="$MAP_OUTDIR/$SEQ1NAME.${SEQ_TYPE}"
	local raw_bam="$stem.bam"

	"$BWA_PATH/bwa" aln $MAP_BOTH_ARGS "$BWA_REF" "$fq1" > "$sai1"
	"$BWA_PATH/bwa" aln $MAP_BOTH_ARGS "$BWA_REF" "$fq2" > "$sai2"

	"$BWA_PATH/bwa" sampe $MAP_PE_ARGS "$BWA_REF" "$sai1" "$sai2" "$fq1" "$fq2" |
		"${SAMTOOLS}/samtools" view -Sbh - > "$raw_bam"

	sort_bam "$raw_bam" "$stem.sorted.bam"

	rm -f "$sai1" "$sai2"

	echo "Finish BWA mapping of ${END1_SEQ[ARRAY_JOB_INDEX]} and ${END2_SEQ[ARRAY_JOB_INDEX]}, $(date)"

	##################################### start processing mapped PE reads #####################
	# Pair filtering reads the UNSORTED BAM because it relies on the two mates
	# of a pair being on consecutive lines.
	filter_pe_pairs "$raw_bam" "$SEQ1NAME.${SEQ_TYPE}.bam"

	local filtered_sorted="$stem.${FILTER_TYPE}.sorted.bam"
	sort_bam "$raw_bam.${FILTER_TYPE}.bam" "$filtered_sorted"

	## remove the intermediate files of the pair filtering
	rm "$raw_bam.${FILTER_TYPE}.sam" "$raw_bam.sam" "$raw_bam.${FILTER_TYPE}.bam"

	##################################### start remove duplicates ########################
	local dedup_stem="$stem.${FILTER_TYPE}.${REMOVE_DUP}"
	if [[ ${REMOVE_DUP} = "dedup" ]]; then
		local scratch="${stem}_${REMOVE_DUP}"

		echo " "
		echo "start to filter out duplicates from $SEQ1NAME.${SEQ_TYPE}.${FILTER_TYPE}.sorted.bam"

		remove_duplicates 6g "$filtered_sorted" "$dedup_stem.bam" "${dedup_stem}_metrics.txt" "$scratch"
		keep_first_mates "$dedup_stem.bam" "$dedup_stem.s1.bam"

		# The MarkDuplicates scratch directory is removed; the de-duplicated
		# two-mate BAM ($dedup_stem.bam) is currently left in place.
		rm -rf "$scratch"

	elif [[ $REMOVE_DUP = "nodedup" ]]; then
		echo " "
		echo "escape the filtering of duplicates from $SEQ1NAME.${SEQ_TYPE}.${FILTER_TYPE}.sorted.bam"

		keep_first_mates "$filtered_sorted" "$dedup_stem.s1.bam"
	fi
	# The filtered, sorted BAM is currently left in place for PE data.
}

##################################### SE: map, filter reads, dedup #########################
run_single_end() {
	local fq="$SEQ_DIR/${END1_SEQ[ARRAY_JOB_INDEX]}"

	echo " "
	echo "These are single-end data, start BWA mapping and fastqc of ${END1_SEQ[ARRAY_JOB_INDEX]}"

	$FASTQC -Dfastqc.output_dir="$FASTQC_DIR" "$fq"

	echo "Done running SE fastqc on $fq into $FASTQC_DIR"

	SEQ1NAME=$(basename "${END1_SEQ[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}")
	echo "$SEQ1NAME"

	local base="$MAP_OUTDIR/$SEQ1NAME"
	local stem="$base.${SEQ_TYPE}"

	"$BWA_PATH/bwa" aln ${MAP_BOTH_ARGS} "$BWA_REF" "$fq" > "$base.sai"
	"$BWA_PATH/bwa" samse ${MAP_SE_ARGS} -f "$base.sam" "$BWA_REF" "$base.sai" "$fq"
	"${SAMTOOLS}/samtools" view -Sbh "$base.sam" > "$base.bam"

	sort_bam "$base.bam" "$stem.sorted.bam"

	rm -f "$base.sai" "$base.sam" "$base.bam"

	################################ start process mapped reads ################################
	echo " "
	echo "start process mapped reads from $SEQ1NAME.${SEQ_TYPE}.sorted.bam"

	local filtered="$stem.${FILTER_TYPE}.sorted.bam"
	filter_se_reads "$stem.sorted.bam" "$filtered"

	######################## start remove duplicates from mapped reads #########################
	local dedup_stem="$stem.${FILTER_TYPE}.${REMOVE_DUP}"
	if [[ ${REMOVE_DUP} = "dedup" ]]; then
		local scratch="${stem}_${REMOVE_DUP}"

		echo " "
		echo "start filter out duplicates from $SEQ1NAME.${SEQ_TYPE}.${FILTER_TYPE}.sorted.bam"

		remove_duplicates 5g "$filtered" "$dedup_stem.s1.bam" "${dedup_stem}_metrics.txt" "$scratch"

		rm -f "$filtered"
		rm -rf "$scratch"

	elif [[ ${REMOVE_DUP} = "nodedup" ]]; then
		echo " "
		echo "escape the filtering of duplicates from $SEQ1NAME.${SEQ_TYPE}.${FILTER_TYPE}.sorted.bam"

		mv "$filtered" "$dedup_stem.s1.bam"
	fi
}

echo "Start to analyze ChIP-Seq data ${PROJECT_NAME}, $(date)"

if [[ $SEQ_TYPE = "PE" ]]; then
	run_paired_end
elif [[ ${SEQ_TYPE} = "SE" && ${#END1_SEQ[@]} -lt 1 ]]; then
	echo " "
	echo "These are single-end data, but the sequence file was not found, stop BWA mapping"
elif [[ ${SEQ_TYPE} = "SE" && ${#END1_SEQ[@]} -ge 1 ]]; then
	run_single_end
fi




