#!/bin/bash
# This script is used to generate qc files for both paired-end (PE) and single-end (SE) ChIP-Seq data

set -x

# Exactly one argument (the run configuration file) is required.
# Misuse is reported on stderr and ends the script with a non-zero status so
# that the caller (or the job scheduler) can tell the run did not happen.
if [[ $# -ne 1 ]]; then
  echo "usage: mapqc.sh <config file>" >&2
  exit 1
fi

#######################################
# Print the value of KEY from a KEY=VALUE style configuration file.
# Arguments: $1 - configuration file
#            $2 - key name
# Outputs:   the second '='-separated field of every line that starts with
#            KEY as a whole word; nothing when the key is absent
# Returns:   1 (with a message on stderr) when the file is not readable
#######################################
cfg_value() {
  local file=$1 key=$2
  # Guard first: with an empty file name grep would otherwise fall back to
  # reading standard input.
  if [[ ! -r "$file" ]]; then
    echo "mapqc.sh: cannot read configuration file '$file'" >&2
    return 1
  fi
  grep -w -- "^${key}" "$file" | cut -d '=' -f2
}

#######################################
# Print the value(s) of every line whose key starts with KEY.
# Used for list-valued settings; the caller splits the output into words.
# Arguments: $1 - configuration file
#            $2 - key prefix
# Outputs:   the second '='-separated field of each matching line
# Returns:   1 (with a message on stderr) when the file is not readable
#######################################
cfg_list() {
  local file=$1 key=$2
  if [[ ! -r "$file" ]]; then
    echo "mapqc.sh: cannot read configuration file '$file'" >&2
    return 1
  fi
  awk -F '=' -v key="^${key}" '$1 ~ key {print $2}' "$file"
}

# Parse run_info.txt file (the script's single argument) for variables.
# The array assignments rely on word splitting on purpose: every
# whitespace-separated word of the value becomes one array element.
PROJECT_NAME=$(cfg_value "$1" PROJECT_NAME)
SEQ_DIR=$(cfg_value "$1" SEQ_DIR)
SEQ_TYPE=$(cfg_value "$1" SEQ_TYPE)
# shellcheck disable=SC2207 # intentional word splitting
END1_SEQ=( $(cfg_list "$1" END1_SEQ) )
# shellcheck disable=SC2207 # intentional word splitting
END2_SEQ=( $(cfg_list "$1" END2_SEQ) )
SEQ_SUFFIX=$(cfg_value "$1" SEQ_SUFFIX)
# The output directory is taken from the WORK_DIR key.
OUT_DIR=$(cfg_value "$1" WORK_DIR)
PEAK_CALLER=$(cfg_value "$1" PEAK_CALLER)
# shellcheck disable=SC2207 # intentional word splitting
IP_FILE=( $(cfg_list "$1" IP_FILE) )
# shellcheck disable=SC2207 # intentional word splitting
INPUT_FILE=( $(cfg_list "$1" INPUT_FILE) )
tool_info=$(cfg_value "$1" TOOL_INFO)

# Parse tool_info.txt file (path given by TOOL_INFO above) for variables.
FASTQC=$(cfg_value "$tool_info" FASTQC)
BWA_REF=$(cfg_value "$tool_info" BWA_REF)
BWA_PATH=$(cfg_value "$tool_info" BWA_PATH)
MACS_PATH=$(cfg_value "$tool_info" MACS_PATH)
SICER=$(cfg_value "$tool_info" SICER)
SAMTOOLS=$(cfg_value "$tool_info" SAMTOOLS)
BEDTOOLS=$(cfg_value "$tool_info" BEDTOOLS)
PICARD=$(cfg_value "$tool_info" PICARD)
GENOME_TABLE=$(cfg_value "$tool_info" GENOME_TABLE)
TCLR_LIST=$(cfg_value "$tool_info" TCLR_LIST)

# Constants used in this file
MAP_OUTDIR=$OUT_DIR/mapout

# Translate the 1-based SGE array task id into a 0-based index into END1_SEQ.
# Outside an array job SGE_TASK_ID is unset (or the literal "undefined"); the
# plain subtraction then produced -1, which is either a bad subscript or
# silently selects the LAST sample. Validate it and fall back to the first
# sample with a warning instead. 10# forces base 10 so a zero-padded id is not
# parsed as octal.
if [[ ${SGE_TASK_ID:-} =~ ^[0-9]+$ ]] && (( 10#$SGE_TASK_ID >= 1 )); then
  ARRAY_JOB_INDEX=$(( 10#$SGE_TASK_ID - 1 ))
else
  echo "mapqc.sh: SGE_TASK_ID='${SGE_TASK_ID:-}' is not a positive integer;" \
    "using the first END1_SEQ entry" >&2
  ARRAY_JOB_INDEX=0
fi

# Sample name: file name of the selected END1 read file without its directory
# and without the ".<SEQ_SUFFIX>" extension.
SEQ1NAME=$(basename -- "${END1_SEQ[ARRAY_JOB_INDEX]}" ".${SEQ_SUFFIX}")

###############################################################################
# Summary generation starts here
#   - mapping summary file
#   - library complexity summary file
###############################################################################

# Progress banner on stdout: the message for the current sample, framed by a
# line holding a single space above and below it.
printf '%s\n' \
  " " \
  "start generate mapping summary file and mapping QC file: $SEQ1NAME" \
  " "

###############################################################################
# Helper filters. Each one reads standard input and writes standard output so
# the paired-end and single-end branches can share them.
###############################################################################

# Collapse the input to one "value<TAB>count" line per distinct first
# tab-separated field (output order is unspecified).
count_by_first_field() {
  awk 'BEGIN {FS="\t"; OFS="\t"} {a[$1]++} END {for (i in a) print i, a[i]}'
}

# Turn "position<TAB>count" lines into the library complexity record:
#   positions seen once, distinct positions, total count,
#   distinct/total, once/distinct
# With no input at all both ratios are undefined; they are reported as NA
# instead of aborting awk with a division by zero.
library_complexity() {
  awk -F"\t" '
    BEGIN {Pos_c1 = 0; Pos_c2 = 0}
    {if ($2 == 1) Pos_c1 += $2; else Pos_c2 += $2}
    END {
      total = Pos_c1 + Pos_c2
      if (total > 0)
        printf "%d\t%d\t%d\t%.4f\t%.4f\n", Pos_c1, FNR, total, FNR/total, Pos_c1/FNR
      else
        printf "%d\t%d\t%d\tNA\tNA\n", Pos_c1, FNR, total
    }'
}

# Print the header ($1, written with awk escapes such as \n) and then join
# every two consecutive input lines (file name, statistics) with a tab.
join_name_and_stats() {
  awk -v header="$1" \
    'BEGIN {FS="\t"; OFS="\t"; print header} {key=$0; getline; print key,$0;}'
}

# Paired-end mapping classes from SAM text. Consecutive records are treated
# as the two mates of a pair. Output (tab separated):
#   both unique, one unique + one repeat, both repeat, both unmapped,
#   one end unmapped, everything else, sum of the six counters
# Field 12 is expected to carry the BWA XT tag; records without it are
# labelled NA. Properly flagged pairs (83/99 with 163/147) whose tags are
# neither U nor R fall into no counter and are left out of the sum.
pe_mapping_counts() {
  awk 'BEGIN {FS="\t"; OFS="\t"} {if ($12 ~ /XT/) print $2,$12; else print $2,"NA"}' \
    | awk 'NR%2 {printf "%s\t", $0; next} 1' \
    | awk 'BEGIN {FS="\t"; OFS="\t"} {a[$1"\t"$2"\t"$3"\t"$4]++} END {for (i in a) print i, a[i]}' \
    | awk '
        BEGIN {FS="\t"; OFS="\t"; c2a=0; c2b=0; c2c=0; c0=0; c1=0; c3=0; sum=0}
        {
          if ((($1 == 83) || ($1 == 99)) && (($3 == 163) || ($3 == 147))) {
            if (($2 ~ /XT:A:U/) && ($4 ~ /XT:A:U/)) c2a += $5
            else if ((($2 ~ /XT:A:U/) && ($4 ~ /XT:A:R/)) || (($2 ~ /XT:A:R/) && ($4 ~ /XT:A:U/))) c2b += $5
            else if (($2 ~ /XT:A:R/) && ($4 ~ /XT:A:R/)) c2c += $5
          }
          else if (($2 ~ /NA/) && ($4 ~ /NA/)) c0 += $5
          else if ((($2 ~ /NA/) && ($4 !~ /NA/)) || (($4 ~ /NA/) && ($2 !~ /NA/))) c1 += $5
          else c3 += $5
        }
        END {sum = c2a+c2b+c2c+c0+c1+c3; print c2a, c2b, c2c, c0, c1, c3, sum}'
}

# One "flag#chr#pos#matechr#matepos" key per pair with flags 83/99/163/147
# whose mates are both XT:A:U and whose first mate lies on a chromosome
# matching chr1-9..., chrX or chrY. A record with flag 83/99 is joined with
# the record that follows it.
pe_unique_pair_positions() {
  awk 'BEGIN {OFS="\t"} {if (($2 == 83) || ($2 == 99) || ($2 == 163) || ($2 == 147)) print $2,$3,$4,$12}' \
    | awk '/^[8-9]/ {sub(/$/,"\t"); getline t; print $0 t; next}; 1' \
    | awk '{if ((($2 ~ /chr[1-9]/) || ($2 ~ /chr[1-2][0-9]/) || ($2 ~ /chr[X-Y]/)) && ($4 ~ /XT:A:U/) && ($8 ~ /XT:A:U/)) print $1"#"$2"#"$3"#"$6"#"$7}'
}

# Single-end mapping classes from SAM text. Output (tab separated):
#   unique (flag 0/16, XT:A:U), repeat (flag 0/16, XT:A:R),
#   other flags without an XT:A tag, sum of the three counters
se_mapping_counts() {
  awk 'BEGIN {FS="\t"; OFS="\t"} {a[$2"\t"$12]++} END {for (i in a) print i, a[i]}' \
    | awk '
        BEGIN {FS="\t"; OFS="\t"; c1=0; c2=0; c3=0; sum=0}
        {
          if (($1 == 0) || ($1 == 16)) {
            if ($2 ~ /XT:A:U/) c1 += $3
            else if ($2 ~ /XT:A:R/) c2 += $3
          }
          else if ($2 !~ /XT:A/) c3 += $3
        }
        END {sum = c1+c2+c3; print c1, c2, c3, sum}'
}

# One "flag#chr#pos" key per XT:A:U read with flag 0/16 on a chromosome
# matching chr1-9..., chrX or chrY.
se_unique_read_positions() {
  awk 'BEGIN {OFS="\t"} {if ((($2 == 0) || ($2 == 16)) && ($12 ~ /XT:A:U/)) {if (($3 ~ /chr[1-9]/) || ($3 ~ /chr[1-2][0-9]/) || ($3 ~ /chr[X-Y]/)) print $2"#"$3"#"$4 }}'
}

# Table headers, written with awk escape sequences. The text is part of the
# generated files and is kept exactly as it has always been emitted.
pe_mapping_header='\n\n###### Mapping summary\n'
pe_mapping_header+='# Column 1 is the file name\n'
pe_mapping_header+='# Column 2 is the number of uniquely mapped pairs\n'
pe_mapping_header+='# Column 3 is the number of pairs with one end uniquely mapped\n'
pe_mapping_header+='# Column 4 is the number of pairs with both ends mapped to multiple locations\n'
pe_mapping_header+='# Columns 5 is the number of unmapped pairs\n'
pe_mapping_header+='# Columns 6 is the number of pairs with only one end mapped (to one or multiple locations)\n'
pe_mapping_header+='# Columns 7 is the number of unproperly mapped pairs (wrong orientation, wrong size, etc)\n'
pe_mapping_header+='# Columns 8 is the total number of pairs\n'

pe_complexity_header='\n\n###### Summary of library complexity\n'
pe_complexity_header+='# Column 1 is the file name\n'
pe_complexity_header+='# Column 2 is the number of genomic coordinates with a single uniquely mapped pair\n'
pe_complexity_header+='# Column 3 is the number of genomic coordinates with one or more uniquely mapped pairs\n'
pe_complexity_header+='# Column 4 is the total number of uniquely mapped pairs\n'
pe_complexity_header+='# Columns 5 is the ratio of column 3 over column 4\n'
pe_complexity_header+='# Columns 6 is the ratio of column 2 over column 3\n'

se_mapping_header='\n\n###### Mapping summary\n'
se_mapping_header+='# Column 1 is the file name\n'
se_mapping_header+='# Column 2 is the number of uniquely mapped reads\n'
se_mapping_header+='# Column 3 is the number of reads with multiple hits\n'
se_mapping_header+='# Column 4 is the number of unmapped reads\n'
se_mapping_header+='# Column 5 is the total number of reads\n'

se_complexity_header='\n\n###### Summary of library complexity\n'
se_complexity_header+='# Column 1 is the file name\n'
se_complexity_header+='# Column 2 is the number of genomic positions with a single uniquely mapped reads\n'
se_complexity_header+='# Column 3 is the number of genomic positions with one or more uniquely mapped reads\n'
se_complexity_header+='# Column 4 is the total number of uniquely mapped reads\n'
se_complexity_header+='# Columns 5 is the ratio of column 3 over column 4\n'
se_complexity_header+='# Columns 6 is the ratio of column 2 over column 3\n'

# Build <sample>.<SEQ_TYPE>.mapping.summary.tab and
# <sample>.<SEQ_TYPE>.mappingQC.summary.tab in MAP_OUTDIR. The file name and
# the statistics line are piped straight into the join step, so leftovers of
# an interrupted earlier run cannot leak into the tables.
if [[ $SEQ_TYPE = "PE" ]]; then

  # The PE filters pair up consecutive records, so the two mates must be
  # adjacent in this BAM (presumably why the sorted BAM is not used - confirm).
  #files=$MAP_OUTDIR"/"$SEQ1NAME.sorted.PE.bam
  files="$MAP_OUTDIR/$SEQ1NAME.PE.bam"
  bam_name=$(basename -- "$files")

  {
    printf '%s\n' "$bam_name"
    "${SAMTOOLS}/samtools" view "$files" | pe_mapping_counts
  } | join_name_and_stats "$pe_mapping_header" \
    > "$MAP_OUTDIR/${SEQ1NAME}.${SEQ_TYPE}.mapping.summary.tab"

  {
    printf '%s\n' "$bam_name"
    "${SAMTOOLS}/samtools" view "$files" \
      | pe_unique_pair_positions \
      | count_by_first_field \
      | library_complexity
  } | join_name_and_stats "$pe_complexity_header" \
    > "$MAP_OUTDIR/${SEQ1NAME}.${SEQ_TYPE}.mappingQC.summary.tab"

elif [[ $SEQ_TYPE = "SE" ]]; then

  files="$MAP_OUTDIR/$SEQ1NAME.SE.sorted.bam"
  bam_name=$(basename -- "$files")

  {
    printf '%s\n' "$bam_name"
    "${SAMTOOLS}/samtools" view "$files" | se_mapping_counts
  } | join_name_and_stats "$se_mapping_header" \
    > "$MAP_OUTDIR/${SEQ1NAME}.${SEQ_TYPE}.mapping.summary.tab"

  {
    printf '%s\n' "$bam_name"
    "${SAMTOOLS}/samtools" view "$files" \
      | se_unique_read_positions \
      | count_by_first_field \
      | library_complexity
  } | join_name_and_stats "$se_complexity_header" \
    > "$MAP_OUTDIR/${SEQ1NAME}.${SEQ_TYPE}.mappingQC.summary.tab"

else

  # Nothing can be generated for any other value; say so instead of staying
  # silent.
  echo "mapqc.sh: unsupported SEQ_TYPE '${SEQ_TYPE}' (expected PE or SE);" \
    "no summary generated" >&2

fi

# Remove any intermediate name/statistics text files left in the output
# directory; only the .tab summaries are kept. The paths are quoted so that
# directories or sample names containing spaces or glob characters are
# handled literally, and "--" stops option parsing.
rm -f -- \
  "$MAP_OUTDIR/$SEQ1NAME.$SEQ_TYPE.mapping.summary.txt" \
  "$MAP_OUTDIR/$SEQ1NAME.$SEQ_TYPE.mappingQC.summary.txt"

