#!/bin/bash

########################################################
######  WRAPPER SCRIPT FOR CIRCRNA DETECTION

######          Program:                        circseq_wrapper.sh
######          Date:                           01/11/2016
######          Summary:                        Runs circRNA detection and BLAT filter
########################################################

#######################################
# Print the command-line usage message on stderr.
#######################################
usage() {
  echo -e "Usage: Wrapper for circRNA detection\n circseq_wrapper.sh \n
        1. full path to config file\n" >&2
}

#######################################
# Print the value stored for a key in a KEY=VALUE config file.
# Only the text between the first and the second '=' of a matching line is
# returned; surrounding whitespace (including a trailing carriage return) is
# stripped so the value can be used safely inside double quotes.
# Arguments: $1 - key name (matched as a whole word at the start of a line)
#            $2 - path to the config file
# Outputs:   the value on stdout (nothing if the key is absent)
#######################################
get_config_value() {
  local key=$1
  local file=$2
  grep -w "^${key}" "$file" \
    | cut -d '=' -f2 \
    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

#######################################
# Read every setting the workflow needs from the config file.
# Globals (written): circ_dir script_path samples python bowtie2 anchor_size
#                    bam bowtie_index_base reference_fa_dir blat_ref blat
#                    bedtools wf run_name exon_start_boundary
#                    exon_end_boundary intron_start_boundary
#                    intron_end_boundary exons
# Arguments: $1 - path to the config file
#######################################
load_config() {
  local cfg=$1
  circ_dir=$(get_config_value CIRC_DIR "$cfg")
  script_path=$(get_config_value WORKFLOW_PATH "$cfg")
  samples=$(get_config_value SAMPLENAMES "$cfg")
  python=$(get_config_value PYTHON "$cfg")
  bowtie2=$(get_config_value BOWTIE "$cfg")
  anchor_size=$(get_config_value ANCHOR_SIZE "$cfg")
  bam=$(get_config_value BAM_DIR "$cfg")
  bowtie_index_base=$(get_config_value REF_BOWTIE "$cfg")
  reference_fa_dir=$(get_config_value REF_GENOME_DIR "$cfg")
  blat_ref=$(get_config_value BLAT_REF "$cfg")
  blat=$(get_config_value BLAT "$cfg")
  bedtools=$(get_config_value BEDTOOLS "$cfg")
  wf=$(get_config_value WORKFLOW "$cfg")
  run_name=$(get_config_value RUN_NAME "$cfg")
  exon_start_boundary=$(get_config_value EXON_START_BOUNDARY "$cfg")
  exon_end_boundary=$(get_config_value EXON_END_BOUNDARY "$cfg")
  intron_start_boundary=$(get_config_value INTRON_START_BOUNDARY "$cfg")
  intron_end_boundary=$(get_config_value INTRON_END_BOUNDARY "$cfg")
  exons=$(get_config_value EXONS "$cfg")
}

#######################################
# Run circRNA detection, the BLAT filter and annotation for one sample.
# All files are written to $circ_dir/<sample>/; intermediate files are
# removed at the end.
# Globals (read): the settings written by load_config
# Arguments: $1 - sample name
# Returns:   1 if the sample or scratch directory cannot be created,
#            0 otherwise (failures of individual tools are not fatal)
#######################################
process_sample() {
  local sample=$1
  local sample_dir="$circ_dir/$sample"
  local prefix="$sample_dir/$sample"
  local tab=$'\t'
  local nl=$'\n'
  local tmp id line annotation hits k
  local n_seqs=0

  mkdir -p -- "$sample_dir" || return 1

  # Anchors from the unmapped reads.
  "$python" "$script_path/unmapped2anchors.py" -a "$anchor_size" \
      "$bam/$sample.unmapped.bam" \
    | gzip > "$prefix.unmapped.anchors.gz"

  # Map the anchors and call junctions. find_circ.py's stdout is stored in
  # <sample>.sites.reads and its stderr in <sample>.log; every later step
  # depends on exactly this routing.
  # NOTE(review): the earlier form of this command was written
  #   "> sites.bed 2 > sites.reads 2> log"
  # which (because of the space in "2 >") passed a literal "2" argument to
  # find_circ.py and produced the routing above plus an empty sites.bed.
  # The routing and the "2" argument are kept so find_circ.py is invoked
  # identically -- TODO confirm find_circ.py ignores the argument and drop it.
  "$bowtie2/bowtie2" --reorder --mm "-D$anchor_size" --score-min=C,-15,0 -q \
      -x "$bowtie_index_base" -U "$prefix.unmapped.anchors.gz" \
    | "$python" "$script_path/find_circ.py" -G "$reference_fa_dir" \
        -p "${sample}_" -s "$prefix.sites.log" 2 \
        > "$prefix.sites.reads" 2> "$prefix.log"

  # Keep circular candidates with a value of at least 5 in column 5, sorted
  # by columns 1-3; lines containing '#' are dropped and only the first six
  # columns are kept.
  grep circ "$prefix.sites.reads" > "$prefix.circRNA.reads"
  awk '{if ($5 >= 5) print}' "$prefix.circRNA.reads" \
    | sort -k 1,1 -k 2,2n -k 3,3n \
    | awk '$0 !~ /#/' \
    | cut -f1,2,3,4,5,6 > "$prefix.circRNA.junctions.bed"

  # Collect the de-duplicated ">name<TAB>sequence" records for every junction
  # id (column 4). Tabs are swapped for '#' while sorting, as before.
  while IFS= read -r id; do
    [[ -n "$id" ]] || continue
    grep -F -w -- "$id" "$prefix.log" | awk '{print ">"$1"\t"$2}'
  done < <(cut -f4 "$prefix.circRNA.junctions.bed") \
    | tr "\t" "#" | sort | uniq | tr "#" "\t" \
    > "$prefix.expressed.circRNA.sequences.BLAT_input"

  # One FASTA file per record (tabs become newlines). Written directly
  # instead of through split(1), so there is no suffix limit on the number of
  # records and the working directory is never changed.
  tmp=$(mktemp -d "$sample_dir/tmp.XXXXXX") || return 1
  while IFS= read -r line || [[ -n "$line" ]]; do
    n_seqs=$((n_seqs + 1))
    printf '%s\n' "${line//$tab/$nl}" > "$tmp/$n_seqs.txt"
  done < "$prefix.expressed.circRNA.sequences.BLAT_input"

  # BLAT each record; only outputs with at least two lines are kept.
  : > "$prefix.expressed.circRNA.sequences.BLAT_output"
  for ((k = 1; k <= n_seqs; k++)); do
    "$blat/blat" -minScore=20 -tileSize=6 -minIdentity=100 "$blat_ref" \
        "$tmp/$k.txt" -out=blast8 "$tmp/$k.out.txt"
    [[ -f "$tmp/$k.out.txt" ]] || continue
    hits=$(wc -l < "$tmp/$k.out.txt")
    if ((hits >= 2)); then
      cat "$tmp/$k.out.txt" >> "$prefix.expressed.circRNA.sequences.BLAT_output"
    fi
  done
  rm -rf -- "$tmp"

  # Turn the BLAT hits into a sorted, de-duplicated BED file with start <= end.
  awk '{print $2"\t"$9"\t"$10"\t"$1}' \
      "$prefix.expressed.circRNA.sequences.BLAT_output" \
    | tr "\t" "#" | sort | uniq | tr "#" "\t" \
    | awk '{if ($2 > $3)print $1"\t"$3"\t"$2"\t"$4; else print $1"\t"$2"\t"$3"\t"$4}' \
    | sort -k 1,1 -k 2,2n -k 3,3n > "$prefix.BLAT_output.bed"

  # Junctions overlapping a BLAT hit and spanning more than 10 bases.
  "$bedtools/intersectBed" -a "$prefix.circRNA.junctions.bed" \
      -b "$prefix.BLAT_output.bed" -wa -wb \
    > "$prefix.circRNA.BLAT_output.intersect.bed"
  awk '{if ($3-$2 > 10)print}' "$prefix.circRNA.BLAT_output.intersect.bed" \
    | awk 'BEGIN{OFS="#"} {print $1,$2,$3,$4,$5,$6}' \
    | sort | uniq | tr "#" "\t" > "$prefix.circRNA.final.bed"

  # Final tables: first line of sites.reads as header, then the records of
  # every surviving junction. Outputs are truncated first so a rerun does not
  # append to results of an earlier run.
  head -n 1 "$prefix.sites.reads" > "$prefix.circRNA.expressed.txt"
  : > "$prefix.circRNA.expressed.bed"
  : > "$prefix.circRNA.3prime5prime_fused_junction.fasta"
  while IFS= read -r id; do
    [[ -n "$id" ]] || continue
    grep -F -w -- "$id" "$prefix.circRNA.reads" \
      | tee -a "$prefix.circRNA.expressed.bed" >> "$prefix.circRNA.expressed.txt"
    grep -F -w -- "$id" "$prefix.log" | awk '{print ">"$1"\t"$2}' \
      >> "$prefix.circRNA.3prime5prime_fused_junction.fasta"
  done < <(cut -f4 "$prefix.circRNA.final.bed")

  # Overlaps with each annotation track, concatenated into one file.
  : > "$prefix.intersect.bed"
  for annotation in "$exon_start_boundary" "$exon_end_boundary" \
      "$intron_start_boundary" "$intron_end_boundary" "$exons"; do
    "$bedtools/intersectBed" -a "$prefix.circRNA.expressed.bed" \
        -b "$annotation" -wa -wb >> "$prefix.intersect.bed"
  done

  # Replace the expressed table with its annotated version, but only when the
  # annotation script succeeded.
  perl "$script_path/circRNA_annotation.pl" "$prefix.circRNA.expressed.txt" \
      "$prefix.intersect.bed" "$prefix.circRNA.expressed.annotated.txt" \
    && mv -- "$prefix.circRNA.expressed.annotated.txt" \
        "$prefix.circRNA.expressed.txt"

  # Remove intermediates (-f: some of these names are only left behind by
  # older or interrupted runs).
  rm -f -- "$prefix.sites.bed" "$prefix.sites.reads" \
    "$prefix.circRNA.expressed.reads" \
    "$prefix.circRNA.expressed.sorted.reads" \
    "$prefix.circRNA.junctions.bed" \
    "$prefix.expressed.circRNA.sequences" \
    "$prefix.expressed.circRNA.sequences.BLAT_input" \
    "$prefix.expressed.circRNA.sequences.BLAT_output" \
    "$prefix.BLAT_output.bed" \
    "$prefix.circRNA.BLAT_output.intersect.bed" \
    "$prefix.circRNA.final.bed" "$prefix.log" \
    "$prefix.circRNA.expressed.bed" "$prefix.intersect.bed"
  return 0
}

#######################################
# Entry point: validate the arguments, load the config, process every sample
# listed in SAMPLENAMES (':'-separated) and mail a completion notice to the
# invoking user.
# Arguments: $1 - full path to the config file
# Returns:   1 on a usage or configuration error, 0 otherwise
#######################################
main() {
  if [[ $# -ne 1 ]]; then
    usage
    return 1
  fi

  local config=$1
  if [[ ! -r "$config" ]]; then
    echo "ERROR: cannot read config file: $config" >&2
    return 1
  fi

  set -x
  local start_time end_time subject message recipient sample
  local -a sample_names=()
  start_time=$(date)
  echo "$start_time"

  load_config "$config"
  if [[ -z "$circ_dir" ]]; then
    echo "ERROR: CIRC_DIR is not set in $config" >&2
    return 1
  fi

  ## convert the ':'-separated sample list into array items
  read -r -a sample_names <<< "${samples//:/ }"

  #### STEP1: circRNA detection module
  for sample in "${sample_names[@]}"; do
    process_sample "$sample" \
      || echo "WARNING: could not set up output for sample $sample" >&2
  done

  end_time=$(date)
  subject="$wf workflow completion for RunID ${run_name} "
  # Results of all samples live below circ_dir, so report that directory
  # rather than the directory of whichever sample happened to run last.
  message="$wf workflow began analysis for ${run_name} at ${start_time} and completed on ${end_time}. Circular RNA results are ready at ${circ_dir}"
  ## send the completion email to the user running the script
  recipient=$(id -un)
  printf '%s\n' "$message" | mailx -v -s "$subject" "$recipient"

  date
  return 0
}

main "$@"
