#!/bin/bash

########################################################
######  FILTER IDENTIFIED CIRCULAR RNAs FOR FALSE POSITIVES

######          Program:                        BLAT_filter.sh
######          Date:                           01/11/2016
######          Summary:                        Merge log files, extract sequences, format to FA, run BLAT, filter out FP
########################################################

# Exactly three positional arguments are required. On misuse, print the usage
# text to stderr and stop with a non-zero status so a calling workflow can
# tell that nothing was run. The workflow itself lives in the else branch.
if [ "$#" -ne 3 ]
then
	echo -e "Usage: Filter circRNAs\n BLAT_filter.sh \n
	1. full path to config file\n
	2. sample name\n
	3. Start time\n" >&2
	exit 1
else
	# Trace every command so the job log documents the run.
	set -x
	date

	# Positional arguments: config file path, sample name, workflow start time.
	config=$1
	sample=$2
	start_date=$3

	# get_config_value KEY FILE
	# Print the value stored for KEY in a KEY=VALUE config FILE. A line is
	# selected when it starts with KEY as a whole word (so BLAT does not pick
	# up BLAT_REF), and everything after the first '=' is returned, which keeps
	# values that themselves contain '=' intact.
	get_config_value() {
		grep -w -- "^$1" "$2" | cut -d '=' -f2-
	}

	# Scheduler and tool locations.
	qsub=$(get_config_value SGE "$config")                 # directory holding the qsub binary
	bam=$(get_config_value BAM_DIR "$config")
	circ_dir=$(get_config_value CIRC_DIR "$config")
	email=$(get_config_value EMAIL "$config")
	threads=$(get_config_value THREADS "$config")
	queue=$(get_config_value QUEUE "$config")
	script_path=$(get_config_value WORKFLOW_PATH "$config")
	blat_ref=$(get_config_value BLAT_REF "$config")
	blat=$(get_config_value BLAT "$config")
	bedtools=$(get_config_value BEDTOOLS "$config")        # directory holding intersectBed

	# Run identification used in the completion e-mail.
	wf=$(get_config_value WORKFLOW "$config")
	run_name=$(get_config_value RUN_NAME "$config")

	# Annotation files intersected with the expressed circRNAs.
	exon_start_boundary=$(get_config_value EXON_START_BOUNDARY "$config")
	exon_end_boundary=$(get_config_value EXON_END_BOUNDARY "$config")
	intron_start_boundary=$(get_config_value INTRON_START_BOUNDARY "$config")
	intron_end_boundary=$(get_config_value INTRON_END_BOUNDARY "$config")
	exons=$(get_config_value EXONS "$config")
	# Per-sample directory that holds the inputs and results of this step.
	sample_dir=$circ_dir/$sample

	# Pull, for every candidate junction (column 4 of the junctions BED), the
	# detection-log lines that mention its id as a whole word and keep their
	# first two fields as ">field1<TAB>field2".
	# The ids are handed to a single grep as fixed strings, so the logs are read
	# once instead of once per junction and characters such as '.' in an id are
	# matched literally. Blank ids are dropped because an empty pattern would
	# match every line. A log line naming several ids is now emitted once; the
	# next step de-duplicates this file anyway.
	cut -f4 -- "$sample_dir/$sample.circRNA.junctions.bed" \
		| grep -v '^[[:space:]]*$' \
		| grep -h -w -F -f - -- "$circ_dir"/logs/"$sample".circRNA_detection.e* \
		| awk '{print ">"$1"\t"$2}' >> "$sample_dir/$sample.expressed.circRNA.sequences"

	# Scratch directory for the per-sequence BLAT inputs and outputs. Abort if
	# sample_dir is unset rather than operating on "/tmp".
	tmp=${sample_dir:?sample_dir is not set}/tmp
	mkdir -p "$tmp"
	# Remove per-sequence files left by an interrupted earlier run so they
	# cannot leak into the task list or the merged BLAT output.
	rm -f -- "$tmp"/*.txt

	# De-duplicate the extracted sequences; keep one copy next to the sample
	# results and one in the scratch directory.
	LC_ALL=C sort -u -- "$sample_dir/$sample.expressed.circRNA.sequences" \
		> "$sample_dir/$sample.expressed.circRNA.sequences.BLAT_input"
	cp -- "$sample_dir/$sample.expressed.circRNA.sequences.BLAT_input" "$tmp/$sample.expressed.circRNA.sequences.BLAT_input"
	cd "$tmp" || exit 1

	# Write line N of the input to $tmp/N.txt with every tab turned into a
	# newline, and capture the number of files written in $var.
	# This is done in one awk pass instead of split + ls + rename: the old
	# "ls 1*" also picked up the copied input file when the sample name starts
	# with "1", and the result depended on how many suffixes split can generate.
	var=$(awk -v dir="$tmp" '
		{
			fasta = dir "/" NR ".txt"
			gsub(/\t/, "\n")
			print > fasta
			close(fasta)
		}
		END { print NR }
	' "$tmp/$sample.expressed.circRNA.sequences.BLAT_input")

	# List of the per-sequence files (full paths, in ls order).
	ls "$tmp"/*.txt > "$tmp/list"
	mkdir -p "$tmp/logs"
	if [ "${var:-0}" -gt 0 ]
	then
		# Submit blat.sh as an array job with one task per sequence file
		# (-t 1-$var:1); the task logs go to $tmp/logs.
		BLAT=$("$qsub/qsub" -V -wd "$tmp/logs" -m ae -M "$email" -l h_vmem=8G -l h_stack=10M -q "$queue" -t "1-$var:1" "$script_path/blat.sh" "$config" "$sample" "$tmp")

		# The job id is the part before the first '.' of the third word of
		# the qsub reply.
		read -r _ _ job_spec _ <<< "$BLAT"
		job_ids=${job_spec%%.*}
		if [ -z "$job_ids" ]
		then
			# Without a job id the log names below can never exist and the
			# wait loop would spin forever.
			echo "BLAT job submission failed for $sample: '$BLAT'" >&2
			exit 1
		fi

		# Poll every five minutes; a task counts as finished once its stdout
		# log has at least two lines.
		# NOTE(review): a task that dies without writing two lines keeps this
		# loop waiting indefinitely — consider a timeout or a scheduler query.
		sleep 5m
		for ((i = 1; i <= var; i++))
		do
			task_log=$tmp/logs/blat.sh.o${job_ids}.$i
			while [ ! -f "$task_log" ] || [ "$(wc -l < "$task_log")" -lt 2 ]
			do
				echo "BLAT in progress"
				sleep 5m
			done
		done
	else
		# "-t 1-0:1" is not a valid task range; there is nothing to align.
		echo "No sequences to BLAT for $sample; skipping job submission" >&2
	fi
	# Concatenate the per-task BLAT results into one file for the sample. The
	# file is rewritten from scratch, so output of an earlier run is not kept.
	cat -- "$tmp"/*.out.txt > "$sample_dir/$sample.expressed.circRNA.sequences.BLAT_output"

	# Convert the hits to BED: columns 2, 9, 10 and 1 of the BLAT output become
	# chrom, start, end and name, with start/end swapped when start > end.
	# Duplicates are removed after the swap so each interval appears once, and
	# the result is sorted by chrom, start, end.
	awk 'BEGIN { OFS = "\t" }
		{
			if ($9 > $10)
				bed = $2 OFS $10 OFS $9 OFS $1
			else
				bed = $2 OFS $9 OFS $10 OFS $1
			if (!seen[bed]++)
				print bed
		}' "$sample_dir/$sample.expressed.circRNA.sequences.BLAT_output" \
		| sort -k 1,1 -k 2,2n -k 3,3n > "$sample_dir/$sample.BLAT_output.bed"

	# Step out of the scratch directory before deleting it (an earlier step
	# changed into it) and refuse to run rm -rf on an unset path.
	cd "$sample_dir" || exit 1
	rm -rf -- "${sample_dir:?}/tmp"

	# Pair every candidate junction with the BLAT hits that overlap it.
	"$bedtools/intersectBed" -a "$sample_dir/$sample.circRNA.junctions.bed" -b "$sample_dir/$sample.BLAT_output.bed" -wa -wb > "$sample_dir/$sample.circRNA.BLAT_output.intersect.bed"

	# Keep candidates spanning more than 10 bases and reduce each to its first
	# six columns, one line per distinct junction.
	awk 'BEGIN { OFS = "#" } $3 - $2 > 10 { print $1, $2, $3, $4, $5, $6 }' "$sample_dir/$sample.circRNA.BLAT_output.intersect.bed" \
		| sort | uniq | tr "#" "\t" > "$sample_dir/$sample.circRNA.final.bed"

	# Expression table: the header line of the sites table followed by the
	# read lines of every junction that survived, in final.bed order.
	head -n 1 -- "$sample_dir/$sample.sites.reads" > "$sample_dir/$sample.circRNA.expressed.txt"
	while IFS= read -r junction_id
	do
		[ -n "$junction_id" ] || continue
		# -F: the id is literal text, not a regular expression.
		grep -F -w -- "$junction_id" "$sample_dir/$sample.circRNA.reads" >> "$sample_dir/$sample.circRNA.expressed.txt"
	done < <(cut -f4 -- "$sample_dir/$sample.circRNA.final.bed")

	# Read lines of the surviving junctions, without a header, for bedtools.
	# Both intermediate files below are rebuilt from scratch so leftovers of an
	# interrupted earlier run are not annotated a second time.
	while IFS= read -r junction_id
	do
		[ -n "$junction_id" ] || continue
		# -F: the id is literal text, not a regular expression.
		grep -F -w -- "$junction_id" "$sample_dir/$sample.circRNA.reads"
	done < <(cut -f4 -- "$sample_dir/$sample.circRNA.final.bed") > "$sample_dir/$sample.circRNA.expressed.bed"

	# Intersect the expressed circRNAs with each annotation file in turn and
	# collect all overlaps in one file.
	for annotation in "$exon_start_boundary" "$exon_end_boundary" "$intron_start_boundary" "$intron_end_boundary" "$exons"
	do
		"$bedtools/intersectBed" -a "$sample_dir/$sample.circRNA.expressed.bed" -b "$annotation" -wa -wb
	done > "$sample_dir/$sample.intersect.bed"

	# Annotate the expression table and put the annotated version in its place.
	# The table is only replaced when the annotation script succeeded.
	if perl "$script_path/circRNA_annotation.pl" "$sample_dir/$sample.circRNA.expressed.txt" "$sample_dir/$sample.intersect.bed" "$sample_dir/$sample.circRNA.expressed.annotated.txt"
	then
		mv -- "$sample_dir/$sample.circRNA.expressed.annotated.txt" "$sample_dir/$sample.circRNA.expressed.txt"
	else
		echo "circRNA_annotation.pl failed; $sample_dir/$sample.circRNA.expressed.txt left unannotated" >&2
	fi

	# Append the fused-junction sequence lines of every junction that survived
	# filtering, grouped in final.bed order.
	# NOTE(review): each record is written as ">field1<TAB>field2" on one line;
	# the BLAT inputs had that tab turned into a newline — confirm that the
	# single-line form is what consumers of this .fasta expect.
	while IFS= read -r junction_id
	do
		[ -n "$junction_id" ] || continue
		# -F: the id is literal text, not a regular expression.
		grep -h -F -w -- "$junction_id" "$circ_dir"/logs/"$sample".circRNA_detection.e* | awk '{print ">"$1"\t"$2}'
	done < <(cut -f4 -- "$sample_dir/$sample.circRNA.final.bed") >> "$sample_dir/$sample.circRNA.3prime5prime_fused_junction.fasta"

	# clean up: remove the intermediate files of this sample. -f keeps the
	# cleanup quiet when one of them was never created.
	for suffix in \
		sites.bed \
		sites.reads \
		circRNA.expressed.reads \
		circRNA.expressed.sorted.reads \
		circRNA.junctions.bed \
		expressed.circRNA.sequences \
		expressed.circRNA.sequences.BLAT_input \
		expressed.circRNA.sequences.BLAT_output \
		BLAT_output.bed \
		circRNA.BLAT_output.intersect.bed \
		circRNA.final.bed \
		intersect.bed \
		circRNA.expressed.bed
	do
		rm -f -- "$sample_dir/$sample.$suffix"
	done

	# Completion notice: mail the start and end time of the run and the result
	# location to the account running this script, then log the finish time.
	END=$(date)
	# The start time arrives with '_' in place of spaces.
	START=$(echo $start_date | sed 'y/_/ /')

	SUB="$wf workflow completion for RunID ${run_name} "
	MESG="$wf workflow for ${run_name} began analysis at ${START} and completed on ${END}. Circular RNA results are ready at ${sample_dir}"

	# Recipient: the name inside the first pair of parentheses printed by id.
	TO=$(id | awk -F '[()]' '{print $2}')

	## send the completion email
	echo -e "$MESG" | mailx -v -s "$SUB" "$TO"

	echo $(date)
fi	
