#!/usr/bin/perl -w
use strict;

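# Outputs a vcf file in which each variant from the input vcf gets an INFO tag
# DJ with the distance to the closest junction (start or end) of its closest
# exon. The EC tag (coverage in that exon) is currently disabled in the code.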

# Usage text shown when a required argument is missing. The arguments are
# listed in the order in which they are actually read below.
my $usage = "get_exon_cvg.pl ref input_vcf bedtools_dir > out_vcf\n";
$usage   .= "Adds distance to junction information (INFO tag DJ)\n";
$usage   .= "ref is a bed file with the exon reference\n";
$usage   .= "bedtools_dir is the directory that contains closestBed\n";

my $ref = shift or die $usage;
my $invcf = shift or die $usage;
my $bedtools = shift or die $usage;
#my $inbam = shift or die $usage;

my $closestBed=$bedtools . "/closestBed";

# Number of columns that belong to the VCF record in each closestBed output
# line; the columns of the closest exon follow immediately after them.
# 10 matches a single-sample VCF and is used when no #CHROM header is seen;
# otherwise the value is taken from the header line.
my $n_vcf_cols = 10;

# We use closestBed to find the closest exon for each line in the vcf.
# The list form of open runs the program without a shell, so paths that
# contain spaces or shell metacharacters are passed through untouched, and
# the output is processed as a stream instead of being held in memory.
open(my $closest_fh, '-|', $closestBed, '-header', '-t', 'first', '-a', $invcf, '-b', $ref)
    or die "Cannot run $closestBed: $!\n";

# We process the vcf-bed line by line
while (my $line = <$closest_fh>) {

    # Remove any mix of CR/LF line terminators and ignore empty lines
    $line =~ s/[\r\n]+\z//;
    next if $line eq '';

    # Meta-information header lines are copied unchanged
    if ($line =~ m/^\#\#/) {
        print $line."\n";
        next;
    }

    # Column header line: learn the VCF width and declare the new INFO tag
    # just before it
    if ($line =~ m/^\#/) {
        my @header = split (/\t/, $line);
        # A VCF has at least 8 fixed columns; anything shorter is not a
        # usable tab-separated header, so the default width is kept.
        $n_vcf_cols = scalar(@header) if @header >= 8;

        #print "\#\#INFO=<ID=EC,Number=1,Type=Float,Description=\"Average coverage in closest exon\">\n";
        print "\#\#INFO=<ID=DJ,Number=1,Type=Integer,Description=\"Distance to closest junction\">\n";
        print $line."\n";
        next;
    }

    my @fields = split (/\t/, $line);

    # Get the coordinates of the exon, which follow the VCF columns
    if (@fields >= $n_vcf_cols + 3) {
        my ($chr, $start, $end) = @fields[$n_vcf_cols .. $n_vcf_cols + 2];
        #my $cvg = sprintf("%.2f", calc_coverage($chr, $start, $end, $inbam));

        # Only annotate when real exon coordinates are present. When no exon
        # exists on the variant's chromosome, bedtools writes placeholder
        # values ("." / -1), and a distance computed from them is meaningless.
        if ($start =~ m/^\d+$/ && $end =~ m/^\d+$/) {
            my $dist = calc_dist($fields[1],$start,$end);

            # Print the distance to junction in the INFO field. An INFO of "."
            # means "no data", so it is replaced rather than extended.
            #$fields[7].= ";EC=$cvg;DJ=$dist";
            if ($fields[7] eq '.' || $fields[7] eq '') {
                $fields[7] = "DJ=$dist";
            }
            else {
                $fields[7] .= ";DJ=$dist";
            }
        }
    }

    # Output only the VCF columns, dropping the appended exon columns
    my $last_col = $n_vcf_cols - 1;
    $last_col = $#fields if $last_col > $#fields;
    print join ("\t", @fields[0..$last_col])."\n";
}

# close() on a pipe returns false when the child exited with an error, so a
# failed closestBed run no longer produces a silently truncated vcf.
close($closest_fh)
    or die "$closestBed failed (exit status " . ($? >> 8) . ")\n";


# Returns the distance from $pos to the nearer of the two interval
# boundaries $start and $end (absolute differences, so the order of the
# operands and the side on which $pos lies do not matter).
sub calc_dist {
    my ($pos, $start, $end) = @_;

    my $nearest = abs($pos - $start);
    my $to_end  = abs($end - $pos);

    # Keep whichever boundary is closer; on a tie both values are equal
    $nearest = $to_end if $to_end <= $nearest;

    return $nearest;
}

# Coverage using samtools. It is approximate, and the readlen is hardcoded,
# but is much faster.
#
# Arguments: $chr, $start, $end - region to query; $inbam - path of the bam
#            file passed to "samtools view".
# Returns:   (number of alignment lines printed for the region) * READLEN
#            divided by the region length; 0 when the region is empty.
# Dies when samtools cannot be started; warns when it exits with an error.
sub calc_coverage {
    my ($chr, $start, $end, $inbam) = @_;
    my $READLEN = 50;

    # An empty or inverted region has no coverage; returning early also
    # avoids a division by zero below.
    my $span = $end - $start + 1;
    return 0 if $span <= 0;

    # The list form of open runs samtools without a shell, so the arguments
    # are passed through untouched. Counting the lines here instead of piping
    # into "wc -l" keeps the samtools exit status visible.
    open(my $sam_fh, '-|', 'samtools', 'view', $inbam, "$chr:$start-$end")
        or die "Cannot run samtools: $!\n";

    my $num = 0;
    while (defined(my $read = <$sam_fh>)) {
        $num++;
    }

    # close() on a pipe returns false when the child exited with an error
    close($sam_fh)
        or warn "samtools view failed (exit status " . ($? >> 8) . ") for $inbam $chr:$start-$end\n";

    return $num*$READLEN/$span;
}

# Coverage using GATK. It tends to be very slow due to initializing the JVM
# each time.
#
# Runs GATK DiagnoseTargets on the region $chr:$start-$end of $inbam and
# returns the text extracted by the shell pipeline below: non-header lines,
# column 8, first ";"-separated entry, the part after "=". The stderr of the
# GATK command is appended to tmp.err in the current directory.
sub calc_coverage_gatk {
    my ($chr, $start, $end, $inbam) = @_;

    # Hardcoded installation paths
    my $java_bin  = "/usr/java/latest/bin/java";
    my $gatk_home = "/projects/bsi/bictools/apps/alignment/GenomeAnalysisTK/2.4-3-g2a7af43/";
    my $genome_fa = "/data2/bsi/staff_analysis/m087717/rna_variant/supplied_results/allchr.sorted.fa";

    # GATK DiagnoseTarget command, assembled word by word; the trailing
    # space is kept so the final shell string matches the original layout
    my $gatk_cmd = join(" ",
        $java_bin,
        "-jar", "$gatk_home/GenomeAnalysisTK.jar",
        "-fixMisencodedQuals",
        "-K", "$gatk_home/Hossain.Asif_mayo.edu.key",
        "-l", "ERROR",
        "-R", $genome_fa,
        "-T", "DiagnoseTargets",
        "-I", $inbam,
        "-L", "$chr:$start-$end",
        "-dt", "none",
        "2>>tmp.err",
    ) . " ";

    #print $gatk_cmd."\n";
    # Get the relevant information using shell
    my $coverage = qx{$gatk_cmd | grep -v "^#" | cut -f8 | cut -d ";" -f1 | cut -d "=" -f2 };

    chomp $coverage;
    return $coverage;
}
