#!/usr/local/biotools/perl/5.14.2/bin/perl

=head1 NAME
   exon_count.pl

=head1 SYNOPSIS
    USAGE: exon_count.pl -r=run_info.txt -s=sample_name -i=input_dir -o=output_dir

=head1 OPTIONS

B<--run_info, -r>
	Run info file

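B<--output_dir, -o>
	Output directory; results are written to its "counts" subdirectory

B<--input_dir, -i>
	Input directory; the BAM file is read from
	tophat_SAMPLE/SAMPLE-sorted.unique.bam inside it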

B<--sample, -s>
	Sample name

B<--debug>
	Log level passed to the logger (defaults to 3)

B<--help,-h>
	Print this documentation and exit


=head1  DESCRIPTION
	Run exon-count (bedtools intersect) for a given sample.

=head1  INPUT

=head1  OUTPUT


=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./exon_count.pl -r=run_info.txt -s=sample_name -i=input_dir -o=output_dir

=cut

use lib "/data2/bsi/reference/perl_workflow_ref/lib";
use lib "/data2/bsi/reference/perl_workflow_ref/lib/perl5/x86_64-linux/auto";
use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### collect the command line switches into %options;
#### a failed parse prints the usage text and exits
my %options = ();
my $results = GetOptions(
	\%options,
	'run_info|r=s',
	'output_dir|o=s',
	'input_dir|i=s',
	'sample|s=s',
	'log|l=s',
	'debug=s',
	'help|h',
) or pod2usage();

#### -h / --help: print the full documentation on STDERR and exit with status 0
pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ) if ( $options{'help'} );

#############################################################################
#### set global vars
############################################
my $LONG_WAIT = 300;

#### make sure every required argument was supplied
#### (check_parameters prints usage and exits otherwise)
check_parameters(\%options);

#### parse X_info files
my $config = ParseConfig->new($options{run_info});
my $util = MyUtility->new;

#### configure the per-sample log file, then fetch the logger handle that is
#### used for the rest of the script
my $logger = Workflow::Logger->new('LOG_FILE'=>"$config->{RunInfo}->{logs}/ExonCount.$options{sample}.log",
								   'LOG_LEVEL'=>$options{'debug'});
$logger = Workflow::Logger::get_logger();

$logger->info("Exon count for sample $options{sample} started");

#### make sure <output_dir>/counts exists
create_dir_struct(\%options);

my $this = {};
$this->{input} = "$options{input_dir}/tophat_$options{sample}/$options{sample}-sorted.unique.bam";
$this->{output} = "$options{output_dir}/counts";

#### check input file
check_input($this->{input});

#### NOTE(review): the features file is not used anywhere in this script;
#### the check is kept as-is in case other workflow steps rely on it - confirm.
if (! -s $config->{ToolInfo}->{features}->{value}){
	$logger->logdie("Feature file $config->{ToolInfo}->{features}->{value} is empty");
}

#### the exon BED file is what intersectBed and backfill_exon.pl actually read,
#### so fail early with a clear message when it is missing or empty
if (! -s $config->{ToolInfo}->{exon_bed}->{value}){
	$logger->logdie("Exon BED file $config->{ToolInfo}->{exon_bed}->{value} is missing or empty");
}


#### run bedtools intersect on coordinate sorted bam file.
my $cmd = "$config->{ToolInfo}->{bedtools}->{value}/intersectBed -wb -bed -abam $this->{input}";
$cmd .= " -b $config->{ToolInfo}->{exon_bed}->{value}";
$cmd .= " > $this->{output}/$options{sample}.exon.bed.i";
execute_cmd($cmd);

#### get the exon chromosome, start and stop coordinates and exon id (columns 13-16)
#### from the intersectBed output and count the reads per exon.
#### this is specific to BEDtools 2.16.2 and a bed file containing only exons (ref tool_info file Exon_BED)
#### if using any other version, check to make sure the
#### intersectBed output format hasn't changed.
#### e.g
#### chr1    16943   16993   R0230412_0111:5:2102:1650:81534#ACTGAT/2        50      +       16943   16993   0,0,0   1       50,     0,      chr1    16858   17055   WASH7P.NR_024540.E5
#### chr1    16962   17012   R0230412_0111:5:2307:14905:84483#ACTGAT/1       50      +       16962   17012   0,0,0   1       50,     0,      chr1    16858   17055   WASH7P.NR_024540.E5
#### chr1    17986   18036   R0230412_0111:5:2205:7738:22572#ACTGAT/1        50      +       17986   18036   0,0,0   1       50,     0,      chr1    17915   18061   WASH7P.NR_024540.E8
#### chr1    134984  135034  R0230412_0111:5:2107:19828:46092#ACTGAT/1       50      +       134984  135034  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1
#### chr1    134985  135035  R0230412_0111:5:1108:2808:120254#ACTGAT/1       50      +       134985  135035  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1
#### chr1    134987  135037  R0230412_0111:5:1307:13710:96706#ACTGAT/1       50      +       134987  135037  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1
#### chr1    134997  135047  R0230412_0111:5:1302:15551:38431#ACTGAT/1       50      +       134997  135047  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1
#### chr1    134997  135047  R0230412_0111:5:2305:1925:69148#ACTGAT/1        50      +       134997  135047  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1
#### chr1    134998  135048  R0230412_0111:5:1202:6415:154479#ACTGAT/1       50      +       134998  135048  0,0,0   1       50,     0,      chr1    134773  139696  LOC729737.NR_039983.E1

#### keep only the exon columns (13-16) of every intersect row, then tally how
#### often each distinct exon row occurs; the tallies go to <sample>.exon.bed.i.next
$cmd = join(" | ",
	"cat $this->{output}/$options{sample}.exon.bed.i",
	q{awk '{print $13"\t"$14"\t"$15"\t"$16}'},
	q{awk 'NF{a[$0]++}END{for(i in a)print i,a[i]}'});
$cmd .= " > $this->{output}/$options{sample}.exon.bed.i.next";
execute_cmd($cmd);


#### script to go through the reference and get all the positions; if
#### there is no read for a position, its count is set to 0. The output is
#### sorted in the same order for all the samples.
#### run backfill_exon.pl on the per-exon tallies with the exon BED file as
#### reference; it writes <sample>.exon.count.tsv
$cmd = "perl $config->{ToolInfo}->{workflow_path}->{value}/backfill_exon.pl";
$cmd .= " -i=$this->{output}/$options{sample}.exon.bed.i.next";
$cmd .= " -r=$config->{ToolInfo}->{exon_bed}->{value}";
$cmd .= " -o=$this->{output}/$options{sample}.exon.count.tsv";
execute_cmd($cmd);

#### check if exon count output was created.
if (-s "$this->{output}/$options{sample}.exon.count.tsv") {
	#### remove both intermediate files with a single rm so that a failure
	#### to delete either of them is reflected in the exit status
	$cmd = "rm $this->{output}/$options{sample}.exon.bed.i";
	$cmd .= " $this->{output}/$options{sample}.exon.bed.i.next";
	execute_cmd($cmd);
} else {
	$logger->logdie("Could not create exon count for $options{sample}");
}

$logger->info("Exon count for sample $options{sample} completed");
exit();

#############################################################################
#### Validate the parsed command line options.
####   $options - hash reference holding the parsed options; modified in place.
#### Prints the missing argument plus the usage text on STDERR and exits with
#### status 2 when run_info, output_dir, input_dir or sample is undefined or
#### empty. Sets 'debug' to 3 when no (true) value was supplied.
sub check_parameters {
	my $options = shift;

	my @required = qw(run_info output_dir input_dir sample);

	foreach my $key (@required) {
		#### test definedness/length rather than truth so a value of "0" is accepted
		next if (defined $options->{$key} && length $options->{$key});

		#### pod2usage exits the program itself, nothing after it would run
		pod2usage({-exitval => 2,  -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
	}

	$options->{'debug'} = 3 unless ($options->{'debug'});
}

#############################################################################
#### Log a shell command, run it through system() and abort the workflow
#### when it does not finish cleanly.
####   $cmd - command line handed to system() as a single string
sub execute_cmd {
	my $cmd = shift;

	$logger->info("$cmd");
	system($cmd);

	#### $? is 0 only for a clean exit; it is also non-zero when the command
	#### could not be started (-1) or was killed by a signal (low bits set),
	#### cases in which ($? >> 8) alone can be 0
	if ( $? != 0 ){
		$logger->logdie("ERROR: Following command failed to execute. Exiting execution of workflow\n$cmd");

		#### safety net in case logdie returns
		exit(-1);
	}
}

#############################################################################
#### Make sure the "counts" directory exists below the output directory.
####   $options - hash reference with the parsed options; only output_dir is read
#### An existing directory is only logged, a missing one is created with
#### "mkdir -p" through execute_cmd.
sub create_dir_struct {
	my $options = shift;

	#### read from the hash reference that was passed in, not from the
	#### file-level %options
	my $dir = "$options->{output_dir}/counts";
	if ( -d $dir) {
		$logger->info("Directory $dir exist");
	} else {
		execute_cmd("mkdir -p $dir");
	}
}

#############################################################################
#### Verify that an expected input file exists and is not empty.
####   $file - path of the file to check
#### When the file is missing or empty an error file is created, the failure
#### is reported through $util->reportErrorSGE and the script exits with
#### status 100.
sub check_input {
	my $file = shift;

	#### SGE job details for the report; placeholders when the variables are not set
	my $job_id = defined $ENV{JOB_ID}          ? $ENV{JOB_ID}          : 0;
	my $sgeerr = defined $ENV{SGE_STDERR_PATH} ? $ENV{SGE_STDERR_PATH} : "";
	my $sgeout = defined $ENV{SGE_STDOUT_PATH} ? $ENV{SGE_STDOUT_PATH} : "";

	if (! -s $file){
		#### <base_output_dir>/<pi>/<type>/<output_folder>/error/ExonCount.<sample>.err
		my $error_name = join("/",
			$config->{RunInfo}->{base_output_dir},
			$config->{RunInfo}->{pi},
			$config->{RunInfo}->{type},
			$config->{RunInfo}->{output_folder},
			"error",
			"ExonCount.$options{sample}.err");
		my $message = "EXPECTED FILE WHILE RUNNING EXON COUNT STEP IS MISSING\n\n$file";

		$util->createErrorFile($error_name, $message);
		$util->reportErrorSGE($config->{RunInfo}->{email}, $file, "Exon Count",
							  $error_name, $job_id, $sgeerr, $sgeout);
		exit(100);
	}
}
