#!/usr/local/biotools/perl/5.14.2/bin/perl

=head1 NAME

nearest_gene.pl - identify the nearest gene to a given lincRNA

=head1 SYNOPSIS

    USAGE: nearest_gene.pl -r=run_info.txt -o=output_dir

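=head1 OPTIONS


B<--run_info,-r>
	Required. Complete path to run info file

B<--output_dir, -o>
	Required.  Root output dir.  e.g.: /data2/bsi/secondary/PI/mrnaseq/RUN_ID

B<--debug>
	Optional. Log level handed to the workflow logger. Defaults to 3.

B<--log,-l>
	Optional. Accepted on the command line; this script does not read it.

B<--help,-h>
	Print this documentation and exit.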


=head1 DESCRIPTION

Identify the nearest gene to a given lincRNA.

=head1 INPUT


=head1 OUTPUT

=head1 VERSION

	1.0

=head1 CONTACT

  bjaysheel@gmail.com


=head1 EXAMPLE

	./nearest_gene.pl -r=run_info.txt -o=output_dir

=cut

use lib "/data2/bsi/reference/perl_workflow_ref/lib";
use lib "/data2/bsi/reference/perl_workflow_ref/lib/perl5/x86_64-linux/auto";
use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### collect the command line switches into %options; a parse failure
#### reported by GetOptions prints the usage text and ends the script
my %options = ();
my $results = GetOptions(
    \%options,
    'run_info|r=s',
    'output_dir|o=s',
    'log|l=s',
    'debug=s',
    'help|h',
) or pod2usage();

#### --help: write the complete POD to STDERR and leave with status 0
if ( $options{help} ) {
    pod2usage( { -output => \*STDERR, -verbose => 2, -exitval => 0 } );
}

#############################################################################
#### set global vars
#### NOTE(review): none of these four values is read anywhere in this
#### script; kept as they were in case they are still wanted.
my $MAX_JOB_LIMIT = 3000;
my $LONG_WAIT = 300;
my $SHORT_WAIT = 30;
my $WAIT = 5;

#### make sure everything passed was peachy.  This has to happen BEFORE the
#### run info file is parsed: otherwise a missing --run_info hands an
#### undefined path to ParseConfig instead of printing the usage message.
#### It also fills in the default for $options{debug}.
check_parameters(\%options);

#### create hash of all config info.
my $config = ParseConfig->new($options{run_info});
my $util = MyUtility->new;

#### setup log object
my $logger;

#### build the logger with the log file location taken from the run info and
#### the level taken from --debug, then keep the object that
#### Workflow::Logger::get_logger() hands back
$logger = Workflow::Logger->new(
    LOG_FILE  => "$config->{RunInfo}->{logs}/NearestGene.log",
    LOG_LEVEL => $options{debug},
);
$logger = Workflow::Logger::get_logger();

$logger->info("Nearest Gene started");

#### make sure the working directory for this step is there
create_dir_struct(\%options);

#### set local variables.
my $cmd = "";
my $this = {
    dir   => "$options{output_dir}/lincRNA/neargene",
    input => "$options{output_dir}/Reports/GeneCount.tsv",
};

#### the gene count report has to exist and be non-empty before going on
check_input($this->{input});

#### awk programs used in the shell pipeline below.  They are single-quoted
#### on the perl side so that \t and the awk field variables ($1, $2, ...)
#### reach the shell exactly as written.
my $awk_to_bed = q{awk '{print $1"\t"$3"\t"$4"\t"$2}'};
my $awk_pair   = q{awk '{print $4"#"$8}'};
my $tool_info  = $config->{ToolInfo};

#### get lincRNA gene list:
####   1. reorder the columns of the count report into lincRNACount.bed
####   2. delete the first line of that file
####   3. reorder the columns of the gene mapping file into gene_info.bed
####   4. run windowBed on the two files and keep the unique
####      "<column 4>#<column 8>" pairs of its output
$cmd = join(' && ',
    "$awk_to_bed $this->{input} > $this->{dir}/lincRNACount.bed",
    "sed -i '1d' $this->{dir}/lincRNACount.bed",
    "$awk_to_bed $tool_info->{gene_mapping_hg19}->{value} > $this->{dir}/gene_info.bed",
    join(' ',
        "$tool_info->{bedtools}->{value}/windowBed",
        "-a $this->{dir}/lincRNACount.bed",
        "-b $this->{dir}/gene_info.bed",
        "-w $tool_info->{nearest_gene_window}->{value}",
        "| $awk_pair | sort | uniq > $this->{dir}/lincRNA.gene.list",
    ),
);
execute_cmd($cmd);

#### find nearest genes.
my $workflow_path = $tool_info->{workflow_path}->{value};
$cmd = join(' ',
    "perl $workflow_path/FindNearByGenes.pl",
    "$this->{dir}/lincRNA.gene.list > $this->{dir}/NearestGenes.list",
);
execute_cmd($cmd);

#### get gene names
$cmd = "cut -f4 $this->{dir}/lincRNACount.bed > $this->{dir}/lincRNA.list";
execute_cmd($cmd);

#### fill in all gene info.
$cmd = join(' ',
    "perl $workflow_path/FillGeneInfo.pl",
    "$this->{dir}/lincRNA.list",
    "$this->{dir}/NearestGenes.list",
    "$this->{dir}/Output.list",
);
execute_cmd($cmd);

$logger->info("Nearest Gene completed");
exit();

#############################################################################
#### check_parameters(\%options)
####   Verify that the required command line options are present and fill in
####   defaults for the optional ones.
####
####   options : reference to the hash filled by GetOptions.  It is modified
####             in place: 'debug' is set to 3 when it was not given.
####
####   When 'run_info' or 'output_dir' is missing, the usage text is written
####   to STDERR together with the name of the missing argument and the
####   script exits with status 2.
sub check_parameters {
    my $options = shift;

    my @required = qw(run_info output_dir);

    foreach my $key (@required) {
        #### work on the hash that was passed in, not on a file-level
        #### variable that merely shares its name
        next if defined $options->{$key};

        #### pod2usage() with a numeric -exitval never returns, so nothing
        #### is needed after it
        pod2usage({-exitval => 2,
                   -message => "ARG: $key is required",
                   -verbose => 1,
                   -output  => \*STDERR});
    }

    $options->{'debug'} = 3 unless (defined $options->{'debug'});
}

#############################################################################
#### create_dir_struct(\%options)
####   Make sure <output_dir>/lincRNA/neargene is present.  A missing
####   directory is created with "mkdir -p" through execute_cmd(); an
####   existing one is only mentioned in the log.
sub create_dir_struct {
    my ($opts) = @_;

    my $target = "$opts->{output_dir}/lincRNA/neargene";

    #### not there yet: create it, including any missing parents
    return execute_cmd("mkdir -p $target") unless -d $target;

    $logger->info("Directory $target exist");
}

#############################################################################
#### execute_cmd($cmd)
####   Log a shell command, run it with system() and stop the workflow when
####   it did not finish successfully.
####
####   cmd : command line handed to system() as a single string.
####
####   Returns nothing on success.  On failure the command and the reason
####   are sent to the logger's logdie(); exit(100) follows in case logdie()
####   returns.
sub execute_cmd {
    my $cmd = shift;

    $logger->info("$cmd");
    system($cmd);

    #### keep both values before any other call can overwrite them
    my $status   = $?;
    my $os_error = "$!";

    return if $status == 0;

    #### $? holds more than the exit code: -1 means the command could not be
    #### started at all and the low 7 bits hold the signal that ended it.
    #### Looking at ($? >> 8) alone lets a signal-killed command pass as a
    #### success, so the whole status word is examined here.
    my $reason;
    if ($status == -1) {
        $reason = "could not be started: $os_error";
    }
    elsif ($status & 127) {
        $reason = sprintf("terminated by signal %d", $status & 127);
    }
    else {
        $reason = sprintf("exit status %d", $status >> 8);
    }

    $logger->logdie("ERROR: Following command failed to execute. Exiting execution of workflow\n$cmd\nReason: $reason");
    exit(100);
}

#############################################################################
#### check_input($file)
####   Stop the workflow when an expected input file is missing or empty.
####
####   file : path that must exist with a size greater than zero.
####
####   For a bad file an error file is written under the run's "error"
####   folder through $util->createErrorFile(), $util->reportErrorSGE() is
####   called with the SGE job details found in the environment, and the
####   script exits with status 100.
sub check_input {
    my ($file) = @_;

    #### a non-empty file means there is nothing to report
    return if -s $file;

    #### SGE job details, with neutral defaults when the variables are unset
    my $job_id = defined $ENV{JOB_ID}          ? $ENV{JOB_ID}          : 0;
    my $sgeerr = defined $ENV{SGE_STDERR_PATH} ? $ENV{SGE_STDERR_PATH} : "";
    my $sgeout = defined $ENV{SGE_STDOUT_PATH} ? $ENV{SGE_STDOUT_PATH} : "";

    my $run = $config->{RunInfo};
    my $error_name = "$run->{base_output_dir}/$run->{pi}/$run->{type}/$run->{output_folder}/error/NearestGene.err";

    $util->createErrorFile($error_name, "EXPECTED FILE WHILE RUNNING NEAREST GENE STEP IS MISSING\n\n$file");
    $util->reportErrorSGE($run->{email}, $file, "Nearest Gene", $error_name, $job_id, $sgeerr, $sgeout);

    exit(100);
}
