#!/usr/local/biotools/perl/5.14.2/bin/perl

=head1 NAME
   gene_count.pl

=head1 SYNOPSIS
    USAGE: gene_count.pl -r=run_info.txt -s=sample_name -i=input_dir -o=output_dir

=head1 OPTIONS

B<--run_info, -r>
	Run info file

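B<--output_dir, -o>
	Output directory

B<--input_dir, -i>
	Input directory; the BAM file is read from INPUT_DIR/tophat_SAMPLE/SAMPLE-sorted.id.bam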

B<--sample, -s>
	Sample name

B<--help,-h>


=head1  DESCRIPTION
	Run htseq-count for a given sample.

=head1  INPUT

=head1  OUTPUT


=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./gene_count.pl -r=run_info.txt -s=sample_name -i=input_dir -o=output_dir

=cut

use lib "/data2/bsi/reference/perl_workflow_ref/lib";
use lib "/data2/bsi/reference/perl_workflow_ref/lib/perl5/x86_64-linux/auto";
use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### Parsed command-line options, keyed by their long name.
my %options = ();

#### Getopt::Long specifications: "long|short=s" options take a string value,
#### "help|h" is a plain flag.
my @option_specs = (
	'run_info|r=s',
	'output_dir|o=s',
	'input_dir|i=s',
	'sample|s=s',
	'log|l=s',
	'debug=s',
	'help|h',
);

#### Fall back to the usage message when option parsing reports a failure.
my $results = GetOptions(\%options, @option_specs) || pod2usage();

#### --help: print the full documentation to STDERR and exit with status 0.
pod2usage({ -exitval => 0, -verbose => 2, -output => \*STDERR }) if $options{'help'};

#############################################################################
#### set global vars
############################################
#### NOTE(review): $LONG_WAIT is not read anywhere in the visible part of
#### this script; the declaration is kept untouched.
my $LONG_WAIT = 300;

#### make sure all required arguments were supplied (also defaults --debug)
check_parameters(\%options);

#### parse X_info files
my $config = ParseConfig->new($options{run_info});
my $util = MyUtility->new;

#### NOTE(review): the object returned by new() is immediately replaced by the
#### one from get_logger(); presumably new() sets up the log file and level
#### that get_logger() then hands out -- confirm against Workflow::Logger.
my $logger = Workflow::Logger->new('LOG_FILE'=>"$config->{RunInfo}->{logs}/GeneCount.$options{sample}.log",
								  'LOG_LEVEL'=>$options{'debug'});
$logger = Workflow::Logger::get_logger();

$logger->info("Gene count for sample $options{sample} started");

create_dir_struct(\%options);

#### set env variables: put the configured htseqpath value in front of any
#### PYTHONPATH that is already set.
if (defined $ENV{'PYTHONPATH'}) {
	$ENV{'PYTHONPATH'}="$config->{ToolInfo}->{htseqpath}->{value}:$ENV{'PYTHONPATH'}";
} else {
	$ENV{'PYTHONPATH'}="$config->{ToolInfo}->{htseqpath}->{value}";
}

my $this = {};
$this->{input} = "$options{input_dir}/tophat_$options{sample}/$options{sample}-sorted.id.bam";
$this->{output} = "$options{output_dir}/counts";

#### the count table is written, post-processed and verified under this name
my $count_file = "$this->{output}/$options{sample}.gene.count.tsv";

#### check input file (reports the problem and exits when it is missing or empty)
check_input($this->{input});
my $cmd = "";

## run htseq-count on id sorted bam file.
$cmd = "$config->{ToolInfo}->{samtools}->{value}/samtools view $this->{input}";
$cmd .= " | $config->{ToolInfo}->{python}->{value}/python $config->{ToolInfo}->{htseq}->{value}/htseq-count";
$cmd .= " -m intersection-nonempty -q -s no - $config->{ToolInfo}->{features}->{value}";
$cmd .= " > $count_file";
execute_cmd($cmd);

## check if htseq output was created.
if (! -s $count_file) {
	$logger->logdie("Gene count failed for $options{sample}");
}

## if input is paired end data then double the htseq-count values.
## An undefined "paired" setting is treated as not paired.
if (($config->{RunInfo}->{paired} // 0) == 1) {
	my $tmp_file = "$count_file.tmp";

	$cmd = "cat $count_file | awk '{print \$1\"\\t\"\$2*2}'";
	$cmd .= " > $tmp_file";
	execute_cmd($cmd);

	## never move an empty result over a good count file
	if (! -s $tmp_file) {
		$logger->logdie("Doubling paired end gene counts failed for $options{sample}");
	}

	$cmd = "mv $tmp_file $count_file";
	execute_cmd($cmd);
}


fix_pseudoautosomal_genes($count_file);

$logger->info("Gene count for sample $options{sample} completed");
exit(0);

#############################################################################
#### Validate the parsed command-line options.
####
#### Args:    $options - reference to the option hash filled by GetOptions
#### Effects: when a required option is missing or empty, prints a message
####          naming it plus the usage text to STDERR and exits with status 2;
####          sets $options->{debug} to 3 when no (or a false) level was given.
sub check_parameters {
	my $options = shift;

	my @required = qw(run_info output_dir input_dir sample);

	foreach my $key (@required) {
		my $value = $options->{$key};

		#### defined + length, so that a legitimate value such as "0" passes
		next if (defined $value && length $value);

		#### pod2usage() exits itself, nothing after it would run
		pod2usage({-exitval => 2, -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
	}

	$options->{'debug'} = 3 unless ($options->{'debug'});

	return;
}

#############################################################################
#### Log a shell command, run it through system() and log any failure.
####
#### Args:    $cmd - complete shell command line
#### Returns: the value returned by system() (the wait status, or -1 when the
####          command could not be started)
####
#### A failing command is only logged, it does not stop the script: the
#### callers in this file verify the files a command was supposed to produce.
sub execute_cmd {
	my $cmd = shift;

	$logger->info("$cmd");
	my $status = system($cmd);

	if ($status == -1) {
		$logger->info("WARNING: command could not be started ($!): $cmd");
	} elsif ($status & 127) {
		$logger->info(sprintf("WARNING: command died with signal %d: %s", $status & 127, $cmd));
	} elsif ($status >> 8) {
		$logger->info(sprintf("WARNING: command exited with status %d: %s", $status >> 8, $cmd));
	}

	return $status;
}

#############################################################################
#### Make sure the "counts" directory exists below the output directory.
####
#### Args:    $options - reference to the option hash; output_dir is read
#### Effects: logs when the directory is already there, otherwise creates it
####          (including missing parents) with "mkdir -p" via execute_cmd().
sub create_dir_struct {
	my $options = shift;

	#### read the passed-in reference rather than the file-level %options
	my $dir = "$options->{output_dir}/counts";
	if ( -d $dir) {
		$logger->info("Directory $dir exist");
	} else {
		execute_cmd("mkdir -p $dir");
	}
}

#############################################################################
#### Verify that an expected input file exists and is not empty.
####
#### Args:    $file - path of the file to check
#### Effects: when the file is missing or has zero size, an error file is
####          created, the failure is reported through reportErrorSGE() and
####          the script exits with status 100.
sub check_input {
	my $file = shift;

	#### nothing to report for a file with a non-zero size
	return if (-s $file);

	#### job details from the environment, with neutral defaults when unset
	my $job_id = $ENV{JOB_ID} // 0;
	my $sgeerr = $ENV{SGE_STDERR_PATH} // "";
	my $sgeout = $ENV{SGE_STDOUT_PATH} // "";

	my $error_name = "$config->{RunInfo}->{base_output_dir}/$config->{RunInfo}->{pi}/$config->{RunInfo}->{type}/$config->{RunInfo}->{output_folder}/error/GeneCount.$options{sample}.err";
	$util->createErrorFile($error_name, "EXPECTED FILE WHILE RUNNING GENE COUNT STEP IS MISSING\n\n$file");

	my @report = ($config->{RunInfo}->{email}, $file, "Gene Count", $error_name, $job_id, $sgeerr, $sgeout);
	$util->reportErrorSGE(@report);

	exit(100);
}

#############################################################################
#### Fill in zero counts of pseudoautosomal genes.
####
#### Rows of the count file whose id starts with "<GENE>.chr" for one of the
#### genes listed below are grouped by gene. When a gene has at least one row
#### with a zero count and at least one row with a count above zero, every
#### zero count of that gene is replaced by the largest count seen for it.
#### All other rows are left untouched.
####
#### Args:    $file - tab separated count file (id, count); updated in place
#### Effects: the result is written to "<file>_new" and then renamed over
####          $file; nothing is written when no row needs to change.
#### Dies:    when the file cannot be read, written or replaced, so that a
####          partial result never overwrites the existing counts.
sub fix_pseudoautosomal_genes {
	my $file = shift;

	my %is_par_gene = map { $_ => 1 } qw(
		AKAP17A ASMT ASMTL ASMTL-AS1 CD99 CD99P1 CRLF2 CSF2RA DHRSX DHRSX-IT1
		FABP5P13 GTPBP6 IL3RA LINC00102 LINC00106 LINC00108 LINC00685 P2RY8
		PLCXD1 PPP2R3B SHOX SLC25A6 XG ZBED1 AMDP1 DDX11L16 DPH3P2 IL9R SPRY3
		TCEB1P24 TRPC6P VAMP7 WASH6P WASIR1
	);

	open(my $in, '<', $file) or die "Unable to read gene count file $file: $!\n";
	my @rows = <$in>;
	close($in);

	#### first pass: per gene, remember the largest count and which rows
	#### carry a zero count
	my %max_for = ();
	my %zero_rows_of = ();

	foreach my $idx (0 .. $#rows) {
		my $row = $rows[$idx];
		chomp $row;

		my ($id, $count) = split(/\t/, $row);
		next unless (defined $id && defined $count);

		#### the gene name is everything in front of the first ".chr"
		my ($gene) = $id =~ /^(.*?)\.chr/;
		next unless (defined $gene && $is_par_gene{$gene});

		$max_for{$gene} //= 0;
		$max_for{$gene} = $count if ($count > $max_for{$gene});

		push @{ $zero_rows_of{$gene} }, $idx if ($count == 0);
	}

	#### second pass: substitute the zero counts; a gene whose counts are
	#### all zero has nothing to substitute
	my $changed = 0;

	foreach my $gene (keys %zero_rows_of) {
		my $max = $max_for{$gene};
		next unless ($max > 0);

		foreach my $idx (@{ $zero_rows_of{$gene} }) {
			#### only the second column is replaced, the line ending is kept
			$rows[$idx] =~ s/^([^\t]*\t)[^\t\r\n]*/${1}$max/;
			$changed = 1;
		}
	}

	return unless ($changed);

	#### write the complete result next to the original, then swap it in
	my $new_file = "${file}_new";
	open(my $out, '>', $new_file) or die "Unable to write $new_file: $!\n";
	print {$out} @rows or die "Unable to write $new_file: $!\n";
	close($out) or die "Unable to finish writing $new_file: $!\n";

	rename($new_file, $file) or die "Unable to replace $file with $new_file: $!\n";

	return;
}
