#!/usr/bin/perl

=head1 NAME
   gene_count.pl

=head1 SYNOPSIS
    USAGE: gene_count.pl -r=run_info.txt -s=sample_name -o=output_dir

=head1 OPTIONS

B<--run_info, -r>
	Run info file

B<--output_dir, -o>
	Output directory

B<--sample, -s>
	Sample name

B<--help,-h>
	Print this documentation and exit


=head1  DESCRIPTION
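	Run htseq-count for a given sample, or Subread featureCounts instead
	when use_subread_featurecounts in the run info is set to Y, YES or TRUE.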

=head1  INPUT

=head1  OUTPUT


=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./gene_count.pl -r=run_info.txt -s=sample_name -o=output_dir

=cut

use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### command line values, keyed by the long option name
my %options = ();

#### "long|short=s" options take a string value; help is a plain flag
my @option_specs = (
    'run_info|r=s',
    'output_dir|o=s',
    'sample|s=s',
    'log|l=s',
    'debug=s',
    'help|h',
);

#### print usage when the command line cannot be parsed
my $results = GetOptions(\%options, @option_specs) || pod2usage();

## display the full documentation on request and stop
pod2usage({ -exitval => 0, -verbose => 2, -output => \*STDERR })
    if $options{'help'};

#############################################################################
#### set global vars
############################################
#### seconds check_input() sleeps between two looks at a missing input file
my $LONG_WAIT = 300;

#### make sure everything passed was peachy
check_parameters(\%options);

#### parse X_info files
my $config = ParseConfig->new($options{run_info});
my $util   = MyUtility->new;

#### The constructor is given the log file and level; the handle used from
#### here on is whatever get_logger() returns (presumably the logger that
#### the constructor has just configured -- confirm in Workflow::Logger).
my $logger = Workflow::Logger->new(
    'LOG_FILE'  => "$config->{RunInfo}->{logs}/GeneCount.$options{sample}.log",
    'LOG_LEVEL' => $options{'debug'},
);
$logger = Workflow::Logger::get_logger();

$logger->info("Gene count for sample $options{sample} started");

create_dir_struct(\%options);

#### counting tool: HTSEQ unless the run info sets
#### use_subread_featurecounts to Y, YES or TRUE (any letter case)
my $featureCountTool = "HTSEQ";

if (defined($config->{RunInfo}->{use_subread_featurecounts})) {
    my $subread = uc($config->{RunInfo}->{use_subread_featurecounts});

    if ($subread eq "Y" || $subread eq "YES" || $subread eq "TRUE") {
        $featureCountTool = "SUBREAD";
    }
}

#### state shared with the subs below: $this carries the input file and
#### output directory set by the counting sub, $cmd the shell command
#### currently being built
my $this = {};
my $cmd = "";

if ($featureCountTool eq "SUBREAD") {
    subread_feature_count();
} else {
    htseq_count();
}

#### both counting subs write <output>/<sample>.gene.count.tsv
fix_pseudoautosomal_genes("$this->{output}/$options{sample}.gene.count.tsv");

$logger->info("Gene count for sample $options{sample} completed");
exit(0);


#############################################################################
sub htseq_count {

    #### put the configured htseq path in front of any PYTHONPATH that is
    #### already set in the environment
    my @python_path = ("$config->{ToolInfo}->{htseqpath}->{value}");
    push @python_path, $ENV{'PYTHONPATH'} if defined $ENV{'PYTHONPATH'};
    $ENV{'PYTHONPATH'} = join(':', @python_path);

    my $sample = $options{sample};

    $this->{input}  = "$options{output_dir}/alignment/tophat_${sample}/${sample}-sorted.id.bam";
    $this->{output} = "$options{output_dir}/counts";

    my $count_file = "$this->{output}/${sample}.gene.count.tsv";

    #### returns only once the input bam exists and is not empty
    check_input($this->{input});

    ## stream the id sorted bam through samtools view into htseq-count and
    ## redirect the counts to the sample's count file
    $cmd = join(' ',
        "$config->{ToolInfo}->{samtools}->{value}/samtools view $this->{input}",
        '|',
        "$config->{ToolInfo}->{python}->{value}/python $config->{ToolInfo}->{htseq}->{value}/bin/htseq-count",
        '-m intersection-nonempty -q -s no -',
        $config->{ToolInfo}->{features}->{value},
        '>',
        $count_file,
    );
    execute_cmd($cmd);

    ## a missing or empty count file means htseq-count did not work
    unless (-s $count_file) {
        $logger->logdie("Gene count failed for $sample");
    }

    ## single end data is done; only paired end data gets its counts doubled
    return unless $config->{RunInfo}->{paired} == 1;

    ## awk prints column 1 and twice column 2 into a temporary file ...
    my $doubled_file = "$count_file.tmp";
    $cmd = join(' ',
        'cat', $count_file,
        '|', q{awk '{print $1"\t"$2*2}'},
        '>', $doubled_file,
    );
    execute_cmd($cmd);

    ## ... which then takes the place of the count file
    $cmd = "mv $doubled_file $count_file";
    execute_cmd($cmd);
}


#############################################################################
sub subread_feature_count {

    my $sample = $options{sample};

    $this->{input}  = "$options{output_dir}/alignment/tophat_${sample}/${sample}_sorted.bam";
    $this->{output} = "$options{output_dir}/counts";

    my $count_file = "$this->{output}/${sample}.gene.count.tsv";
    my $raw_file   = "$count_file.original";

    #### returns only once the input bam exists and is not empty
    check_input($this->{input});

    #### featureCounts writes its full table to the ".original" file
    $cmd = join(' ',
        "$config->{ToolInfo}->{subread_bin}->{value}/featureCounts",
        '-a', $config->{ToolInfo}->{features}->{value},
        '-o', $raw_file,
        $this->{input},
    );
    print $cmd ;
    execute_cmd($cmd);

    #### keep columns 1 and 7 and drop the first two lines of the table
    $cmd = "cut -f1,7 $raw_file | tail -n +3 > $count_file";
    print $cmd ;
    execute_cmd($cmd);

    #### a missing or empty count file means one of the steps did not work
    unless (-s $count_file) {
        $logger->logdie("Gene count failed for $sample");
    }
}


#############################################################################
#### Validate the parsed command line options.
####
#### Takes a reference to the options hash. Prints the usage text and exits
#### with status 2 (via pod2usage) when run_info, output_dir or sample is
#### missing or empty. Sets the debug level to 3 when none was given.
#### Works on the hash it is handed, not on any file-level variable.
sub check_parameters {
    my $options = shift;

    my @required = qw(run_info output_dir sample);

    foreach my $key (@required) {
        my $value = $options->{$key};

        #### defined-and-non-empty rather than plain truth, so that a
        #### value of "0" is accepted
        next if defined $value && length $value;

        #### pod2usage prints the message and usage, then exits
        pod2usage({-exitval => 2,
                   -message => "ARG: $key is required",
                   -verbose => 1,
                   -output  => \*STDERR});
    }

    $options->{'debug'} = 3 unless ($options->{'debug'});

    return;
}

#############################################################################
#### Log a shell command, run it through system() and report a failure.
####
#### Takes the command line as one string. Returns the status system()
#### returned: -1 when the command could not be started, otherwise the wait
#### status (exit code in the high byte, signal number in the low 7 bits).
####
#### A failure is written to the log but does not stop the workflow: the
#### code that used to abort here had been commented out.
#### NOTE(review): failures are logged with info() because that and logdie()
#### are the only logger methods used in this file -- switch to an error
#### level method if Workflow::Logger has one.
sub execute_cmd {
    my $cmd = shift;

    $logger->info("$cmd");
    my $status = system($cmd);

    if ($status == -1) {
        $logger->info("ERROR: command could not be started ($!): $cmd");
    } elsif ($status & 127) {
        $logger->info(sprintf("ERROR: command died with signal %d: %s", $status & 127, $cmd));
    } elsif ($status >> 8) {
        $logger->info(sprintf("ERROR: command exited with status %d: %s", $status >> 8, $cmd));
    }

    return $status;
}

#############################################################################
#### Make sure <output_dir>/counts exists.
####
#### Takes a reference to the options hash and reads output_dir from it.
#### Stops the script through the logger when the directory is still missing
#### after the mkdir attempt.
sub create_dir_struct {
    my $options = shift;

    my $dir = "$options->{output_dir}/counts";

    if (-d $dir) {
        $logger->info("Directory $dir exist");
        return;
    }

    execute_cmd("mkdir -p $dir");

    #### execute_cmd does not stop on a failed command, so check the result
    #### here instead of failing later with a less specific message
    $logger->logdie("Could not create directory $dir") unless -d $dir;

    return;
}

#############################################################################
#### Block until the given file exists and is not empty.
####
#### On the first failed check $util->missingInput is called once with the
#### RunInfo email and tool values (presumably a mail notification -- the
#### helper is defined in MyUtility). After that the file is looked at again
#### every $LONG_WAIT seconds, without any upper limit.
sub check_input {
    my ($path) = @_;

    my $already_notified = 0;

    until (-s $path) {
        unless ($already_notified) {
            $already_notified = 1;

            $util->missingInput(
                $config->{RunInfo}->{email},
                "GeneCount",
                "SortBam",
                "$path",
                $config->{RunInfo}->{tool},
            );
        }

        sleep $LONG_WAIT;
    }
}

#############################################################################
#### Fill in zero counts of pseudoautosomal genes in a gene count file.
####
#### The file holds one "<gene>.chr<name><TAB><count>" line per gene and
#### chromosome. For every gene in the list below, each line of that gene
#### with a count of 0 gets the largest count found for the same gene on
#### any of its lines. Genes whose counts are all 0, genes not in the list
#### and lines of any other shape are left untouched.
####
#### Takes the path of the count file. The new content is written to
#### "<file>_new", which then replaces the file; nothing is written when no
#### line needs a change. Returns the number of lines changed and dies when
#### the file cannot be read or replaced.
sub fix_pseudoautosomal_genes {
    my $file = shift;

    my %is_par_gene = map { $_ => 1 } qw(
        AKAP17A ASMT ASMTL ASMTL-AS1 CD99 CD99P1 CRLF2 CSF2RA DHRSX
        DHRSX-IT1 FABP5P13 GTPBP6 IL3RA LINC00102 LINC00106 LINC00108
        LINC00685 P2RY8 PLCXD1 PPP2R3B SHOX SLC25A6 XG ZBED1 AMDP1
        DDX11L16 DPH3P2 IL9R SPRY3 TCEB1P24 TRPC6P VAMP7 WASH6P WASIR1
    );

    #### $1 is the gene name in front of ".chr", $2 the count in column 2
    my $count_line = qr/\A([^.\t]+)\.chr[^\t]*\t(\d+)(?=\s|\z)/;

    open(my $in, '<', $file) or die "Cannot read $file: $!\n";
    my @lines = <$in>;
    close($in);

    #### pass 1: largest non-zero count per listed gene, and the position
    #### of every zero count that may have to be replaced
    my %max_count_of;
    my @zero_counts;    # [line index, gene, offset of count, length of count]

    for my $idx (0 .. $#lines) {
        next unless $lines[$idx] =~ $count_line;

        my ($gene, $count)   = ($1, $2);
        my ($offset, $width) = ($-[2], $+[2] - $-[2]);
        next unless $is_par_gene{$gene};

        if ($count == 0) {
            push @zero_counts, [$idx, $gene, $offset, $width];
        } elsif (!defined $max_count_of{$gene} || $count > $max_count_of{$gene}) {
            $max_count_of{$gene} = $count;
        }
    }

    #### pass 2: put the largest count of the gene in place of each zero
    my $fixed = 0;

    for my $zero (@zero_counts) {
        my ($idx, $gene, $offset, $width) = @{$zero};

        #### no entry means every count of this gene is zero
        my $max = $max_count_of{$gene} or next;

        substr($lines[$idx], $offset, $width, $max);
        $fixed++;
    }

    return 0 unless $fixed;

    #### write next to the original, then swap the two files
    my $new_file = "${file}_new";

    open(my $out, '>', $new_file) or die "Cannot write $new_file: $!\n";
    print {$out} @lines or die "Cannot write $new_file: $!\n";
    close($out) or die "Cannot close $new_file: $!\n";

    rename($new_file, $file) or die "Cannot replace $file with $new_file: $!\n";

    return $fixed;
}
