#!/usr/bin/perl

=head1 NAME

merge_gene.pl - merge all gene count files into one

=head1 SYNOPSIS

    USAGE: merge_gene.pl -g=gene.mapping.txt -i=input.list -o=output.file

=head1 OPTIONS

B<--input, -i>
	Required. List of all files of gene/exon counts

B<--output, -o>
	Required. Full path to output file

B<--gene_mapping, -g>
	Required. Gene mapping file

B<--help,-h>


=head1  DESCRIPTION

	Merge all gene count files into one.


=head1  INPUT



=head1  OUTPUT


=head1  CONTACT

  bjaysheel@gmail.com


=head1 EXAMPLE

	./merge_gene.pl -g=gene.mapping.txt -i=input.list -o=output.file

=cut

use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use File::Basename;

#### Collect command-line options into %options, keyed by long option name.
my %options = ();
my @option_spec = (
	'input|i=s',
	'output|o=s',
	'gene_mapping|g=s',
	'log|l=s',
	'debug=s',
	'help|h',
);
my $results = GetOptions(\%options, @option_spec) || pod2usage();

#### --help: print the full documentation to STDERR and exit with status 0
pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ) if $options{'help'};

#############################################################################
#### validate the parsed options before doing any work
check_parameters(\%options);

my %genes = ();
my %samples = ();


#### Parse gene mapping file.
#### Each line is split on tabs into five fields:
#### chromosome, gene name, start, stop, coding length.
open(my $map_fh, "<", $options{gene_mapping})
	or die "Could not open gene mapping file $options{gene_mapping}: $!\n";
while (my $line = <$map_fh>) {
	chomp $line;

	#### tolerate blank lines instead of creating an empty-keyed gene
	next if $line =~ /^\s*$/;

	my ($chr, $gene, $start, $stop, $len) = split(/\t/, $line);

	unless (defined $gene && length $gene) {
		print STDERR "Skipping malformed gene mapping line $.: $line\n";
		next;
	}

	#### value for sort order.
	#### X, Y and M follow the numbered chromosomes; anything that is not a
	#### plain number (e.g. "Un_gl000211", "6_apd_hap1") sorts last.  The
	#### match is anchored so that sort_order is always numeric and can be
	#### compared with <=> safely.
	my $order = $chr;
	$order =~ s/chr//;
	if ($order eq "X") {
		$order = 23;
	} elsif ($order eq "Y") {
		$order = 24;
	} elsif ($order eq "M") {
		$order = 25;
	} elsif ($order !~ /\A\d+\z/) {
		$order = 27;
	}

	#### gene name in gene_mapping file is of format
	#### GENEID.CHR
	#### this is always expected to be a unique value in the gene_mapping file
	#### hence just create a hash with gene name from gene_mapping file as the key
	my @g_name = split(/\./, $gene);
	$genes{$gene} = {chr=>$chr, gene=>$g_name[0], start=>$start, stop=>$stop, coding_length=>$len, sort_order=>$order};
}
close($map_fh);

#### Parse each sample gene count file named in the input list.
#### For every known gene a {name, raw, rpkm} record is appended to
#### $genes{ID}->{sample}, and the sample name is registered in %samples.
open(my $list_fh, "<", $options{input})
	or die "Could not open gene list file $options{input}: $!\n";
while (my $file = <$list_fh>) {
	chomp $file;
	next if $file =~ /^\s*$/;    # tolerate blank lines in the list

	my ($filename, $path) = fileparse($file); #get filename from full path.

	#### sample name used to locate the flagstat file
	my $sample = $filename;
	$sample =~ s/\.gene\.count\.tsv$//;

	#### sample name used for the output columns (computed once per file)
	my $sample_name = fileparse($file, ".gene.count.tsv");

	#### <dir>/counts/ is mapped to <dir>/alignment/tophat_<sample>/<sample>.flagstat
	my $m_path = $path;
	$m_path =~ s/counts\/$/alignment\/tophat_$sample/;
	$m_path .= "/" . $sample . ".flagstat";

	#### Read the mapped read count without going through the shell.
	#### Same selection as `cut -f2 | tr "\n" " " | awk '{print $NF}'`:
	#### take field 2 of every tab-delimited line (whole line when it has no
	#### tab) and keep the last whitespace-separated token overall.
	#### NOTE(review): presumably that token is the mapped read count --
	#### confirm against the step that writes the .flagstat file.
	open(my $flag_fh, "<", $m_path)
		or die "Could not open flagstat file $m_path: $!\n";
	my @pieces = ();
	while (my $line = <$flag_fh>) {
		chomp $line;
		if ($line =~ /\t/) {
			my @cols = split(/\t/, $line, -1);
			push(@pieces, $cols[1]);
		} else {
			push(@pieces, $line);
		}
	}
	close($flag_fh);

	my @tokens = split(' ', join(' ', @pieces));
	my $mapped_reads = @tokens ? $tokens[-1] : '';

	#### a zero or non-numeric value would otherwise surface later as an
	#### anonymous "Illegal division by zero"
	unless ($mapped_reads =~ /\A\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\z/ && $mapped_reads > 0) {
		die "Could not read a positive mapped read count from $m_path\n";
	}

	open(my $count_fh, "<", $file) or die "Could not open sample count file $file: $!\n";

	while (my $line = <$count_fh>) {
		chomp $line;

		#### skip the summary counters that are not genes
		next if $line =~ /no_feature|ambiguous|too_low_aQual|not_aligned|alignment_not_unique/;

		#### id: gene_id, value: expression count
		my ($id, $value) = split(/\t/, $line);
		next unless defined $id && length $id && defined $value;

		if (defined $genes{$id}) {
			my $length = $genes{$id}->{coding_length};
			unless ($length && $length > 0) {
				die "Invalid coding length for gene $id in $options{gene_mapping}\n";
			}

			#### progress trace on STDOUT, kept from the original behaviour
			print $id."\t".$sample_name."\n";

			$samples{$sample_name} = 1;
			push(@{$genes{$id}->{sample}}, {
				name => $sample_name,
				raw  => $value,
				rpkm => ((10**9) * $value) / ($mapped_reads * $length),
			});
		} else {
			#### an unknown ID used to autovivify an empty gene entry and
			#### then die on a division by zero; report it and move on
			print STDERR "GENE ID NOT FOUND: $id\n";
		}
	}
	close($count_fh);
}
close($list_fh);

#### Write the merged table: one row per gene with a raw count / RPKM column
#### pair for every sample.
open(my $out_fh, ">", $options{output})
	or die "Could not open file to write $options{output}: $!\n";

#### one fixed column order shared by the header and every row
my @sample_names = sort keys %samples;

my $header="Chr\tGeneID\tStart\tStop\tCodingLength";

foreach my $s (@sample_names) {
	$header .= "\t". $s ."_GeneCount\t" . $s . "_RPKM";
}

print {$out_fh} $header ."\n";

#### rows are ordered by chromosome sort order, then by gene name
my @ordered_keys = sort {
	($genes{$a}->{sort_order} <=> $genes{$b}->{sort_order})
		|| ($genes{$a}->{gene} cmp $genes{$b}->{gene})
} keys %genes;

foreach my $key (@ordered_keys) {
	my $gene = $genes{$key};
	my @fields = @{$gene}{qw(chr gene start stop coding_length)};

	#### Index this gene's records by sample name.  A gene without any
	#### record has no {sample} list at all, so fall back to an empty one
	#### rather than dereferencing undef.
	my %entry_for = map { $_->{name} => $_ } @{ $gene->{sample} || [] };

	#### Emit a value pair for every header column; samples that did not
	#### report this gene get NA so that the columns stay aligned.
	foreach my $s (@sample_names) {
		my $entry = $entry_for{$s};
		push(@fields, $entry ? ($entry->{raw}, $entry->{rpkm}) : ("NA", "NA"));
	}

	print {$out_fh} join("\t", @fields) . "\n";
}

#### buffered write errors only surface at close
close($out_fh) or die "Could not finish writing $options{output}: $!\n";

exit();

#############################################################################
#### check_parameters(\%options)
#### Verify that every required option (input, output, gene_mapping) has a
#### non-empty value in the hash reference passed by the caller.
#### On the first missing option the usage text is printed to STDERR,
#### preceded by a message naming that option, and the script exits with
#### status 2.  Returns nothing when all required options are present.
sub check_parameters {
	my ($opts) = @_;

	my @required = qw(input output gene_mapping);

	foreach my $key (@required) {
		#### test the caller's hash, and accept any non-empty value
		#### (a file literally named "0" is still a valid argument)
		next if defined $opts->{$key} && length $opts->{$key};

		#### pod2usage exits itself because -exitval is numeric
		pod2usage({-exitval => 2,  -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
	}

	return;
}
