#!/usr/bin/perl

=head1 NAME
   add_unique_exon_id.pl

=head1 SYNOPSIS
    USAGE: add_unique_exon_id.pl -r=run_info_file -o=/path/to/output_dir [--debug=LEVEL]

=head1 OPTIONS

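B<--run_info, -r>
	Required. Run info file handed to ParseConfig. The reference file to be
	modified is read from the ToolInfo "features" value of that
	configuration.  File must be in GTF format.

B<--output_dir, -o>
	Required. Directory in which the output files are written
	/some/path/to/output_dir

B<--log, -l>
	Optional. Accepted on the command line but not used by this script.

B<--debug>
	Optional. Value passed to the logger as its log level.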

B<--help,-h>


=head1  DESCRIPTION
	Create a new GTF file with prefix OUTPUT_PREFIX that contains
	unique id for each exon.

	Create a new EXON bed file containing only
		chr# start stop unique_exon_id
	This file will be sorted by chr# start and stop values.


=head1  INPUT
	Reference file in GTF format


=head1  OUTPUT
	Update reference file by adding unique id to each exon and exon bed file

=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./add_unique_exon_id.pl -r=run_info_file -o=/path/to/output_dir

=cut

use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use File::Basename;
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### Collect the command-line switches into %options. When GetOptions
#### reports a parsing failure the usage text is shown through pod2usage.
my %options = ();
my $results = GetOptions(
	\%options,
	qw(
		run_info|r=s
		output_dir|o=s
		log|l=s
		debug=s
		help|h
	),
) || pod2usage();

#### --help: print the complete manual on STDERR and leave with status 0.
pod2usage( {-output => \*STDERR, -verbose => 2, -exitval => 0} ) if ( $options{'help'} );

#############################################################################
#### set global vars
############################################
#### number of seconds check_input sleeps between two looks at a file that
#### is still missing or empty
my $LONG_WAIT = 300;

#### make sure everything passed was peachy
check_parameters(\%options);

#### parse X_info files
my $config = ParseConfig->new($options{run_info});
my $util = MyUtility->new;

#### configure the logger (log file below the run's log directory, level
#### taken from --debug) and then fetch the logger handle used from here on.
my $logger = Workflow::Logger->new('LOG_FILE'=>"$config->{RunInfo}->{logs}/ModifyGTF.log",
								   'LOG_LEVEL'=>$options{'debug'});
$logger = Workflow::Logger::get_logger();

$logger->info("Modify GTF started");

#### base name of the input GTF without directory and without the ".gtf"
#### suffix. fileparse treats suffixes as regular expressions, so the dot is
#### escaped to strip a literal ".gtf" only.
my $filename = fileparse($config->{ToolInfo}->{features}->{value}, qr/\.gtf/);

#### gene.chr key -> {gene, chr, exons => [{start, stop}, ...]}
my %genes = ();
#### gene.transcript id -> number to give to the next exon of that transcript
my %hash = ();
#### only referenced by the disabled duplicate-gene check in the read loop
my %unique = ();

#### All handles opened here are closed by the code that follows; every
#### failure message carries the system error ($!) so the cause is visible.

#### open file handler to read original gtf file
open (FHD, "<", $config->{ToolInfo}->{features}->{value})
	or die "Could not open file $config->{ToolInfo}->{features}->{value}: $!\n";

#### file handler to write modified gtf
open (GTF, ">", "$options{output_dir}/$filename".".mod.gtf")
	or die "Could not open file $options{output_dir}/$filename".".mod.gtf: $!\n";

#### file handler to write exons only in bed format
open (EXN, ">", "$options{output_dir}/$filename".".exon.bed")
	or die "Could not open file $options{output_dir}/$filename".".exon.bed: $!\n";

#### file handler to write gene mapping file
open (GMP, ">", "$options{output_dir}/$filename".".gene.mapping.txt")
	or die "Could not open file $options{output_dir}/$filename".".gene.mapping.txt: $!\n";

#### file handler for chrs that are skipped
open (SKP, ">", "$options{output_dir}/$filename".".skip.txt")
	or die "Could not open file $options{output_dir}/$filename".".skip.txt: $!\n";

#### Read the GTF line by line.
####   - lines whose first column is not chr<number>, chrX, chrY or chrM have
####     that column written to the skip file and are dropped
####   - non exon features only get the chromosome appended to their gene_id
####   - exon features additionally get an exon_id attribute, a line in the
####     exon bed file and an entry in %genes for the gene mapping file
####
#### $prev remembers the last exon that was recorded (coordinates, chromosome
#### and gene) so an identical exon on the next line is recorded only once.
my $prev = {start=>0, stop=>0, gene=>'', chr=>''};

while (my $line = <FHD>) {
	chomp $line;

	my @data = split(/\t/, $line);

	#### a blank line yields no columns at all; skip it rather than
	#### warn about an undefined chromosome column.
	next unless (@data);

	#### skip contigs
	if ($data[0] !~ /^chr\d+$|^chrX$|^chrY$|^chrM$/) {
		print SKP $data[0]."\n";
		next;
	}

	#### only adding ids to exon features.
	if ($data[2] !~ /exon/) {
		#### accept any quoted id, not only \w+, so ids holding - or . get
		#### the same chromosome suffix as on exon lines. \b keeps
		#### attributes that merely end in gene_id from being rewritten.
		$data[8] =~ s/\bgene_id "([^"]+)"/gene_id "$1.$data[0]"/;
		print GTF join ("\t", @data) ."\n";
		next;
	}

	#### extract gene and transcript id from each line of gtf file.
	##############################################################
	my @info = split(/;/, $data[8]);

	## could have - or _ in the id better to deal with each
	## individually rather than creating a global regex
	my $gene_id = "";
	my $transcript_id = "";

	#### assuming order of gene_id and transcript_id is unknown.
	foreach my $attr (@info) {
		$attr =~ s/^\s+//;
		$attr =~ s/\s+$//;

		if ($attr =~ /^gene_id/) {
			$gene_id = $attr;
			$gene_id =~ s/gene_id\s+//;
			$gene_id =~ s/\"//g;
			$gene_id =~ s/\s+//g;
			#print "GeneID: $gene_id\n";
		}
		elsif ($attr =~ /^transcript_id/) {
			$transcript_id = $attr;
			$transcript_id =~ s/transcript_id\s+//;
			$transcript_id =~ s/\"//g;
			$transcript_id =~ s/\s+//g;
			#print "TranscriptID: $transcript_id\n";
		}
	}

	#### create a hash of genes -> chr
	#### to count for duplicate genes id across diff chr
	#### key for hash is gene_chr
	my $chr = $data[0];
	my $key = $gene_id .".". $chr;

	#### check if a gene_id is duplicate
	#if (exists $unique{$gene_id}) {
	#	if ($unique{$gene_id} !~ /$data[0]/i) {
	#		print "Duplicate gene $gene_id in $unique{$gene_id} and $data[0]\n";
	#	}
	#} else {
	#	$unique{$gene_id} = $data[0];
	#}

	if (! exists $genes{$key}){
		$genes{$key} = {gene => $gene_id, chr=>$chr};
	}

	#### remove transcript spanning exact same region of the gene in order to get
	#### accurate coding length.
	#### ignore if start, stop, chr and gene are exact same as previously seen
	#### i.e transcript spanning same region.
	#### chr and gene are compared as case-insensitive strings; using them
	#### as patterns would treat characters such as . or + as regex syntax.
	my $same_as_prev = ($prev->{start} == $data[3])
		&& ($prev->{stop} == $data[4])
		&& (lc($prev->{chr}) eq lc($chr))
		&& (lc($prev->{gene}) eq lc($gene_id));

	unless ($same_as_prev) {
		push (@{$genes{$key}->{exons}}, {start=>$data[3], stop=>$data[4]});

		$prev->{start} = $data[3];
		$prev->{stop} = $data[4];
		$prev->{chr} = $chr;
		$prev->{gene} = $gene_id;
	}

	#### \Q..\E so the id is matched literally whatever characters it holds.
	$data[8] =~ s/\bgene_id "\Q$gene_id\E"/gene_id "${gene_id}.$chr"/;

	#### exons are numbered per gene.transcript in the order they are read,
	#### starting at 1.
	my $id = $gene_id . "." . $transcript_id;
	if (! exists $hash{$id}) {
		$hash{$id} = 1;
	}

	my $exon_id = $id . ".E" . $hash{$id};

	$data[8] .= " exon_id \"" . $exon_id . "\";";

	print GTF join ("\t", @data) ."\n";
	print EXN join ("\t", $data[0], $data[3], $data[4], $exon_id) ."\n";

	$hash{$id}++;
}

close(FHD);

#### buffered write errors only show up at close time, so check them.
close(GTF) or die "Could not close file $options{output_dir}/$filename".".mod.gtf: $!\n";
close(EXN) or die "Could not close file $options{output_dir}/$filename".".exon.bed: $!\n";
close(SKP) or die "Could not close file $options{output_dir}/$filename".".skip.txt: $!\n";

#### create gene mapping file.
#### sum all exons to get coding length,
#### min start value is start of gene
#### max stop value is stop of gene.

#### all transcripts are sorted by start position,
#### therefore only the following cases can occur
#### case 1
####	|---------------------------|
####	|---------------------------|

#### case 2
####	|---------------------------|
####		|---------------------|

#### case 3
####	|---------------------------|
####		|---------------------------|

#### case 4
####	|---------------------------|
####								|---------------------------|

#### case 5
####	|---------------------------|
####								 |---------------------------|

#### Write one line per gene.chr key to the gene mapping file:
####   chr <tab> gene.chr <tab> start <tab> stop <tab> coding_length
#### Genes are ordered by gene id; the gene.chr key breaks ties so that the
#### same gene id seen on several chromosomes always comes out in the same
#### order instead of in hash order.
foreach my $key (sort { $genes{$a}->{gene} cmp $genes{$b}->{gene} || $a cmp $b } keys %genes) {
	my $coding_length = 0;
	my $start = -1;
	my $stop = -1;

	#### stop of the last exon that was counted. Counted exons never end
	#### before the previous one, so this is also the right-most base
	#### covered so far.
	my $prev_stop = -1;

	my @exons = @{ $genes{$key}->{exons} || [] };

	foreach my $feat (sort { $a->{start} <=> $b->{start} } @exons) {
		#### skip if current feature is completely inside previous feature.
		#### i.e case 1 or case 2
		#### |-------------------------|
		####     |--------------|
		next if (($prev_stop > $feat->{start}) && ($prev_stop >= $feat->{stop}));

		if ($prev_stop < $feat->{start}) {
			#### case 5: no overlap, the whole exon is new.
			$coding_length += (($feat->{stop} - $feat->{start}) + 1);
		} else {
			#### for case 3, and 4 since last base of previous transcript is already
			#### accounted for start from next base.
			$coding_length += ($feat->{stop} - $prev_stop);
		}

		#### gene start is the smallest exon start, gene stop the largest
		#### exon stop; -1 marks "not set yet".
		if (($start == -1) || ($feat->{start} < $start)) {
			$start = $feat->{start};
		}

		if (($stop == -1) || ($feat->{stop} > $stop)) {
			$stop = $feat->{stop};
		}

		$prev_stop = $feat->{stop};
	}

	print GMP join ("\t", $genes{$key}->{chr}, $key, $start, $stop, $coding_length) ."\n";
}

#### buffered write errors only show up at close time, so check them.
close(GMP) or die "Could not close file $options{output_dir}/$filename".".gene.mapping.txt: $!\n";


#### Point the tool info entries at the files written by this script, wait
#### for each of them to be present and non-empty, then export the tool info.
my $output_base = "$options{output_dir}/$filename";
my @produced = (
	[features     => ".mod.gtf"],
	[exon_bed     => ".exon.bed"],
	[gene_mapping => ".gene.mapping.txt"],
);

#### first register all three paths ...
foreach my $entry (@produced) {
	my ($tool_key, $suffix) = @{$entry};
	$config->{ToolInfo}->{$tool_key}->{value} = $output_base . $suffix;
}

#### ... then verify them in the same order.
foreach my $entry (@produced) {
	my $tool_key = $entry->[0];
	check_input($config->{ToolInfo}->{$tool_key}->{value});
}

$config->{ToolInfo}->export("$options{output_dir}/tool_info.txt");

exit();

#############################################################################
#### Verify that every required command-line option was supplied.
####   $options - hash reference holding the parsed command-line options
#### Returns nothing. When run_info or output_dir is undefined or empty, the
#### usage text is printed on STDERR through pod2usage together with a message
#### naming the missing option, and the script exits with status 2.
sub check_parameters {
	my ($options) = @_;

	my @required = qw(run_info output_dir);

	foreach my $key (@required) {
		#### read the hash that was passed in, and test for an actual
		#### value so that a literal "0" is not mistaken for "missing".
		my $value = $options->{$key};
		next if (defined $value && length $value);

		#### pod2usage exits the script because of -exitval.
		pod2usage({-exitval => 2,  -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
	}

	return;
}

#############################################################################
#### Wait until a file exists with a size greater than zero.
####   $path - file to look for
#### While the file is missing or empty the sub sleeps $LONG_WAIT seconds
#### between checks. On the first failed check only, $util->missingInput is
#### called with the run's email address, the step name and the file.
sub check_input {
	my ($path) = @_;

	my $notified = 0;

	until (-s $path) {
		unless ($notified) {
			#### raise the flag first so the notification goes out once.
			$notified = 1;

			$util->missingInput(
				$config->{RunInfo}->{email},
				"ModifyGTF",
				"ModifyGTF",
				"$path",
				$config->{RunInfo}->{tool},
			);
		}

		sleep($LONG_WAIT);
	}
}
