#!/usr/bin/perl

=head1 NAME
   add_unique_exon_id.pl

=head1 SYNOPSIS
    USAGE: add_unique_exon_id.pl -i=reference_file.gtf -o=/some/path/to/file_prefix

=head1 OPTIONS

B<--input, -i>
	Reference file to be modified.  File must be in GTF format.

B<--output_prefix, -o>
	Full path to output location plus prefix
	/some/path/to/file_prefix

B<--help,-h>


=head1  DESCRIPTION
	Create a new GTF file with prefix OUTPUT_PREFIX that contains
	a unique id for each exon.

	Create a new EXON bed file containing only
		chr# start stop unique_exon_id
	Records are written in the same order as the input, so this file is
	sorted by chr#, start and stop values only if the input GTF is.


=head1  INPUT
	Reference file in GTF format


=head1  OUTPUT
	Update reference file by adding unique id to each exon and exon bed file

=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./add_unique_exon_id.pl -i=reference.gtf -o=/some/path/to/file_prefix

=cut

use strict;
use warnings;
use Data::Dumper;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);

## Parse the command line.  'log' and 'debug' are still accepted so existing
## invocations keep working, but nothing below reads them.
my %options = ();
my $results = GetOptions (\%options,
                          'input|i=s',
                          'output_prefix|o=s',
                          'log|l=s',
                          'debug=s',
                          'help|h') || pod2usage();

## display documentation
if( $options{'help'} ){
    pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
}

#############################################################################
## make sure everything passed was peachy
check_parameters(\%options);

## Both output files are derived from the same prefix.
my $gtf = $options{output_prefix} . ".gtf";
my $exon = $options{output_prefix} . ".Exon.bed";

## Lexical handles instead of bareword globals; $! tells the user why an
## open failed (missing file, permissions, ...).
open (my $in_fh, "<", $options{input}) or die "Could not open file $options{input}: $!\n";
open (my $gtf_fh, ">", $gtf) or die "Could not open file $gtf: $!\n";
open (my $bed_fh, ">", $exon) or die "Could not open file $exon: $!\n";

## Number of exons seen so far for each "<gene_id>_<transcript_id>" key.
my %exon_count;

while (my $line = <$in_fh>) {
	chomp $line;

	my @data = split(/\t/, $line);

	## only adding ids to exon features.
	## Comment lines, blank lines and records with fewer than nine columns
	## have no usable feature/attribute column, so they are copied through
	## untouched instead of being read as undefined values.
	if (@data < 9 || $data[2] !~ /exon/) {
		print {$gtf_fh} $line . "\n";
		next;
	}

	## could have - or _ in the id better to deal with each
	## individually rather than creating a global regex
	my %id_for = (gene_id => "", transcript_id => "");

	#### assuming order of gene_id and transcript_id is unknown.
	foreach my $attribute (split(/;/, $data[8])) {
		## The key must be followed by whitespace so that look-alike keys
		## (anything merely starting with gene_id/transcript_id) are ignored.
		next unless ($attribute =~ /^\s*(gene_id|transcript_id)\s+(.*)$/);

		my ($key, $value) = ($1, $2);
		$value =~ s/\"//g;
		$value =~ s/\s+//g;
		$id_for{$key} = $value;
	}

	my $id = $id_for{gene_id} . "_" . $id_for{transcript_id};

	## Exons are numbered from 1 per gene/transcript pair, in input order.
	my $exon_id = $id . "_E" . ++$exon_count{$id};

	## Terminate the existing attribute list before appending, otherwise the
	## new exon_id would run into the previous attribute.
	(my $attributes = $data[8]) =~ s/\s+$//;
	$attributes .= ";" if ($attributes =~ /[^;]$/);
	$data[8] = $attributes . " exon_id \"" . $exon_id . "\";";

	print {$gtf_fh} join ("\t", @data) ."\n";

	## NOTE(review): start/stop are copied verbatim from GTF columns 4 and 5.
	## If consumers of this file expect strict BED (0-based start) coordinates
	## the start would need adjusting - confirm before changing.
	print {$bed_fh} join ("\t", $data[0], $data[3], $data[4], $exon_id) . "\n";
}

close($in_fh);

## Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($gtf_fh) or die "Could not close file $gtf: $!\n";
close($bed_fh) or die "Could not close file $exon: $!\n";

#### Usually the original GTF file is sorted by chr, start and stop so there is
#### no need to sort: records are written back in the order they were read.
#system("sort $exon > ${exon}.tmp");
#system("mv ${exon}.tmp $exon");

exit();

#############################################################################
## Verify that every mandatory command-line option was supplied.
##
## Args:    $options - reference to the option hash filled in by GetOptions.
## Returns: nothing when all required options are present.
## Exits:   through pod2usage with status 2 on the first missing option,
##          after printing the option name and the usage text to STDERR.
sub check_parameters {
	my $options = shift;

	my @required = qw(input output_prefix);

	foreach my $key (@required) {
		## Look at the hash that was passed in, not at a file-level variable,
		## and accept any non-empty value (a plain truth test rejects "0").
		next if (defined $options->{$key} && length $options->{$key});

		## pod2usage exits by itself, so no separate exit() call is needed.
		pod2usage({-exitval => 2,  -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
	}

	return;
}
