#!/usr/bin/perl
use strict;
use warnings;
## Perl script takes in peak positions and finds genes with nearby TSS or TTS and reports their distance
## Peak BED file: chr	start	stop	rank or peak #
## Jared Evans 12/19/11
use Data::Dumper;

if (scalar(@ARGV) != 4)	
{
	die ( "USAGE: find_nearby_genes.pl [peaks BED file] [reflat file] [output file] [neighborhood distance]\n" );
}

open PEAKS, "$ARGV[0]" or die "opening file: $ARGV[0]";
open GENES, "$ARGV[1]" or die "opening file: $ARGV[1]";
open OUT, ">", "$ARGV[2]" or die "opening file: $ARGV[2]";

my $neighborhood_cutoff = $ARGV[3];

my %peaks = ();
## chr_start_stop -> [peakcenter, peakrank, chr, start, stop, peaklength]
my %transcripts = ();
## chr -> transcript_start_stop -> [transcript, gene, transcript start, transcript stop, TSS, TTS, strand]
my %peaks_and_transcripts = ();
## peakchr_start_stop -> transcript_start_stop -> [transcript, gene, chr, start, stop, strand, TSS, distupstream_TSS, distdownstream_TSS, TTS, distupstream_TTS, distdownstream_TTS]

# args: peak_key transcript gene chr start stop strand TSS TTS 
sub insert_transcript{
	my($peak,$transcript,$gene,$chr,$start,$stop,$strand,$TSS,$TTS,$dist,$index) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5],$_[6],$_[7],$_[8],$_[9],$_[10]);
	if(not $peaks_and_transcripts{$peak}{$transcript."_".$start."_".$stop}){
		#add transcript because it isn't associated with this peak yet
		$peaks_and_transcripts{$peak}{$transcript."_".$start."_".$stop} = [$transcript,$gene,$chr,$start,$stop,$strand,$TSS,"NA","NA",$TTS,"NA","NA"];
	}
	$peaks_and_transcripts{$peak}{$transcript."_".$start."_".$stop}[$index] = $dist;
	return 1;
}

# load peaks into hash
while (<PEAKS>){
	my $row = $_;
	chomp $row;
	my @line = split("\t",$row);
	my $peakcenter = (($line[2]-$line[1])/2)+$line[1];
	if($peakcenter =~ m/\.5/){
		$peakcenter += 0.5;
	}
	$peaks{"$line[0]_$line[1]_$line[2]"} = [$peakcenter,$line[3],$line[0],$line[1],$line[2],($line[2]-$line[1])];
	#load all peaks once in output hash
	$peaks_and_transcripts{"$line[0]_$line[1]_$line[2]"} = ();
}

# load transcripts into hash of hashes
while(<GENES>){
	my $row = $_;
	chomp $row;
	my @line = split("\t",$row);
	my $tss = 0;
	my $tts = 0;
	my $strand = 1;
	if($line[3] eq "+"){
		$tss = $line[4];
		$tts = $line[5];
	}else{
		$tss = $line[5];
		$tts = $line[4];
	}
	$transcripts{$line[2]}{"$line[1]_$line[4]_$line[5]"} = [$line[1],$line[0],$line[4],$line[5],$tss,$tts,$line[3]];
}


#print $transcripts{"chr13"}{"NM_008291"}[0]."\n";

foreach my $peak_line (keys %peaks){
	#my $peak_line = "chr1_5011760_5014166"; #upstream -
	#my $peak_line = "chr1_15786175_15786773"; #upstream +
	#my $peak_line = "chr1_13355099_13355685"; #downstream -
	#my $peak_line = "chr1_34076808_34079752"; #downstream +
	my @peak_info = @{$peaks{$peak_line}};
	foreach my $transcript (keys %{$transcripts{$peak_info[2]}}){
		my @transcript_info = @{$transcripts{$peak_info[2]}{$transcript}};
		## TSS:
		if(($peak_info[0]-$transcript_info[4]) <= $neighborhood_cutoff && ($peak_info[0]-$transcript_info[4]) >= 0){
			if($transcript_info[6] eq "+"){
				#print OUT "Peak ($peak_info[0]) Downstream of TSS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($peak_info[0]-$transcript_info[4]),8);
			}else{
				#print OUT "Peak ($peak_info[0]) Upstream of TSS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($peak_info[0]-$transcript_info[4]),7);
			}
		}elsif(($transcript_info[4]-$peak_info[0]) <= $neighborhood_cutoff && ($transcript_info[4]-$peak_info[0]) >= 0){
			if($transcript_info[6] eq "+"){
				#print OUT "Peak ($peak_info[0]) Upstream of TSS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($transcript_info[4]-$peak_info[0]),7);
			}else{
				#print OUT "Peak ($peak_info[0]) Downstream of TSS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($transcript_info[4]-$peak_info[0]),8);
			}
		}
		## TTS:
		if(($peak_info[0]-$transcript_info[5]) <= $neighborhood_cutoff && ($peak_info[0]-$transcript_info[5]) >= 0){
			if($transcript_info[6] eq "+"){
				#print OUT "Peak ($peak_info[0]) Downstream of TTS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($peak_info[0]-$transcript_info[5]),11);
			}else{
				#print OUT "Peak ($peak_info[0]) Upstream of TTS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($peak_info[0]-$transcript_info[5]),10);
			}
		}elsif(($transcript_info[5]-$peak_info[0]) <= $neighborhood_cutoff && ($transcript_info[5]-$peak_info[0]) >= 0){
			if($transcript_info[6] eq "+"){
				#print OUT "Peak ($peak_info[0]) Upstream of TTS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($transcript_info[5]-$peak_info[0]),10);
			}else{
				#print OUT "Peak ($peak_info[0]) Downstream of TTS: ".$transcript_info[0]."\t".$peak_info[2]."\t".$transcript_info[1]."\t".$transcript_info[2]."\t".$transcript_info[3]."\t".$transcript_info[4]."\t".$transcript_info[5]."\t".$transcript_info[6]."\n";
				&insert_transcript($peak_line,$transcript_info[0],$transcript_info[1],$peak_info[2],$transcript_info[2],$transcript_info[3],$transcript_info[6],$transcript_info[4],$transcript_info[5],($transcript_info[5]-$peak_info[0]),11);
			}
		}
	}
}

#print Dumper(%peaks_and_transcripts);

## print output header
#print OUT "IGV link\tChr\tPeak Start\tPeak Stop\tPeak\tPeak Length\tPeak Center\tTranscript\tGene\tChr\tTranscript Start\tTranscript Stop\tStrand\tTSS\tDistance of Peak Upstream TSS\tDistance of Peak Downstream TSS\tTTS\tDistance of Peak Upstream TTS\tDistance of Peak Downstream TTS\n";
print OUT "IGV link\tChr\tPeak Start\tPeak Stop\tPeak\tPeak Length\tPeak Center\tGene\tTranscript\tTranscript Start\tTranscript Stop\tStrand\tTSS\tDistance of Peak Upstream TSS\tDistance of Peak Downstream TSS\tTTS\tDistance of Peak Upstream TTS\tDistance of Peak Downstream TTS\n";

foreach my $peak (sort keys %peaks_and_transcripts){
	my @peak_info = @{$peaks{$peak}};
	if(keys %{$peaks_and_transcripts{$peak}}){
		foreach my $transcript (keys %{$peaks_and_transcripts{$peak}}){
			my @trans_info = @{$peaks_and_transcripts{$peak}{$transcript}};
			print OUT "=HYPERLINK(\"http://localhost:60151/goto?locus=".$peak_info[2].":".$peak_info[3]."-".$peak_info[4]."\",\"link\")\t";
			print OUT $peak_info[2]."\t".$peak_info[3]."\t".$peak_info[4]."\t".$peak_info[1]."\t".$peak_info[5]."\t".$peak_info[0]."\t";
			print OUT $trans_info[1]."\t".$trans_info[0]."\t".$trans_info[3]."\t".$trans_info[4]."\t".$trans_info[5]."\t".$trans_info[6]."\t".$trans_info[7]."\t".$trans_info[8]."\t".$trans_info[9]."\t".$trans_info[10]."\t".$trans_info[11]."\n";
		}
	}else{
		print OUT "=HYPERLINK(\"http://localhost:60151/goto?locus=".$peak_info[2].":".$peak_info[3]."-".$peak_info[4]."\",\"link\")\t";
		print OUT $peak_info[2]."\t".$peak_info[3]."\t".$peak_info[4]."\t".$peak_info[1]."\t".$peak_info[5]."\t".$peak_info[0]."\t";
		print OUT "NA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\tNA\n";
	}
}

close PEAKS;
close GENES;
close OUT;





