#!/usr/bin/perl
use strict;
use Getopt::Long;
# ---------------------------------------------------------------------------
# Defaults for every command-line option and for the settings derived from
# them.  Each variable is declared exactly once (an earlier revision declared
# $hidden twice, which masks the first declaration).
# ---------------------------------------------------------------------------
my $hidden='';                     # --hidden: also request/merge ".binprob" dumps
my $snppicker = "SNPPicker2.jar";  # jar handed to "java -jar"

my $fsnp ="snps_all_sources.txt";                       # per-directory SNP table (-s / --snpfile)
my $bestgenesource_file = "gene_snps_all_sources.txt";  # per-directory gene table read with --mayobest

my $r2 = 0.9;                      # --r2 (accepted; not referenced elsewhere in this file)
my $tool = "ldselect";             # --tool: format of the bin files
my $mapfile = "rsid_pos_map.txt";  # --posmap (accepted; not referenced elsewhere in this file)
my %OKtools =("ldselect"=>1,"tagzilla" =>1,"mayo"=>1,"tagger"=>1);  # valid --tool values
my $infinium = "";                 # becomes " -infinium2 " when --infinium is given
my $infinium_switch='';            # --infinium flag, default false
my $computeUtilityOnly = '';       # --computeutilityonly flag
my $computeUtilityOnly_option="";  # SNPPicker switch derived from the flag above
my $excludefile = "";              # -x: exclusion file forwarded to SNPPicker
my $obligatefile = "";             # --obligate: obligate file forwarded as -f
my $minscore="";                   # --minscore: forwarded as -minscore when given
my $pop="CEU";                     # NOTE(review): the directory loop declares its own $pop; this default looks unused
my $default_score = -1;            # --default_score: forwarded as -defaultscore
my $tooClose = 60;                 # -X: forwarded as -X (reset to 0 by --infinium)
my $tooclose_option ="";           # SNPPicker switch derived from $tooClose

my @illuminaFiles = ();#("/sharedData/sicotteh/snppicker/Test/rider/kullo_example/Kullo_redo_uniq_withids_nonulls_illuminaFile_PS.csv");

my $report_name ="kullo_report";   # --reportname: output directory and output file prefix
my $help='';                       # --help flag
my $result;                        # return value of GetOptions

my $onesnpperbin='';               # --onesnpperbin flag (mutually exclusive with --rules)
my $onesnpperbin_option="";        # SNPPicker switch derived from the flag above
my $nOPA=1;                        # --nOPA: forwarded as -nOPA (forced to -1 by --infinium)
my $onerun_switch=''; # if true, do not run SNPPicker on each chromosome (assumes that we have annotation for all SNPs)

my @populations=();                # one --pop per --dir, in the same order
my @directories=();                # one --dir per --pop
#my @sources=("Hapmap","Niehs","Seattle");
my $mayobest = ''; # flag on whether to read the gene_snps_all_sources.txt format file with the best source for each gene.


# The input file structure must be
#  directory_for_one_pop/source/binfile

# There can be multiple directories for the same pop, e.g. when one makes one
# directory per chromosome.

my $rules = '';                        # --rules: binning rules forwarded as -r
my $fileidmap = 'fileid2entrez.txt';   # --fileidmap: name of the file-id map written in the report directory
my $minprob =1.0;                      # --minprob: forwarded as -minP
my $external_obligates="";             # --obligatesnotinpanel: forwarded under the same name
my $force_external_obligates_flag="";  # --obligatesnotinpaneldone flag
my $oblig_utility="";                  # --obligateutilityonly flag
my $nrandom=0;                         # --random: forwarded as -randommulti -nR when > 0
my $binfile = "ldselect.out";          # --binfile: file name expected below each gene key
my $maxCPU="";                         # --cpu: forwarded as -cpu when given

# ---------------------------------------------------------------------------
# Command-line parsing.
#
# "-x" (exclude file) and "-X" (proximity limit) differ only by case.  With
# Getopt::Long's default ignore_case setting both names are folded to "x" and
# the later "X=i" specification replaces "x=s", so the exclude file could not
# be supplied.  Option names are therefore matched case-sensitively; "nopa"
# is kept as an alias so the lower-case spelling of --nOPA keeps working.
# ---------------------------------------------------------------------------
Getopt::Long::Configure("no_ignore_case");

$result = GetOptions(
    "help"                    => \$help,
    "s=s"                     => \$fsnp,
    "snpfile=s"               => \$fsnp,
    "r2=f"                    => \$r2,
    "tool=s"                  => \$tool,       # was specified twice; once is enough
    "ps=s"                    => \@illuminaFiles,
    "reportname=s"            => \$report_name,
    "binfile=s"               => \$binfile,
    "posmap=s"                => \$mapfile,
    "infinium"                => \$infinium_switch,
    "x=s"                     => \$excludefile,
    "obligate=s"              => \$obligatefile,
    "computeutilityonly"      => \$computeUtilityOnly,
    "onesnpperbin"            => \$onesnpperbin,
    "rules=s"                 => \$rules,
    "nOPA|nopa=i"             => \$nOPA,
    "pop=s"                   => \@populations,
    "dir=s"                   => \@directories,
    "mayobest"                => \$mayobest,
    "minscore=s"              => \$minscore,
    "minprob=s"               => \$minprob,
    "all_chrom_one_run"       => \$onerun_switch,
    "fileidmap=s"             => \$fileidmap,  # takes a file name; without "=s" it was a bare flag
    "default_score=s"         => \$default_score,
    "obligatesnotinpanel=s"   => \$external_obligates,
    "obligatesnotinpaneldone" => \$force_external_obligates_flag,
    "obligateutilityonly"     => \$oblig_utility,
    "random=i"                => \$nrandom,
    "hidden"                  => \$hidden,
    "X=i"                     => \$tooClose,
    "cpu=i"                   => \$maxCPU,
);
die "Error processing input parameters" unless $result;

# --help: show the usage text and stop without reporting a failure.
if ($help) {
    usage();
    exit 0;
}

# Only the bin-file formats listed in %OKtools are accepted.
die "invalid -tools option ($tool), must be one of " . join(",", sort keys %OKtools) . "\n"
    unless exists $OKtools{$tool};
#die "sources other than ldselect not yet supported" unless $tool eq "ldselect";



# ---------------------------------------------------------------------------
# Translate the parsed options into the switch fragments that are later
# concatenated into the SNPPicker command line, and validate combinations.
# Every fragment is either empty or padded with spaces on both sides.
# ---------------------------------------------------------------------------
print $mayobest ? "Using only best Sources\n" : "Using all sources\n";

# --cpu is forwarded only when a value was supplied.
my $cpuoption = "";
if (length($maxCPU) != 0) {
    $cpuoption = " -cpu $maxCPU ";
}

# --infinium overrides any user-supplied --nOPA and -X values.
if ($infinium_switch) {
    $infinium = " -infinium2 ";
    $nOPA     = -1;
    $tooClose = 0;
}
$tooclose_option = " -X $tooClose ";

# Chromosome-by-chromosome processing is the default; --all_chrom_one_run
# switches to a single SNPPicker invocation.
my $chromByChrom = $onerun_switch ? '' : 1;

# Exclusion and obligate files can each be one whole file for all chromosomes.
my $exclude_option  = $excludefile  ? " -x $excludefile "  : "";
my $obligate_option = $obligatefile ? " -f $obligatefile " : "";

# Test for "was a value given" rather than truthiness, so that an explicit
# "--minscore 0" is forwarded instead of being silently dropped.
my $minscore_option = "";
if (length($minscore) != 0) {
    $minscore_option = " -minscore $minscore ";
}

if ($computeUtilityOnly) {
    $computeUtilityOnly_option = " -computeutilityonly ";
}

# --onesnpperbin is shorthand for a fixed binning rule, so it cannot be
# combined with user-supplied --rules.
die "cannot both specify --onesnpperbin and --rules " if ($onesnpperbin && $rules);
my $binning_rules = '';
if ($onesnpperbin) {
    $onesnpperbin_option = " -r \"1-Inf=1\" ";
}
elsif ($rules) {
    $binning_rules = "-r \"$rules\" ";
}

# Populations and directories are paired by position.
die "The number of --pop must be equal to the number of --dir" unless $#populations == $#directories;
die "Need at least one input directory (can be \.)" if $#populations<0;

# The default bin file name only makes sense for ldselect output.
die "must specify a --binfile= option if change --tool =mayo|tagzilla||tagger" unless ( $tool eq "ldselect" || (!($binfile eq "ldselect.out")));

my $default_score_option = " -defaultscore $default_score ";

# Obligates that are not part of the panel; the "done" flag is forwarded only
# together with the file itself.
my $ext_oblig_option = "";
if ($external_obligates) {
    $ext_oblig_option = " -obligatesnotinpanel $external_obligates ";
    $ext_oblig_option .= "-obligatesnotinpaneldone " if $force_external_obligates_flag;
}

my $oblig_util_option = $oblig_utility ? " -obligateutilityonly " : "";

my $randomswitch = "";
if ($nrandom > 0) {
    $randomswitch = " -randommulti -nR $nrandom ";
}

# ---------------------------------------------------------------------------
# Index every gene found in the input directories.
#
# All three lookup tables are keyed by "$dir/$source/$geneid":
#   %GENESRC     key => HUGO symbol (column 0 of the table)
#   %CHRSRC      key => chromosome
#   %GENESRCPOP  key => population paired with $dir on the command line
# %GENES maps a gene id to the list of its keys; it is filled only when
# --mayobest is off.
# ---------------------------------------------------------------------------
my %GENESRC;
my %CHRSRC;
my %GENESRCPOP;
my %GENES;

for my $idir (0 .. $#directories) {
    my $dir = $directories[$idir];
    my $pop = $populations[$idir];

    if ($mayobest) {
        # Only the best source per gene, taken from the gene-level table
        # (columns used: 0 HUGO, 1 gene id, 5 chromosome, 9 best source).
        my $genefile = $dir . "/" . $bestgenesource_file;
        open(my $gene_fh, '<', $genefile)
            or die "unable to open GENE file $genefile\n";
        my $skipped_header = <$gene_fh>;    # header line is not used
        while (my $row = <$gene_fh>) {
            chomp $row;
            next if $row =~ /^\s*$/;        # blank lines would create junk keys
            my @col = split(/\t/, $row);
            my ($hugo, $geneid, $chr, $source) = @col[0, 1, 5, 9];
            next if defined $source && $source eq "None";    # gene without a usable source
            $source = '' unless defined $source;
            my $key = $dir . "/" . $source . "/" . $geneid;
            $GENESRC{$key}    = $hugo;
            $CHRSRC{$key}     = $chr;
            $GENESRCPOP{$key} = $pop;
        }
        close $gene_fh;
    }
    else {
        # All sources, taken from the SNP-level table
        # (columns used: 0 HUGO, 1 gene id, 2 source, 4 chromosome).
        my $snpfile = $dir . "/" . $fsnp;
        open(my $snp_fh, '<', $snpfile)
            or die "unable to open SNP file $snpfile\n";
        my $skipped_header = <$snp_fh>;     # header line is not used
        while (my $row = <$snp_fh>) {
            chomp $row;
            next if $row =~ /^\s*$/;
            my @col = split(/\t/, $row);
            my ($hugo, $geneid, $source, $chr) = @col[0, 1, 2, 4];
            my $key = $dir . "/" . $source . "/" . $geneid;
            $GENESRC{$key}    = $hugo;
            $CHRSRC{$key}     = $chr;
            $GENESRCPOP{$key} = $pop;
            push @{ $GENES{$geneid} }, $key;    # autovivifies the list on first use
        }
        close $snp_fh;
    }

    if ($tool eq "mayo") {
        # Have to create gene-centric mayo format files: the rows of the SNP
        # table are split into one "$dir/$source/$geneid/$binfile" per gene,
        # each starting with a copy of the table's header line.
        my $snpfile = $dir . "/" . $fsnp;
        open(my $snp_fh, '<', $snpfile)
            or die "unable to open SNP file $snpfile\n";

        my $headerline = <$snp_fh>;
        $headerline = '' unless defined $headerline;
        chomp $headerline;

        my %OUTFILES;        # gene files already created during this pass
        my $oldfile = "";    # file currently open for writing
        my $out_fh;
        while (my $row = <$snp_fh>) {
            chomp $row;
            next if $row =~ /^\s*$/;
            my @col = split(/\t/, $row);
            my ($geneid, $source) = @col[1, 2];
            my $foutgene = $dir . "/" . $source . "/" . $geneid . "/$binfile";

            if ($oldfile ne $foutgene) {
                if ($out_fh) {
                    close($out_fh) or die "error closing $oldfile: $!\n";
                }
                # Truncate on first visit, append on later visits, so rows of
                # a gene that are not contiguous in the table are not lost.
                my $seen = $OUTFILES{$foutgene}++;
                undef $out_fh;
                open($out_fh, ($seen ? '>>' : '>'), $foutgene)
                    or die "unable to write gene file $foutgene: $!\n";
                # The header belongs in the gene file (it used to be printed
                # to STDOUT, leaving the generated files without a header).
                print {$out_fh} "$headerline\n" unless $seen;
                $oldfile = $foutgene;
            }
            print {$out_fh} "$row\n";
        }
        close $snp_fh;
        if ($out_fh) {
            close($out_fh) or die "error closing $oldfile: $!\n";
        }
    }
}

# ---------------------------------------------------------------------------
# Prepare the report directory, the merged output files and the base
# SNPPicker command line (used as-is, plus input files, for a single run).
# ---------------------------------------------------------------------------

# Input-format switch placed in front of every bin file; ldselect's switch is
# also the fallback.
my %switch_for_tool = (
    ldselect => " -il ",
    tagzilla => " -iz ",
    tagger   => " -it ",
    mayo     => " -im ",
);
my $fswitch = exists $switch_for_tool{$tool} ? $switch_for_tool{$tool} : " -il ";

# to fully support mayo format, one would have to read the top level file per source
# .. e.g. Hapmap/snps.txt .. and create a new input file with only the genes that are considered "best" source.

my @genes = keys %GENESRC;

# The report directory holds the per-run outputs and the file-id map.
if (!-d $report_name) {
    mkdir($report_name)
        or die "unable to create report directory $report_name: $!\n";
}

my @chrs = (1 .. 22, "X", "Y");
#my @chrs = ("13","14","15","16","17","18","19","20","21","22","X","Y");

my $cat     = "";    # accumulates report file names
my $header  = "";    # header line of the merged report, once known
my $bheader = "";    # header line of the merged binprob file, once known

# Merged outputs live next to (not inside) the report directory.  Failing to
# open them would otherwise discard every merged line without any message.
open(FFINAL, '>', $report_name . ".txt")
    or die "unable to write $report_name" . ".txt: $!\n";
if ($hidden) {
    open(BFINAL, '>', $report_name . ".binprob")
        or die "unable to write $report_name" . ".binprob: $!\n";
}

my %ADDOBLIG;        # obligates without chromosome, keyed by name
my $report = "$report_name" ."/" . "$report_name" . "_allchrom";

my $hidden_option = "";
if ($hidden) {
    $hidden_option = " -dumpBinProb $report_name" . ".binprob ";
}

# Base command; the per-gene input files are appended later.
my $cmd = "java -jar $snppicker $infinium $tooclose_option -s " . join(" -s ",@illuminaFiles) . " -o $report" . ".out -e $report" . ".err "
                        . $exclude_option . $obligate_option . $computeUtilityOnly_option . $onesnpperbin_option . $minscore_option .
                        $binning_rules . " $cpuoption -nOPA $nOPA -minP $minprob $default_score_option $ext_oblig_option $oblig_util_option $randomswitch $hidden_option";

# Running number written to the file-id map, one per input bin file.
my $fileid = 0;
open(FIDM, '>', "$report_name/$fileidmap")
    or die "unable to write $report_name/$fileidmap: $!\n";
my $nfiles = 0;      # number of input files handed to a single-run command
# ---------------------------------------------------------------------------
# Walk the chromosomes.  For each one, collect the bin files of its genes and
# record them in the file-id map; then either
#   * (chromosome-by-chromosome mode) build that chromosome's command line
#     and merge its report into the final files, or
#   * (single-run mode) append the input files to the one shared command.
# ---------------------------------------------------------------------------
for my $tchr (@chrs) {
    my @chrfiles;    # "-p POP <format switch> FILE" fragments for this chromosome
    for my $gene_key (@genes) {
        my $chr = $CHRSRC{$gene_key};
        next unless defined $chr && $chr eq $tchr;

        my $infile = $gene_key . "/$binfile";
        # Keys are "$dir/$source/$geneid"; the gene id is the last component
        # however many path components $dir itself has (a fixed index of 3
        # was only right for directories with exactly two components).
        my ($geneid) = $gene_key =~ m{([^/]*)\z};
        print FIDM $fileid . "\t" . $geneid . "\n";
        $fileid++;
        push(@chrfiles, " -p " . $GENESRCPOP{$gene_key} . " " . $fswitch . $infile);
    }

    # for ldselect format, snppicker will assume the snp identifiers are chromosomal positions and
    # match them from the annotation file (illumina *_PS.csv file).
    # A chromosome without genes is not skipped: there might be obligates.

    # NOTE(review): this records the report name of the *previous* iteration
    # and $cat is not read anywhere in this file; kept unchanged.
    $cat .= " " . $report . ".out ";

    if (!$chromByChrom) {
        # Single run: just grow the shared command line and count its inputs
        # across all chromosomes (not only the last one).
        $cmd .= (" " . join(" ", @chrfiles));
        $nfiles += scalar(@chrfiles);
        next;
    }

    $report = "$report_name" ."/" . "$report_name" . "_chr$tchr";
    if ($hidden) {
        $hidden_option = " -dumpBinProb $report" . ".binprob ";
    }
    $cmd = "java -jar $snppicker -chr $tchr $infinium  $tooclose_option -s " . join(" -s ",@illuminaFiles) . " -o $report" . ".out -e $report" . ".err "
        . $exclude_option . $obligate_option . $computeUtilityOnly_option . $onesnpperbin_option . $minscore_option .
        $binning_rules . " $cpuoption -nOPA $nOPA -minP $minprob $default_score_option $ext_oblig_option $oblig_util_option $hidden_option $randomswitch " . join(" ",@chrfiles) . " > $report". ".log";
    print "$cmd\n";

    # NOTE(review): the command is only printed; execution is disabled here,
    # so the reports merged below must have been produced separately.
    #	`$cmd`;

    my $freport = "$report" . ".out";
    my $breport = "$report" . ".binprob";

    # Merge the results from many runs but only keep the result for an obligate
    # when the obligate's chromosome is processed (e.g. so the proximity constraints can act).
    if (!-e $freport) {
        # A missing report is an error only if there was something to run on.
        print "ERROR: SNPPicker failed on chromosome $tchr\n" if @chrfiles;
        next;
    }

    if (open(my $rep_fh, '<', $freport)) {
        my $first = <$rep_fh>;
        if (defined $first) {
            # Merge multiple files, but only keep one header: the first
            # non-empty report supplies it.
            if (length($header) == 0) {
                $header = $first;
                print FFINAL "$header";
            }
            my $nlines = 0;
            while (my $row = <$rep_fh>) {
                chomp $row;
                $nlines++;
                my @col = split(/\t/, $row);
                my ($ochr, $oblig_name, $note) = @col[4, 6, 17];
                if (defined $note && $note =~ /obligate/) {
                    if (defined $ochr && $ochr eq $tchr) {
                        print FFINAL "$row\n";
                    }
                    elsif (!defined $ochr || length($ochr) == 0) {
                        # No chromosome: remember it, it is appended once at the end.
                        $oblig_name = '' unless defined $oblig_name;
                        $ADDOBLIG{$oblig_name} = $row;
                    }
                    # Obligates of other chromosomes are dropped here; they are
                    # kept when their own chromosome is processed.
                }
                else {
                    print FFINAL "$row\n";
                }
            }
            if (@chrfiles && $nlines == 0) {
                print "ERROR: SNPPicker produced empty file with only header $tchr\n";
            }
        }
        elsif (@chrfiles) {
            print "ERROR: SNPPicker produced empty file on $tchr\n";
        }
        close $rep_fh;
    }
    else {
        print "ERROR: unable to read $freport: $!\n";
    }

    # Same merge for the bin-probability dump; its header is tracked on its
    # own so a chromosome without a dump cannot suppress it for good.
    if ($hidden && open(my $bin_fh, '<', $breport)) {
        my $bfirst = <$bin_fh>;
        if (defined $bfirst) {
            if (length($bheader) == 0) {
                $bheader = $bfirst;
                print BFINAL "$bheader";
            }
            while (my $row = <$bin_fh>) {
                chomp $row;
                print BFINAL "$row\n";
            }
        }
        close $bin_fh;
    }
} # for
# ---------------------------------------------------------------------------
# Finish up: close the file-id map, execute the single-run command when that
# mode is active, append obligates without a chromosome, close the outputs.
# ---------------------------------------------------------------------------
close FIDM;

if (!$chromByChrom) {
    $cmd .= " > $report" . ".log";
    print "$cmd\n";

    # The command redirects its own stdout, so nothing needs capturing;
    # system() also exposes the exit status, which is reported on failure.
    my $status = system($cmd);
    if ($status != 0) {
        print "ERROR: SNPPicker command failed (wait status $status)\n";
    }

    # Sanity-check the report only when input files were handed over.
    my $freport = "$report" . ".out";
    if ($nfiles > 0) {
        if (!-e $freport) {
            print "ERROR: SNPPicker failed\n";
        }
        elsif (open(my $rep_fh, '<', $freport)) {
            my $first_line  = <$rep_fh>;
            my $second_line = defined $first_line ? <$rep_fh> : undef;
            if (!defined $first_line) {
                print "ERROR: SNPPicker produced empty file $freport \n";
            }
            elsif (!defined $second_line) {
                print "ERROR: SNPPicker produced empty file $freport with only header\n";
            }
            close $rep_fh;
        }
        else {
            print "ERROR: SNPPicker produced empty file $freport \n";
        }
    }
}

# Add obligates without a chromosome assignment, in name order so repeated
# runs produce the same file.
for my $oblig_name (sort keys %ADDOBLIG) {
    print FFINAL $ADDOBLIG{$oblig_name} . "\n";
}

# Buffered write errors only surface at close, so report them.
close(FFINAL) or warn "error closing $report_name" . ".txt: $!\n";

if ($hidden) {
    close(BFINAL) or warn "error closing $report_name" . ".binprob: $!\n";
}

 


# Print the placeholder help text to the currently selected output handle.
# Returns whatever print returns.
sub usage() {
    my $placeholder = "under construction, RTFC\n";
    return print $placeholder;
}
