#!/usr/local/biotools/perl/5.14.2/bin/perl

=head1 NAME
   extractNonProteinReads.pl

=head1 SYNOPSIS

    USAGE: extractNonProteinReads.pl -f=original.fasta -t=transcript_list -o=output.fasta

=head1 OPTIONS

B<--fasta, -f>
	Required. Original input fasta file that was passed into PFam

B<--transcript, -t>
	Required. Modified PFam output in which runs of spaces are converted to tabs and the header is removed.

B<--output, -o>
	Required.  Output fasta file

B<--type, -y>
	Optional.  Type of the input result file (-t option).  Valid values are pfam or blast;
	this determines which column of the file holds the read ids.  Defaults to pfam.

B<--help,-h>


=head1 DESCRIPTION
	Extract reads from the input fasta that are not present in the PFam output.

=head1 INPUT
	Fasta file passed to PFam, and modified PFam output.

=head1 OUTPUT

=head1 VERSION
	1.0

=head1  CONTACT
  bjaysheel@gmail.com


=head1 EXAMPLE
	./extractNonProteinReads.pl -f=original.fasta -t=pfam_output.txt -o=output.fasta

=cut

#use lib "/usr/local/biotools/perl/5.10.0/lib/site_perl/5.10.0";
use strict;
use warnings;
use Data::Dumper;
use Cwd;
use Pod::Usage;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use Bio::SeqIO;

#### parse command line options; values are stored in %options under the
#### first (primary) name of each option spec.
#### 'output' is accepted as an alias of 'output_dir' so that the documented
#### --output flag works; the value is still stored under 'output_dir'.
my %options = ();
my $results = GetOptions (\%options,
						  'fasta|f=s',
						  'transcript|t=s',
						  'output_dir|output|o=s',
						  'type|y=s',
						  'help|h') || pod2usage();

#### display documentation
if( $options{'help'} ){
    pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
}

#############################################################################
#### make sure everything passed was peachy
#### (also fills in the default for 'type')
check_parameters(\%options);

my $seq_in = Bio::SeqIO->new( -file   => $options{fasta},
						   -format => 'fasta' );

#### the -o value is stored under the key 'output_dir' and is used as the
#### path of the output fasta file.
my $seq_out = Bio::SeqIO->new( -file   => ">$options{output_dir}",
						   -format => 'fasta' );

#### ids of reads that appear in the result file (i.e. verified proteins)
my %is_verified_protein;

#### read in modified pfam output or tab delimited blast output
#### and create a hash of verified protein ids.
open(my $txt_fh, "<", $options{transcript})
	or die "Could not open file to read $options{transcript}: $!";

#### read id in tab delimited blast output is first column or 0;
#### for pfam the column at index 3 is used.
my $colm = 0;

if ($options{type} =~ /^pfam$/i) {
	$colm = 3;
}

while (my $line = <$txt_fh>) {
	#### comments and column headers are indicated by a line starting with pound(#)
	#### skip any line that starts with pound(#)
	next if ($line =~ /^#/);

	#### strip the line ending, including a carriage return if present
	$line =~ s/\r?\n\z//;
	my @info = split(/\t/, $line);

	#### skip blank lines and lines too short to contain the id column,
	#### otherwise an undefined value would be used as a hash key
	next unless (defined $info[$colm] && length $info[$colm]);

	$is_verified_protein{$info[$colm]} = 1;
}

close($txt_fh);

#### read in fasta file and output reads if they are not verified as proteins
while (my $seq = $seq_in->next_seq) {
	next if (exists $is_verified_protein{$seq->id});

	$seq_out->write_seq($seq);
}

exit(0);

#############################################################################
#### Validate the parsed command line options and fill in defaults.
####
#### Takes a reference to the options hash. Every key in the required list
#### (fasta, transcript, output_dir) must be defined; if any are missing, each
#### one is reported on STDERR and pod2usage() prints the usage text and
#### exits with status 2. On success 'debug' defaults to 3 and 'type'
#### defaults to "pfam" in the hash that was passed in.
sub check_parameters {
	my ($options) = @_;

	my @required = qw(fasta transcript output_dir);

	#### work through the reference that was passed in rather than relying
	#### on a file-level %options being visible from inside the sub
	my @missing = grep { !defined $options->{$_} } @required;

	if (@missing) {
		my $message = join("\n", map { "ARG: $_ is required" } @missing);

		#### pod2usage exits with -exitval, so nothing after it would run
		pod2usage({-exitval => 2, -message => $message, -verbose => 1, -output => \*STDERR});
	}

	$options->{'debug'} = 3 unless (defined $options->{'debug'});
	$options->{'type'} = "pfam" unless (defined $options->{'type'});

	return;
}
