#!/usr/local/biotools/perl/5.14.2/bin/perl

=head1 NAME

split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files.

=head1 SYNOPSIS

USAGE: split_multifasta.pl
            --run_info=/path/to/run_info.txt
            --output_dir=/path/to/somedir
            --sample=sample_name
            [--seqs_per_file=1 | --total_files=1]

=head1 OPTIONS

B<--run_info,-r>
    The run_info text file

B<--output_dir,-o>
    The directory to which the output files will be written.

B<--total_files, -t>
    Used if the user wants to specify the total outputs files to be created. The script will
    determine the amount of sequences per file to meet this parameter. Cannot be used in conjunction
    with the seqs_per_file parameter.

B<--seqs_per_file,-e>
    Number of sequences per file.

B<--debug,-d>
    Debug level.  Use a large number to turn on verbose debugging.

B<--log,-l>
    Log file

B<--help,-h>
    This help message

=head1  DESCRIPTION

This script is used to split a single FASTA file containing multiple sequences into separate
files containing one sequence each.

=head1  INPUT

The input is a single multi-entry FASTA file whose location is derived from --output_dir and
--sample (and from --for_blast, which selects the alternate input); there is no separate option
for the input path.  File extensions are ignored.  When creating this multi-entry FASTA file,
one should take care to make the first *word* after the > symbol a unique value, as it will be
used as the file name for that sequence.
For example:

    >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt
    ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA
    AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA
    AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG

    >gi15387669 Corynebacterium callunae pCC1 plasmid
    ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC
    CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG
    GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG

Whitespace is ignored within the input file.  See the OUTPUT section for more on creation of
output files.

=head1  OUTPUT

The name of each output sequence file is pulled from the FASTA header of that sequence.  The
first *word* after the > symbol will be used as the file name, along with the extension .fsa.
The word is defined as all the text after the > symbol up to the first whitespace.

If the above example were your input file, two files would be created:

    gi53791237.fsa
    gi15387669.fsa

Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an
underscore.  This only occurs in the file name; the original FASTA header within the file
will be unmodified.

You can pass a path to the optional --output_list to create a text file containing the full paths
to each of the FASTA files created by this script.

Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used
on input sets that are too large to write out to one directory.  This depends on the limitations
of your file system, but you usually don't want 100,000 files written in the same directory.

If you have a FASTA file containing 95000 sequences, and use the following option:

    --output_dir=/some/path
    --output_subdir_size=30000

The following will be created:

    directory              file count
    ---------------------------------
    /some/path/1/          30000
    /some/path/2/          30000
    /some/path/3/          30000
    /some/path/4/           5000

If you choose to create a list file (and you probably want to), it will contain these proper paths.

You may not want the subdirectories to simply be numbers, as above, so you can use the
--output_subdir_prefix option.  For example:

    --output_dir=/some/path
    --output_subdir_size=30000
    --output_subdir_prefix=fasta

The following will be created:

    directory              file count
    ---------------------------------
    /some/path/fasta1/     30000
    /some/path/fasta2/     30000
    /some/path/fasta3/     30000
    /some/path/fasta4/      5000

Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which
can be used along with --output_subdir_size and --output_subdir_prefix.  The main difference to note
is that, if you use --seqs_per_file, the FASTA file created will no longer be named using values
taken from the header, since it will contain multiple headers.  Instead, the file will simply be
named using sequential numbers starting at 1 (like 1.fsa).  For example:

    --output_dir=/some/path
    --output_subdir_size=3000
    --output_subdir_prefix=fasta
    --seqs_per_file=10

The following will be created:

    directory              file count
    ---------------------------------
    /some/path/fasta1/     3000
    /some/path/fasta2/     3000
    /some/path/fasta3/     3000
    /some/path/fasta4/      500

=head1  CONTACT

    Jaysheel Bhavsar
    bjaysheel@gmail.com

=cut

use lib "/data2/bsi/reference/perl_workflow_ref/lib";
use lib "/data2/bsi/reference/perl_workflow_ref/lib/perl5/x86_64-linux/auto";
use strict;
use warnings;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use Pod::Usage;
use POSIX;
use ParseConfig;
use MyUtility;
use Workflow::Logger;

#### Parse the command line into %options.
#### Getopt::Long is configured with pass_through, so unrecognised arguments
#### are left in @ARGV rather than reported as errors.
#### --debug also accepts the short form -d, as documented in the POD.
my %options = ();
my $results = GetOptions (\%options,
                          'run_info|r=s',
                          'output_dir|o=s',
                          'sample|s=s',
                          'for_blast|b=s',
                          'seqs_per_file|e=s',
                          'total_files|t=s',
                          'log|l=s',
                          'debug|d=s',
                          'help|h') || pod2usage();

#### display the full documentation and stop when --help is given
if ( $options{'help'} ) {
    pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
}

#### Fail fast on missing required arguments.  The config object and the log
#### file name below are built from --run_info and --sample, so checking them
#### only afterwards would create a log file with an empty sample name.
foreach my $required_arg (qw(run_info output_dir sample)) {
    next if defined $options{$required_arg};
    pod2usage({-exitval => 2, -message => "ARG: $required_arg is required", -verbose => 1, -output => \*STDERR});
}

#### create hash of all config info.
my $config = ParseConfig->new($options{run_info});
my $util = MyUtility->new;

#### setup log object
#### NOTE(review): the constructor's return value is immediately replaced by
#### get_logger(); presumably the constructor configures the shared logger
#### that get_logger() hands back -- confirm against Workflow::Logger.
my $logger = Workflow::Logger->new('LOG_FILE'=>"$config->{RunInfo}->{logs}/SplitMultiFASTA.$options{sample}.log",
                                   'LOG_LEVEL'=>$options{'debug'});
$logger = Workflow::Logger::get_logger();

$logger->info("Split multi FASTA for $options{sample} started");

#### make sure everything passed was peachy (also fills in option defaults)
check_parameters(\%options);

#### state shared with the subs below
my $first = 1;      # true until the first FASTA header has been seen
my $seq = '';       # sequence lines collected for the current record
my $header;         # header text (without the leading '>') of the current record
my $this;           # hash ref holding the resolved input file and output directory
my $cmd;


#### set input and output based on whether the data is used for pfam or blast
$this->{input} = "$options{output_dir}/lincRNA/emboss/$options{sample}/$options{sample}.proteinFasta.fa";
$this->{output_dir} = "$options{output_dir}/lincRNA/pfam/$options{sample}";

if ($options{for_blast}) {
	$this->{input} = "$options{output_dir}/lincRNA/scripture/$options{sample}/$options{sample}.NonProtein.candidates.PFam.fa";
	$this->{output_dir} = "$options{output_dir}/lincRNA/scripture/$options{sample}/blast";
}

#### stop (and report) if the input is missing or empty, then make sure the
#### output directory exists
check_input($this->{input});
create_dir_struct(\%options);

#### open the multi-FASTA input selected above
my $sfh;
open ($sfh, "<", $this->{input}) || $logger->logdie("can't open sequence file:\n$!");

#### if we instead want to split sequences into a requested number of files,
#### seqs_per_file is recomputed from the number of sequences in the input
if ( defined($options{total_files}) && $options{total_files} ne "" ) {
    $options{seqs_per_file} = set_seqs_per_total_files(\$sfh, $options{total_files});
}

#### keep track of how many sequences are in the current output file
my $seqs_in_file = 0;

#### sequential number used as the file name when a file holds several sequences
my $group_filename_prefix = 1;

#### holds the output file handle
my $ofh;

#### holds the total number of files created
my $total_files_created = 0;

while (my $line = <$sfh>) {

    #### if we find a header line ...
    if (my ($next_header) = $line =~ /^\>(.*)/) {

        ## write the previous sequence before continuing with this one
        unless ($first) {
            chomp $seq;
            writeSequence(\$header, \$seq);
        }

        ## start a fresh record
        $seq = '';
        $first = 0;
        $header = $next_header;

    ## else we've found a sequence line
    } else {
        ## anything before the first header belongs to no record; ignore it
        ## instead of gluing it onto the first sequence
        next if ($first);

        ## skip it if it is just whitespace
        next if ($line =~ /^\s*$/);

        ## record this portion of the sequence
        $seq .= $line;
    }
}

## don't forget the last sequence; chomp it like every other record so the
## final file does not end with an extra blank line
if (defined( $header )) {
    chomp $seq;
    writeSequence(\$header, \$seq);
}

close($sfh);

## flush the last output file (if it is still open) so that buffered write
## errors are reported instead of being lost at exit
if (defined($ofh) && defined(fileno($ofh))) {
    close($ofh) || $logger->logdie("can't close output file:\n$!");
}

exit;

#############################################################################
#### Validate the parsed command-line options and fill in defaults.
####
#### Arguments:
####   $options - reference to the options hash filled by GetOptions; it is
####              checked and updated in place.
####
#### Behaviour:
####   * prints usage and exits with status 2 when run_info, output_dir or
####     sample is missing;
####   * calls $logger->logdie when seqs_per_file and total_files are both
####     given, or when total_files is less than one;
####   * defaults: debug = 3, seqs_per_file = 1, for_blast = 0;
####   * total_files falls back to the max_fsa_split value from the config,
####     but only when the caller did not ask for an explicit seqs_per_file
####     (the two settings are mutually exclusive).
sub check_parameters {
    my $options = shift;

    foreach my $key (qw(run_info output_dir sample)) {
        next if defined $options->{$key};

        ## pod2usage exits by itself (status 2), nothing to do afterwards
        pod2usage({-exitval => 2, -message => "ARG: $key is required", -verbose => 1, -output => \*STDERR});
    }

    #### remember whether seqs_per_file came from the caller before it is defaulted
    my $user_set_seqs_per_file = defined $options->{seqs_per_file};

    #### if the total_files parameter is being used then we want to double check that
    #### the seqs_per_file parameter is not being used.
    if (defined $options->{total_files} && $user_set_seqs_per_file) {
        $logger->logdie("The seqs_per_file parameter and total_files parameter cannot be used in conjunction");
    }

    #### We also want to make sure that total files is not 0
    if (defined $options->{total_files} && $options->{total_files} < 1) {
        $logger->logdie("total_files setting cannot be less than one");
    }

    $options->{debug} = 3 unless (defined $options->{debug});
    $options->{seqs_per_file} = 1 unless ($user_set_seqs_per_file);

    #### an explicit seqs_per_file must not be silently replaced by the
    #### configured total_files default
    unless ($options->{total_files} || $user_set_seqs_per_file) {
        $options->{total_files} = $config->{ToolInfo}->{max_fsa_split}->{value};
    }

    $options->{for_blast} = 0 unless (defined $options->{for_blast});
}

#############################################################################
#### Make sure the output directory ($this->{output_dir}) exists, creating it
#### together with any missing parent directories.
####
#### Arguments:
####   $options - reference to the options hash (accepted but not used here)
####
#### The directory is created with File::Path::make_path instead of a shell
#### "mkdir -p" so that spaces or shell metacharacters in the path cannot
#### break or alter the command.  On failure the error is reported through
#### $logger->logdie and the script exits with status 100.
sub create_dir_struct {
	my $options = shift;

	require File::Path;

	my $dir = "$this->{output_dir}";
	if ( -d $dir ) {
		$logger->info("Directory $dir exist");
		return;
	}

	$logger->info("mkdir -p $dir");

	#### collect errors instead of letting make_path raise them itself
	my $errors;
	File::Path::make_path($dir, { error => \$errors });

	if ($errors && @{$errors}) {
		#### each entry is a one-pair hash: path (or '' for a general error) => message
		my @reasons = map {
			my ($path, $message) = %{$_};
			$path eq '' ? $message : "$path: $message";
		} @{$errors};

		$logger->logdie("ERROR: Failed to create directory $dir. Exiting execution of workflow\n" . join("\n", @reasons));
		exit(100);
	}
}

#############################################################################
#### Append one FASTA record to the current output file, opening a new file
#### when the previous one is complete.
####
#### Arguments:
####   $header - reference to the header text (without the leading '>')
####   $seq    - reference to the sequence text
####
#### File-level state used: %options (seqs_per_file), $this->{output_dir},
#### $ofh, $seqs_in_file, $group_filename_prefix, $total_files_created.
####
#### With seqs_per_file == 1 the file is named after the first word of the
#### header; otherwise it is named after the running group number.  Failures
#### to open, write or close an output file are reported via $logger->logdie.
sub writeSequence {
    my ($header, $seq) = @_;

    #### the id used to write the output file will be the first thing
    ####  in the header up to the first whitespace.  get that.
    my ($id) = $$header =~ /^(\S+)/;
    $logger->logdie( "can't pull out an id on header $$header" ) unless (defined $id);

    #### because it is going to be the filename, we're going to take out the characters that are bad form to use
    #### legal characters = a-z A-Z 0-9 - . _
    $id =~ s/[^a-z0-9\-_.]/_/gi;

    #### if we're writing more than one sequence to a file, change the id from
    ####  fasta header to the current group file name
    if ($options{seqs_per_file} > 1) {
        $id = $group_filename_prefix;
    }

    #### build the output path inside the output directory
    my $filepath = "$this->{output_dir}/$id.fsa";

    #### take any // out of the filepath
    $filepath =~ s|/+|/|g;

    $logger->debug("Writing sequence to $filepath") if ($logger->is_debug());

    #### open a new output file if we need to
    ####  if we're writing multiple sequences per file, we only open a new
    ####  one when $seqs_in_file = 0 (first sequence)
    if ($seqs_in_file == 0) {
        open ($ofh, ">", $filepath) || $logger->logdie("can't create '$filepath':\n$!");

        $total_files_created++;
    }

    ## write the sequence, reporting write failures instead of ignoring them
    print {$ofh} ">$$header\n$$seq\n" or $logger->logdie("can't write to '$filepath':\n$!");
    $seqs_in_file++;

    ## if we hit the limit of how many we want in each file, close the finished
    ##  file (buffered write errors surface here), set the next file name and
    ##  reset the count of seqs within the file
    if ($options{seqs_per_file} == $seqs_in_file) {
        close($ofh) || $logger->logdie("can't close '$filepath':\n$!");

        $seqs_in_file = 0;
        $group_filename_prefix++;
    }
}

#############################################################################
#### Work out how many sequences must go into each output file so that the
#### input is split into at most the requested number of files.
####
#### Arguments:
####   $_[0] - reference to an open, seekable filehandle on the FASTA input
####   $_[1] - requested total number of output files
####
#### Returns the number of sequences per file (at least 1).  The filehandle is
#### rewound to the start before returning.
####
#### Sequences are counted as lines starting with '>', the same test the main
#### loop uses to recognise a header.  The quotient is rounded up so that the
#### number of files produced never exceeds the request.
sub set_seqs_per_total_files {
    my $fh = ${$_[0]};
    my $tot_files = $_[1];

    my $seq_count = 0;
    while (my $line = <$fh>) {
        $seq_count++ if ($line =~ /^>/);
    }

    ## reset filehandle so the caller can read the file from the beginning
    seek $fh,0,0;

    ## nothing to distribute: avoid dividing by zero below
    return 1 if ($seq_count == 0);

    ## need a quick check here to make sure that the number of files wanted
    ## is not greater than the number of sequences in the file
    if ($tot_files > $seq_count) {
        my $requested = $tot_files;
        $tot_files = $seq_count;
        $logger->warn("total_files $requested is greater than the number of sequences $seq_count.  Rest total_files to $seq_count");
    }

    ## calculate how many sequences we should have per file to meet the total_files parameter request.
    return POSIX::ceil($seq_count / $tot_files);
}

#############################################################################
#### Run a shell command, logging it first, and abort the workflow when it
#### does not complete successfully.
####
#### Arguments:
####   $cmd - the command line, passed as a single string to system()
####
#### Any non-zero wait status counts as failure: a non-zero exit code, death
#### by signal, or failure to start the command at all.  On failure the error
#### is reported through $logger->logdie and the script exits with status 100.
sub execute_cmd {
	my $cmd = shift;

	$logger->info("$cmd");
	my $status = system($cmd);

	#### the whole wait status is checked, not just the exit code in the high
	#### byte, so commands killed by a signal are not mistaken for success
	if ($status != 0) {
		$logger->logdie("ERROR: Following command failed to execute. Exiting execution of workflow\n$cmd");
		exit(100);
	}
}

#############################################################################
#### Verify that an expected input file exists and is not empty.
####
#### Arguments:
####   $file - path of the file to check
####
#### When the file is missing or has zero size, an error file is written via
#### $util->createErrorFile, the problem is reported via $util->reportErrorSGE
#### (together with the JOB_ID, SGE_STDERR_PATH and SGE_STDOUT_PATH
#### environment values, where set) and the script exits with status 100.
sub check_input {
	my $file = shift;

	#### nothing to report when the file is present and non-empty
	return if (-s $file);

	#### scheduler details for the report, with fallbacks when run outside SGE
	my $job_id = defined $ENV{JOB_ID}          ? $ENV{JOB_ID}          : 0;
	my $sgeerr = defined $ENV{SGE_STDERR_PATH} ? $ENV{SGE_STDERR_PATH} : "";
	my $sgeout = defined $ENV{SGE_STDOUT_PATH} ? $ENV{SGE_STDOUT_PATH} : "";

	my $error_name = "$config->{RunInfo}->{base_output_dir}/$config->{RunInfo}->{pi}/$config->{RunInfo}->{type}/$config->{RunInfo}->{output_folder}/error/SplitMultiFASTA.$options{sample}.err";

	$util->createErrorFile($error_name, "EXPECTED FILE WHILE RUNNING Split Multi FASTA STEP IS MISSING\n\n$file");

	my @report_args = (
		$config->{RunInfo}->{email},
		$file,
		"Split multi FASTA",
		$error_name,
		$job_id,
		$sgeerr,
		$sgeout,
	);
	$util->reportErrorSGE(@report_args);

	exit(100);
}
