#!/bin/bash
#
# bior_annotate_blaster
#
# Breaks a VCF file into chunks of a given number of data (non-"#") lines and
# submits one grid job per chunk with qsub. Each job runs
# $BIOR_LITE_HOME/bin/_bior_annotate_blaster_chunker on one line range.
#
# Usage:
#   bior_annotate_blaster <VcfIn> <VcfOut> <ChunkSize> <YourEmail> [BiorAnnotateConfigFilePath]
#
# Environment:
#   BIOR_LITE_HOME  - root of the install that contains bin/_bior_annotate_blaster_chunker
#   qsub            - must be resolvable by the shell, otherwise the script stops
#
# Files written (relative to the current directory):
#   BiorAnnotateBlaster/                            - working directory handed to qsub (-wd)
#   BiorAnnotateBlaster/<basename VcfIn>_jobIds.txt - log of submitted commands and job ids
#   The chunker is given "<VcfOut>.part.<NNNN>" (4-digit, zero-padded) as its last argument.
#
# Exit status: 1 when usage is shown or qsub is missing.

# exit if any statement returns a non-zero exit code
# (for a pipeline only the status of its last command counts, since pipefail is not enabled)
set -e

### Show usage and stop when help was requested or a required argument is missing.
### Help has to be a whole argument (-h, -help or --help): a substring test on "$*"
### would also fire for ordinary values such as "sample-hg19.vcf" or "john-harris@mayo.edu".
### The first four arguments are required; the fifth (config file path) is optional.
showUsage=false
for arg in "$@"; do
  case "$arg" in
    -h | -help | --help) showUsage=true ;;
  esac
done
if [[ -z "$1" || -z "$2" || -z "$3" || -z "$4" ]]; then
  showUsage=true
fi

if [[ "$showUsage" == true ]]; then
  ### Quoted delimiter: the text is printed literally, nothing is expanded.
  ### Part names use the 4-digit zero-padded numbering that the submit loop generates.
  cat <<'EOF'
bior_annotate_blaster breaks up VCF files into chunks of a given size, and processes them over the open grid engine

Usage: bior_annotate_blaster  <VcfIn>  <VcfOut>  <ChunkSize>  <YourEmail>  [BiorAnnotateConfigFilePath]

Example: Say you have a vcf file with 2 million variants, called my.vcf, and want to split it into 500k line chunks to run on the grid:
bior_annotate_blaster  my.vcf  my.out.gz  500000  Last.First@mayo.edu  my.config

This will split up the 2 million variant my.vcf file into 4 chunks of 500k each, using the given config file with corresponding output:
  my.out.gz.part.0001
  my.out.gz.part.0002
  my.out.gz.part.0003
  my.out.gz.part.0004

EOF
  ### Usage always ends with a non-zero status, also for an explicit help request
  exit 1
fi

### Capture the command-line arguments.
### NOTE(review): "dir" and "annotConfigFile" are assigned here but not read anywhere else
### in the visible script - confirm whether the config path should be forwarded to the chunker.
dir=$PWD

### Resolve the input to an absolute, symlink-free path (quoted, so paths with spaces survive).
### Presumably needed because the grid jobs run with a different working directory (qsub -wd).
vcfIn=$(readlink -f -- "$1") || {
  echo "Could not resolve the input VCF path: $1" >&2
  exit 1
}
### Fail early with a clear message instead of silently counting 0 lines later on
if [[ ! -r "$vcfIn" ]]; then
  echo "Input VCF does not exist or is not readable: $vcfIn" >&2
  exit 1
fi

vcfOut=$2
chunksize=$3
email=$4
annotConfigFile=$5

### Make sure the shell can resolve "qsub" before doing any work.
### "type" returns non-zero for an unknown command; the "|| true" absorbs that status so
### exit-on-error does not end the script before the explanation below is printed.
qsubCmd=$(type qsub 2>/dev/null) || true

### Nothing was printed by "type" => qsub is unavailable: explain and stop
if [[ -z "$qsubCmd" ]]; then
  printf '%s\n' \
    "The qsub command is not present on this system." \
    "This script will not be able to run bior_annotate on the open grid engine." \
    "Stopping."
  exit 1
fi

### Pick the command that streams the VCF as plain text.
###   pickParser <file-description> <uname-output>
### Prints "cat" for text input, otherwise a gzip reader: "gzcat" on a Mac (uname "Darwin",
### where zcat demands .Z extensions) and "zcat" everywhere else.
### Compression is checked first because a gzip description can quote the original file
### name (e.g. was "context.vcf"), which would otherwise look like text.
### Any "... text" description counts as text, not only "ASCII text", so UTF-8 or
### ISO-8859 encoded VCFs are not piped through zcat.
pickParser() {
  local description=$1 osName=$2
  local decompressor=zcat
  if [[ "$osName" == "Darwin" ]]; then
    decompressor=gzcat
  fi

  local looksCompressed=false looksText=false
  if [[ "$description" == *gzip* || "$description" == *compress* ]]; then
    looksCompressed=true
  fi
  if [[ "$description" == *ASCII* || "$description" == *text* ]]; then
    looksText=true
  fi

  if [[ "$looksText" == true && "$looksCompressed" == false ]]; then
    echo "cat"
  else
    echo "$decompressor"
  fi
}

### -L dereferences symbolic links; -b drops the "name:" prefix so that the file's own
### path can never influence the match
fileType=$(file -b -L -- "$vcfIn")
parser=$(pickParser "$fileType" "$(uname)")

### Count the data lines (everything that does not start with "#") of a VCF.
###   countDataLines <reader-command> <vcf-path>
### Both arguments are quoted, so a path containing spaces is passed as one word.
### The arithmetic expansion strips the padding some "wc" implementations add.
countDataLines() {
  local reader=$1 vcfPath=$2 lineCount
  lineCount=$("$reader" "$vcfPath" | grep -v "^#" | wc -l)
  echo "$(( lineCount ))"
}

echo "Counting # of lines in vcf file..."
dataLinesInFile=$(countDataLines "$parser" "$vcfIn")
echo "    # lines: $dataLinesInFile"

### Working subdirectory "BiorAnnotateBlaster": the grid jobs are executed in it (it is
### passed to qsub as the working directory), so their output and error messages land there.
### Created only when it does not exist yet.
subdir=BiorAnnotateBlaster
[[ -d "$subdir" ]] || mkdir "$subdir"

### Start a fresh job-id log for this input file: <subdir>/<input file name>_jobIds.txt
### Everything is quoted so that an input path with spaces yields a single file name.
### The log begins with one empty line, exactly as before.
jobIdsFile="$subdir/$(basename -- "$vcfIn")_jobIds.txt"
echo "" > "$jobIdsFile"

### Remove chunk files left over from an earlier run with the same output name.
###   removeStaleChunks <directory> <output-name>
### Chunk names carry a zero-padded part number (<output-name>.part.0001, ...), so probing
### for one fixed name such as ".part.1" never finds them; match every ".part.*" instead.
### A glob is used rather than parsing "ls", which breaks on names containing whitespace.
### Prints the notice only when there is something to delete.
removeStaleChunks() {
  local chunkDir=$1 outName=$2
  local candidate
  local -a stale=()

  for candidate in "$chunkDir/$outName".part.*; do
    ### An unmatched glob stays as the literal pattern, so keep real files only
    if [[ -f "$candidate" ]]; then
      stale+=("$candidate")
    fi
  done

  if (( ${#stale[@]} > 0 )); then
    echo "Removing all previous file chunks by the same name..."
    rm -- "${stale[@]}"
  fi
}

removeStaleChunks "$subdir" "$vcfOut"

### Submit one grid job per chunk of <ChunkSize> data lines.
### Reads:  chunksize, dataLinesInFile, vcfIn, vcfOut, email, subdir, jobIdsFile, BIOR_LITE_HOME
### Writes: part (number of the next, not yet submitted, chunk - the summary uses it);
###         appends the job id and the submitted command of every chunk to $jobIdsFile
### Returns non-zero (which ends the script under "set -e") when the chunk size is not a
### positive integer or when qsub fails.
submitChunks() {
  ### A chunk size of 0, a negative or a non-numeric value would never advance "start",
  ### and the loop would submit jobs forever - refuse it up front.
  if ! [[ "$chunksize" =~ ^[0-9]+$ ]] || (( 10#$chunksize <= 0 )); then
    echo "ChunkSize must be a positive integer, but was: '$chunksize'" >&2
    return 1
  fi

  ### Force base 10 so that a value like "010" is not read as octal
  local linesPerChunk=$(( 10#$chunksize ))
  local totalLines=$(( dataLinesInFile ))
  local start=1 end partPadded partName gridJobId cmdText
  local -a gridCmd

  part=1
  while (( start <= totalLines )); do
    end=$(( start + linesPerChunk - 1 ))
    printf -v partPadded '%04d' "$part"
    partName="$vcfOut.part.$partPadded"

    ### Build the grid command as an array: every argument stays one word, even when a
    ### path or the output name contains whitespace
    gridCmd=(
      qsub -V -wd "$PWD/$subdir" -M "$email" -m besa -q 1-day
      -l h_vmem=5G -pe threaded 3
      "$BIOR_LITE_HOME/bin/_bior_annotate_blaster_chunker"
      "$vcfIn" "$start" "$end" "$partName"
    )

    ### Run the command and get the grid job Id back
    gridJobId=$("${gridCmd[@]}") || {
      echo "qsub failed while submitting part $part" >&2
      return 1
    }

    ### Dump the grid job Id and the (shell-quoted) command to the log file
    printf -v cmdText '%q ' "${gridCmd[@]}"
    {
      echo "----------------"
      echo "Grid jobId: $gridJobId"
      echo "    ${cmdText% }"
    } >> "$jobIdsFile"

    ### Move on to the next line range and part number
    start=$(( end + 1 ))
    part=$(( part + 1 ))
  done
}

submitChunks

### Closing summary: "part" is the number of the next chunk, so one less was submitted.
### Also points the user at the job log, qstat and the merge step.
jobCount=$(( part - 1 ))
printf '%s\n' \
  "$jobCount jobs submitted to grid engine in chunks of $chunksize lines" \
  "List of grid jobs launched: $jobIdsFile" \
  "Use 'qstat' to check jobs status." \
  "When all grid jobs are finished, use bior_merge to combine all parts into one file (parts are in subdirectory: $subdir)"
