#!/bin/bash

### For debugging:
#set -x
#set -v

# exit if any statement returns a non-zero exit code
set -e

### Print the usage/help text to stdout.
### Called when the user passes --help / -h, or when the required arguments are missing.
### The text is a quoted here-document, so every line is emitted literally (no expansion).
function usage
{
    cat <<'USAGE_TEXT'
bior_annotate_blaster breaks up VCF files into chunks of a given size, and processes them over the open grid engine using bior_annotate

Output will be TJSON (or VCF if appropriate flag used), compressed by default

Usage: bior_annotate_blaster  <VcfIn>  <VcfOut>  <ChunkSize>  <YourEmail>  [BiorAnnotateConfigFilePath]  [--vcfOut]  [--statusFile  statusFile]  [--logFile logFile]  [--tempDir  tempDirectory]  [--noCleanup]  [--emailOnlyOnAbort]  [--noZip]  [--help]

  <VcfIn>     : The input VCF file, which can be plaintext or gzip (Ex: my.vcf.gz)
  <VcfOut>    : The output file (TJSON by default, VCF with --vcfOut), gzip'd by default (Ex: my.out.vcf.gz)
  <ChunkSize> : The number of lines each portion of the input file will be broken into to be processed independently
  <YourEmail> : The email address to send notifications when the command is complete
  [BiorAnnotateConfigFile] : OPTIONAL file containing the columns that bior_annotate should output.  See bior_annotate for more info.
  [--vcfOut]   : Output should be in VCF format instead of TJSON
  [--statusFile statusFile] : The status file to write to, which will give the total number of lines written for each part and for the whole file.  Format will be similar to that output from bior_annotate
  [--logFile logFile]  : The log file in which to write error and warning messages
  [--tempDir tempDir]  : Temporary directory to write the output files.  If not specified, this will assume 'BiorAnnotateBlaster' under the current directory
  [--noCleanup]        : Don't cleanup the partial chunk files.  This is useful for large files where a portion of the file fails, which allows you to just re-run that portion, or if you want to see where errors occurred
  [--emailOnlyOnAbort] : Only send emails when the job aborts, NOT when the job completes successfully
  [--noZip]            : Do not gzip the output (by default, all output is gzip'd)
  [--help] or [-h]     : Print usage and exit

Example:
  Say you have a vcf file with 2 million variants, called my.vcf,
  and want to split it into 500k line chunks to run on the grid:
    bior_annotate_blaster  my.vcf  my.out.gz  500000  Last.First@mayo.edu  my.config

  This will split up the 2 million variant my.vcf file into 4 chunks of 500k each,
  using the given config file, with the chunk output written to the temp directory as:
    my.out.gz.part.0001
    my.out.gz.part.0002
    my.out.gz.part.0003
    my.out.gz.part.0004

Example:
  Process my.vcf with 2 million variants, split into 100k line chunks, output to VCF format, and get status and log file while writing to temp dir /tmp/myBior/
    bior_annotate_blaster  my.vcf  my.out.gz  100000  Last.First@mayo.edu  --statusFile my.status  --logFile my.log  --tempDir /tmp/myBior  --vcfOut

USAGE_TEXT
}

### ---- Default settings; each may be overridden while parsing the command line ----

### Directory the script was launched from
dir=$PWD

### Required positional arguments, plus the optional bior_annotate config file
vcfIn=
vcfOut=
chunksize=
email=
annotConfigFile=

### Status and log output are discarded unless --statusFile / --logFile are given
statusFile=/dev/null
logFile=/dev/null

### Directory that holds the chunk files (overridden by --tempDir)
subdir=BiorAnnotateBlaster

### Switches flipped by --noCleanup, --vcfOut and --noZip respectively
isCleanup=true
isVcfOut=false
isZipOutput=true

### Value passed to "qsub -m" for the final grid job.
### The --emailOnlyOnAbort flag will change gridNotification to "a"
gridNotification=esa

# Resolve all of the parameters from the command line.
#
# Flags may appear anywhere on the command line.  Every word that is not a flag (or a flag's
# value) is taken positionally, in order, as:
#   <VcfIn>  <VcfOut>  <ChunkSize>  <YourEmail>  [BiorAnnotateConfigFile]
# Positional words beyond the fifth are ignored.
#
# Sets: vcfIn, vcfOut, chunksize, email, annotConfigFile, statusFile, logFile, subdir,
#       isCleanup, isVcfOut, isZipOutput, gridNotification
# Exits: 0 after printing usage for --help / -h
#        1 for an unrecognized "--" flag, or for an option that is missing its value
function parseArgs
{
  # 1-based position of the next positional argument
  local idx=1

  # Loop on the argument COUNT rather than on "$1" being non-empty, so that an empty
  # argument (e.g. "" given for the config file) does not silently end parsing early
  while [ $# -gt 0 ] ; do
    case "$1" in
      --help | -h )
        usage
        exit 0
        ;;
      --statusFile | --logFile | --tempDir )
        # These options consume the next word as their value.  Without this check, the
        # second "shift" would fail under "set -e" and the script would die with no message
        if [ $# -lt 2 ] ; then
          echo "ERROR: Option $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --statusFile ) statusFile="$2" ;;
          --logFile )    logFile="$2" ;;
          --tempDir )    subdir="$2" ;;
        esac
        shift 2
        ;;
      --noCleanup )
        isCleanup="false"
        shift
        ;;
      --vcfOut )
        # Output should be VCF
        isVcfOut="true"
        shift
        ;;
      --emailOnlyOnAbort )
        # Only email when the final grid call is aborted
        gridNotification="a"
        shift
        ;;
      --noZip )
        # Do NOT zip the output
        isZipOutput="false"
        shift
        ;;
      --* )
        echo "Unrecognized flag or option: $1"
        exit 1
        ;;
      * )
        case $idx in
          1 ) vcfIn="$1" ;;
          2 ) vcfOut="$1" ;;
          3 ) chunksize="$1" ;;
          4 ) email="$1" ;;
          5 ) annotConfigFile="$1" ;;
        esac

        #echo "Value $idx = $1"
        idx=$((idx + 1))
        shift
        ;;
    esac
  done
}
parseArgs "$@"


# If the four required arguments are not given, then show usage and exit
# NOTE: logFile still holds the value given on the command line (or /dev/null) at this point
if [[ -z "$vcfIn" || -z "$vcfOut" || -z "$chunksize" || -z "$email" ]] ; then
  echo ""  | tee -a "$logFile"
  echo "ERROR: Incorrect number of arguments" | tee -a "$logFile"
  echo "" | tee -a "$logFile"
  usage
  exit 1
fi

# The chunk size drives the submission loop (start = end + 1, end = start + chunksize - 1).
# A value of 0, a negative number, or a non-numeric word (which bash arithmetic evaluates
# as 0) would never advance "start", submitting grid jobs forever - so reject it up front.
if [[ ! "$chunksize" =~ ^[0-9]+$ ]] || (( 10#$chunksize < 1 )) ; then
  echo "ERROR: <ChunkSize> must be a positive integer, but was: $chunksize" | tee -a "$logFile"
  exit 1
fi
# Force base 10 so a value with leading zeros (Ex: 0500) is not later read as octal
chunksize=$((10#$chunksize))

# Without the input file there is nothing to split; stop before any grid job is submitted
if [[ ! -f "$vcfIn" || ! -r "$vcfIn" ]] ; then
  echo "ERROR: Input VCF file does not exist or is not readable: $vcfIn" | tee -a "$logFile"
  exit 1
fi


### Create the temp directory if it does not already exist, then resolve it to full path
if [ ! -d "$subdir" ]; then
  mkdir -p "$subdir"
fi
subdir=$(readlink -f "$subdir")

### Now, resolve every other path to its full path.
### NOTE: This must be done AFTER the temp directory is created!
### NOTE: readlink fails on blank or bad directories (leaving the variable empty), so each call
###       is followed by "|| true" to keep "set -e" from exiting this script
requestedLogFile="$logFile"
requestedStatusFile="$statusFile"
vcfIn=$(readlink -f "$vcfIn") || true
vcfOut=$(readlink -f "$vcfOut") || true
statusFile=$(readlink -f "$statusFile") || true
logFile=$(readlink -f "$logFile") || true
annotConfigFile=$(readlink -f "$annotConfigFile") || true

### The log file is appended to (">> $logFile") from here on, and an EMPTY path makes every one of
### those redirections fail.  So fall back to /dev/null right away if a path could not be resolved.
if [ -z "$logFile" ] ; then
  if [ -n "$requestedLogFile" ] ; then
    echo "WARNING: Could not resolve log file path '$requestedLogFile'; log messages will be discarded" >&2
  fi
  logFile=/dev/null
fi
if [ -z "$statusFile" ] ; then
  if [ -n "$requestedStatusFile" ] ; then
    echo "WARNING: Could not resolve status file path '$requestedStatusFile'; status output will be discarded" >&2
  fi
  statusFile=/dev/null
fi


### For debugging purposes:
### Print the resolved script parameters to stdout and append the same lines to the log file.
### Reads the global parameter variables; takes no arguments.
function printVars
{
  # Group the lines so a single tee writes them all (instead of one tee process per line).
  # An empty logFile falls back to /dev/null so the listing is still printed to stdout.
  {
    echo "-------------"
    echo "bior_annotate_blaster parameters"
    echo "-------------"
    echo "dir             = $dir"
    echo "vcfIn           = $vcfIn"
    echo "vcfOut          = $vcfOut"
    echo "chunksize       = $chunksize"
    echo "email           = $email"
    echo "annotConfigFile = $annotConfigFile"
    echo "statusFile      = $statusFile"
    echo "logFile         = $logFile"
    echo "isVcfOut        = $isVcfOut"
    echo "isCleanup       = $isCleanup"
    echo "subdir          = $subdir"
    echo "gridNotification= $gridNotification"
    echo "isZipOutput     = $isZipOutput"
  } | tee -a "${logFile:-/dev/null}"
}
###printVars


### Look up the qsub command.  "command -v" returns non-zero when the command is missing, so
### "|| true" keeps "set -e" from killing this script before the friendlier message below is shown
qsubCmd=$(command -v qsub 2>/dev/null) || true
### If qsub was not found, then show message and exit
if [ -z "$qsubCmd" ] ; then
  echo "The qsub command is not present on this system." | tee -a "$logFile"
  echo "This script will not be able to run bior_annotate on the open grid engine." | tee -a "$logFile"
  echo "Stopping." | tee -a "$logFile"
  exit 1
fi

### Choose the command that streams a VCF file as plain text.
###   $1 : the file description as printed by "file -b"  (Ex: "ASCII text")
###   $2 : OPTIONAL operating system name; defaults to the output of uname
### Prints one of: cat, zcat, gzcat
###   - Compressed descriptions are checked FIRST, because a gzip description can embed the original
###     file name (Ex: was "mytext.vcf") which must not be mistaken for a text file
###   - Any text description (ASCII, UTF-8, ISO-8859, ...) is read with cat
###   - Anything else is treated as compressed, as before
###   - If uname is "Darwin", then we're on a Mac, so use gzcat instead of zcat (which demands .Z extensions)
function parserForFileType
{
  local description="$1"
  local osName="${2:-$(uname)}"

  case "$description" in
    *gzip* | *"compress'd"* )
      # Compressed: fall through to the zcat/gzcat choice below
      ;;
    *ASCII* | *text* )
      echo "cat"
      return 0
      ;;
  esac

  if [[ "$osName" == "Darwin" ]] ; then
    echo "gzcat"
  else
    echo "zcat"
  fi
}

### NOTE: Make sure to use the -L flag to dereference any symbolic links
### NOTE: The -b flag leaves the file name out of the description, so that a path which happens to
###       contain "ASCII" or "text" cannot influence the choice of parser
fileType=$(file -L -b "$vcfIn")
parser=$(parserForFileType "$fileType")

### echo "Counting # of lines in vcf file..." | tee -a $logFile
### Count the data (non-header) lines.  grep -c exits non-zero when the count is 0,
### hence the "|| true" so that "set -e" does not stop the script in that case
dataLinesInFile=$("$parser" "$vcfIn" | grep -c -v "^#" || true)
### echo "    # lines: $dataLinesInFile" | tee -a $logFile

### Clear out the jobIds.txt file
jobIdsFile="$subdir/$(basename "$vcfOut")_jobIds.txt"
echo "" > "$jobIdsFile"

### Remove all old files with the same prefix
###   $1 : directory holding the chunk files
###   $2 : chunk file prefix (the base name of the output file)
### Every regular file named "<dir>/<prefix>.part.*" is removed (chunk files AND their .status files).
### Does nothing, and prints nothing, if there are no files with the chunk names.
function removeOldChunks
{
  local chunkDir="$1"
  local chunkPrefix="$2"
  local oldChunk
  local isFirst="true"

  # Glob directly instead of parsing "ls", so names containing whitespace are handled.
  # An unmatched glob stays as the literal pattern, which the -f test below skips.
  for oldChunk in "$chunkDir/$chunkPrefix.part."* ; do
    [ -f "$oldChunk" ] || continue
    if [ "$isFirst" == "true" ] ; then
      echo "Removing all previous file chunks by the same name..." | tee -a "${logFile:-/dev/null}"
      isFirst="false"
    fi
    rm -f -- "$oldChunk"
  done
}

### NOTE: The chunk files are named after the BASE NAME of the output file (vcfOut itself has
###       already been resolved to a full path), and their part numbers are zero-padded
removeOldChunks "$subdir" "$(basename "$vcfOut")"



### while there are chunks to operate on:
###   Put a placeholder status file in place for the chunk
###   Submit the chunker for that range of data lines using a grid call
###   Record the grid job id
###
### Reads: vcfIn, vcfOut, chunksize, dataLinesInFile, subdir, email, annotConfigFile,
###        logFile, jobIdsFile, BIOR_LITE_HOME
### Sets:  partsPrefix  - base name of vcfOut, the prefix of every chunk file
###        gridJobIds   - array of grid job ids, indexed by chunk number (starting at 1)
### Exits with 1 if there are lines to process but chunksize is not a positive integer.
function submitChunks
{
  local start=1
  local end
  local part=1
  local chunk=0
  local totalLines
  local partPadded partName partStatus numLines
  local gridCmd gridJobResponse gridJobId
  local -a gridArgs

  partsPrefix=$(basename "$vcfOut")
  gridJobIds=()

  # Evaluate once as a number (this also drops any padding that "wc -l" may have added)
  totalLines=$((dataLinesInFile))

  if (( totalLines > 0 )) ; then
    # The loop below only terminates if every chunk covers at least one line
    if [[ ! "$chunksize" =~ ^[0-9]+$ ]] || (( 10#$chunksize < 1 )) ; then
      echo "ERROR: <ChunkSize> must be a positive integer, but was: $chunksize" | tee -a "${logFile:-/dev/null}"
      exit 1
    fi
    # Force base 10 so that a leading zero is not read as octal
    chunk=$((10#$chunksize))
  fi

  while (( start <= totalLines )) ; do
    ### The last chunk may be shorter than the rest
    end=$((start + chunk - 1))
    if (( end > totalLines )) ; then
      end=$totalLines
    fi

    ### Construct the grid command
    printf -v partPadded "%04d" "$part"
    partName="$subdir/$partsPrefix.part.$partPadded"
    ### echo "---------------" | tee -a $logFile
    echo "---------------" >> "$logFile"

    ### Put "isSuccessful=false" into the status file initially so that we know if the command failed,
    ###  didn't complete, or didn't run at all as the status file will be there to check against
    ###  (rather than missing and not accounted for in the failure tally)
    partStatus="$partName.status"
    echo "### NOTE: FILE IS STILL PROCESSING!  This should be overwritten once the current chunk is done annotating!" > "$partStatus"
    echo "isSuccessful=false" >> "$partStatus"

    ### echo "Grid call to chunker:  <subdir>  _bior_annotate_blaster_chunker  <vcfIn>  <start>  <end>  <partName>  <annotConfigFile>" | tee -a $logFile
    echo "Grid call actual:      $subdir   $BIOR_LITE_HOME/bin/_bior_annotate_blaster_chunker  $vcfIn  $start  $end  $partName  $annotConfigFile" >> "$logFile"

    ### gridCmd is only the human-readable form written to the jobIds file.
    ### The command actually run is the gridArgs array, so that paths containing whitespace
    ### reach qsub as single arguments instead of being split apart.
    gridCmd="qsub -V -wd $subdir  -M $email -m a  -q 1-day -l h_vmem=5G -pe threaded 3 $BIOR_LITE_HOME/bin/_bior_annotate_blaster_chunker  $vcfIn  $start  $end  $partName  $annotConfigFile"
    gridArgs=(qsub -V -wd "$subdir" -M "$email" -m a -q 1-day -l h_vmem=5G -pe threaded 3
              "$BIOR_LITE_HOME/bin/_bior_annotate_blaster_chunker" "$vcfIn" "$start" "$end" "$partName")
    ### The config file is optional - pass it only when one was given
    if [ -n "$annotConfigFile" ] ; then
      gridArgs+=("$annotConfigFile")
    fi

    ### Run the command and get the grid job Id back
    ### The response looks like:  "Your job 1457064 (\"_bior_annotate_blaster_chunker\") has been submitted",
    ### so strip off the text around the id:
    gridJobResponse=$("${gridArgs[@]}")
    ### (the response is deliberately left unquoted so it is flattened onto a single line)
    gridJobId=$(echo $gridJobResponse | sed -e "s/Your job //" -e "s/ (.*//")

    ### Add the gridJobId to an array - this will be used by the validator when all grid calls are finished.
    gridJobIds[$part]=$gridJobId

    ### Dump the gridId, start, end, filePart  - for use by the verification and cleanup script
    ### Dump the command and grid job Id to a text file
    numLines=$((end - start + 1))
    echo "----------------" >> "$jobIdsFile"
    echo "GridId: $gridJobId, TotalLinesInOriginalFile: $dataLinesInFile, NumLinesThisChunk: $numLines, StartLine: $start, EndLine: $end, FilePart: $partName" >> "$jobIdsFile"
    echo "    $gridCmd" >> "$jobIdsFile"
    echo "" >> "$jobIdsFile"

    ### Move on to the next chunk and file part number
    start=$((end + 1))
    part=$((part + 1))
  done
}
submitChunks

## _bior_annotate_blaster_concat is handed both the logFile and the statusFile, so make sure
## neither one is blank: any that is unset or empty is pointed at /dev/null instead
: "${logFile:=/dev/null}"
: "${statusFile:=/dev/null}"

### Wait for jobs to finish
### Submits _bior_annotate_blaster_concat as a grid job that is held (-hold_jid) until every
### chunk job recorded in gridJobIds is done.
### Reads: gridJobIds, email, gridNotification, subdir, partsPrefix, vcfOut, isVcfOut, isCleanup,
###        logFile, statusFile, isZipOutput, BIOR_LITE_HOME
### Sets:  csvGridIds - the chunk grid job Ids joined by commas
function submitConcatJob
{
  ### Join the grid job Ids with commas (IFS is changed only inside the command substitution)
  csvGridIds=$(IFS=, ; echo "${gridJobIds[*]}")

  ### Log the command exactly as it is run below, INCLUDING the trailing isZipOutput argument
  echo "--" >> "$logFile"
  echo "Command to run: qsub -hold_jid $csvGridIds -V -M $email -m $gridNotification  -wd $subdir  $BIOR_LITE_HOME/bin/_bior_annotate_blaster_concat  $subdir   $partsPrefix  $vcfOut  $isVcfOut  $isCleanup  $logFile  $statusFile  $isZipOutput" >> "$logFile"

  qsub -hold_jid "$csvGridIds"  -V  -M "$email" -m "$gridNotification"  -wd "$subdir" \
       "$BIOR_LITE_HOME/bin/_bior_annotate_blaster_concat" \
       "$subdir"  "$partsPrefix"  "$vcfOut"  "$isVcfOut"  "$isCleanup"  "$logFile"  "$statusFile"  "$isZipOutput"
}
submitConcatJob

echo "Grid jobs submitted."

