#!/bin/bash

### For debugging:
#set -x
#set -v

### NOTE: This command should receive the full path to the files (not relative paths)
## Validate the command line.
##   -h as the first argument  : print the usage and exit 0
##   wrong number of arguments : print the error plus the usage and exit 1, so
##                               that a caller can detect the bad invocation
numArgsExpected=8
if [ "$1" == "-h" ] || [ "$#" -ne "$numArgsExpected" ] ; then
  usageExitCode=0
  ## Only complain about the argument count when help was not explicitly requested
  if [ "$1" != "-h" ] ; then
    echo "Incorrect number of arguments passed (expects: $numArgsExpected,  got: $#)"
    usageExitCode=1
  fi
  echo "Usage:  _bior_annotate_blaster_concat  <partsDir>  <partsPrefix>  <vcfOut>  <isVcf>  <isCleanup>  <logFile>  <statusFile>  <isZipOutput>"
  echo "   Where:"
  echo "       <partsDir> is the directory that contains the parts output from bior_annotate_blaster"
  echo "       <partsPrefix> is the filename for the parts to assemble, before the '.part.000x' extension (within the partsDir)"
  echo "                  Example: If one of the parts is named 'my.100.out.vcf.part.0001' this would be 'my.100.out.vcf'" 
  echo "       <vcfOut>   is the final VCF file to output"
  echo "       <isVcf>    is whether the file should be a true VCF file instead of TJSON ('true' or 'false')"
  echo "       <isCleanup>  is whether the temp directory should be cleaned up ('true' or 'false')"
  echo "       <logFile>  Combine all grid errors and output to one file.  Specify /dev/null or \"\" if this is not needed"
  echo "       <statusFile> Write the status of the command to a file.  Specify /dev/null or \"\" if this is not needed"
  echo "       <isZipOutput> is whether the output should be zipped or plain text"
  exit $usageExitCode
fi

## Name the positional arguments (same order as in the usage text)
## NOTE: the right-hand side of an assignment is not word-split, so no quotes are needed
partsDir=$1        # directory that holds the part files
partsPrefix=$2     # part filename up to (not including) ".part.000x"
vcfOut=$3          # final output file
isVcf=$4           # 'true' or 'false'
isCleanup=$5       # 'true' or 'false'
logFile=$6         # may be /dev/null or ""
statusFile=$7      # may be /dev/null or ""
isZipOutput=$8     # 'true' or 'false'

## Print every input parameter (plus PWD, BIOR_LITE_HOME and PATH) to stdout
## and append the same text to the log file.
## Reads:  partsDir partsPrefix vcfOut isVcf isCleanup logFile statusFile isZipOutput
function printvars
{
  ## "" is a documented value for logFile; fall back to /dev/null so that tee
  ## still has a valid target now that the file name is quoted
  local logTarget="${logFile:-/dev/null}"

  ## Group the lines so a single tee handles all of them
  {
    echo "-------------------------"
    echo "_bior_annotate_blaster_concat parameters"
    echo "-------------------------"
    echo "PWD         = $PWD"
    echo "partsDir    = $partsDir"
    echo "partsPrefix = $partsPrefix"
    echo "vcfOut      = $vcfOut"
    echo "isVcf       = $isVcf"
    echo "isCleanup   = $isCleanup"
    echo "logFile     = $logFile"
    echo "statusFile  = $statusFile"
    echo "isZipOutput = $isZipOutput"
    echo "BIOR_LITE_HOME = $BIOR_LITE_HOME"
    echo "PATH           = $PATH"
  } | tee -a "$logTarget"
}
printvars

## Look for parts that were marked as failed ("<partsPrefix>.part.*.done.FAILED"
## inside partsDir) and report them on stdout and in the log file.
## Sets:  isFailed    - "true" if at least one such file exists, else "false"
##        failedCount - number of .FAILED files found
##        failedList  - the .FAILED files, one path per line
function checkFailedParts
{
  local f
  local -a failedFiles=()

  isFailed="false"
  ## Expand the glob directly instead of parsing "ls" output, so that paths
  ## containing whitespace are kept intact
  for f in "$partsDir/$partsPrefix".part.*.done.FAILED ; do
    ## An unmatched glob is left as the literal pattern, so test for existence
    if [ -e "$f" ] ; then
      failedFiles+=("$f")
    fi
  done

  failedCount=${#failedFiles[@]}
  failedList=""
  if [ "$failedCount" -ne 0 ] ; then
    isFailed="true"
    failedList=$(printf '%s\n' "${failedFiles[@]}")
    ## "" is a documented value for logFile, hence the /dev/null fallback
    {
      echo "Failed on $failedCount files:"
      printf '%s\n' "$failedList"
    } | tee -a "${logFile:-/dev/null}"
  fi
}
checkFailedParts

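### Exit-on-error handling is currently NOT enabled: the line below is commented out
### (and "set +e" would turn exit-on-error off; "set -e" is what turns it on).
### NOTE: If exit-on-error is ever enabled, it must be done after the check for .FAILED
### files, as "no files found" will technically be treated as an error
###set +e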

## Print the value part of every "<field>=<value>" line found in the per-part
## status files
##   Ex status file:  my.100.bad.out.vcf.gz.part.0001.status
##   $1 - field name (ex: numLinesIn)
function _statusFieldValues
{
  ## -h stops grep from prefixing each match with the file name, so a path that
  ## happens to contain "<field>=" cannot confuse the sed expression
  grep -h "$1" "$partsDir/$partsPrefix".part.????.status | sed "s/^.*$1=//g"
}

## Combine the per-part status files into one overall status and write it to
## the status file
## Reads:  partsDir, partsPrefix, isFailed, statusFile
## Sets:   isSuccessful    - "false" if isFailed is "true" or any part reported
##                           isSuccessful=false, else "true"
##         numLinesIn, numLinesOut, numLinesBadData
##                         - totals over all parts (left at 0 when not successful)
function status
{
  local val
  numLinesIn=0
  numLinesOut=0
  numLinesBadData=0
  ### If something failed, then we failed overall
  isSuccessful=true
  if [ "true" == "$isFailed" ] ; then
    isSuccessful=false
  fi

  ### Check all status files - if any were not successful, then the whole thing fails
  for val in $(_statusFieldValues isSuccessful) ; do
    if [ "false" == "$val" ] ; then
      isSuccessful=false
    fi
  done

  ### The totals are only meaningful when every part succeeded
  if [ "$isSuccessful" == "true" ] ; then
    for val in $(_statusFieldValues numLinesIn) ; do
      numLinesIn=$((numLinesIn + val))
    done

    for val in $(_statusFieldValues numLinesOut) ; do
      numLinesOut=$((numLinesOut + val))
    done

    for val in $(_statusFieldValues numLinesBadData) ; do
      numLinesBadData=$((numLinesBadData + val))
    done
  fi

  ### Write it to the status output file
  ### ("" is a documented value for statusFile; redirecting to an empty name is an error)
  if [ -n "$statusFile" ] ; then
    {
      echo "numLinesIn=$numLinesIn"
      echo "numLinesOut=$numLinesOut"
      echo "numLinesBadData=$numLinesBadData"
      echo "isSuccessful=$isSuccessful"
    } > "$statusFile"
  fi
}

## If at least one part failed, report it, record the failure and stop before
## any concatenation or cleanup is attempted
if [ "true" == "$isFailed" ] ; then
  echo ""
  echo "FAILED on at least one file.  Exiting..."
  echo "Cleanup will not occur."
  ## Skip the status write when no status file was requested
  ## ("" is a documented value for <statusFile>)
  if [ -n "$statusFile" ] ; then
    status
  fi
  exit 1
fi

###=============================================================
## Concatenate the files together
## If the user wants a true VCF file as output (and not TJSON), then run it thru bior_tjson_to_vcf
##
## Reads:  partsDir, partsPrefix, vcfOut, isVcf, isZipOutput, logFile
## Sets:   tempVcfOut - intermediate file (global, because it is referenced again later)
##         isFailed   - set to "true" when bior_concat or the conversion fails,
##                      so a failed concatenation is not treated as a success
## Returns: 0 on success, 1 on failure
###=============================================================
function concatParts
{
  local rc=0
  local -a extraConcatArgs=()
  ## "" is a documented value for logFile, hence the /dev/null fallback
  local logTarget="${logFile:-/dev/null}"

  tempVcfOut="$partsDir/$(basename -- "$vcfOut").temp"

  if [ "true" == "$isVcf" ] ; then
    ## Concatenate the parts (the glob itself must stay outside the quotes)
    if ! bior_concat -s  -i "$partsDir/$partsPrefix".part.????  -o "$tempVcfOut"  --log ; then
      isFailed="true"
      echo "ERROR: bior_concat failed while writing $tempVcfOut" | tee -a "$logTarget"
      return 1
    fi

    ## Convert from TJSON to VCF and gzip (only if isZipOutput = "true")
    ## pipefail is enabled inside a subshell so that a failure of any stage is
    ## seen without changing the options of the rest of the script
    if [ "true" == "$isZipOutput" ] ; then
      ( set -o pipefail ; zcat "$tempVcfOut" | bior_tjson_to_vcf | gzip -c > "$vcfOut" ) || rc=$?
    else
      ( set -o pipefail ; zcat "$tempVcfOut" | bior_tjson_to_vcf > "$vcfOut" ) || rc=$?
    fi
    if [ "$rc" -ne 0 ] ; then
      isFailed="true"
      echo "ERROR: converting $tempVcfOut to VCF failed (exit code $rc)" | tee -a "$logTarget"
      return 1
    fi
  else
    ## Else just concatenate the files into the vcf output
    ## If the user does NOT want the output zipped, then tell bior_concat not to zip it
    if [ "false" == "$isZipOutput" ] ; then
      extraConcatArgs+=("--nozip")
    fi
    if ! bior_concat -s  -i "$partsDir/$partsPrefix".part.????  -o "$vcfOut" "${extraConcatArgs[@]}"  --log ; then
      isFailed="true"
      echo "ERROR: bior_concat failed while writing $vcfOut" | tee -a "$logTarget"
      return 1
    fi
  fi
  return 0
}
concatParts

## Append a blank line and a banner naming the file "$1" to the log file
function _appendLogBanner
{
  {
    echo ""
    echo "---------------------------------------"
    echo "$1"
    echo "---------------------------------------"
  } >> "$logFile"
}

## Dump all grid errors and stdout to the log file
## Reads:  partsDir, logFile (must be non-empty)
function logOutput
{
  local f contents

  ## Loop thru all error messages and append them to the log
  ## (glob directly instead of parsing "ls", so paths with spaces stay intact
  ##  and nothing is printed when there are no such files)
  for f in "$partsDir"/_bior_annotate_blaster_*.e* ; do
    [ -f "$f" ] || continue
    ## Drop the lines stating that SnpEff is starting
    ## ("SNPEFF is requested, bior is starting it up, this will take about 1 min.")
    ## and only log the file if something else remains
    contents=$(grep -v "^SNPEFF is requested," "$f")
    if [ -n "$contents" ] ; then
      _appendLogBanner "$f"
      printf '%s\n' "$contents" >> "$logFile"
    fi
  done

  ## Loop thru all stdout messages and append them to the log
  for f in "$partsDir"/_bior_annotate_blaster_*.o* ; do
    [ -f "$f" ] || continue
    ## If the file size is > 0 bytes, then print its name in the log and dump its contents
    if [ -s "$f" ] ; then
      _appendLogBanner "$f"
      cat "$f" >> "$logFile"
    fi
  done
}

## If there was a logFile specified, then dump all contents of grid error and output logs into it
if [ -n "$logFile" ] ; then
  logOutput
fi

## Always compute the overall status: besides writing the status file, status
## also sets isSuccessful, which the cleanup condition further down depends on.
## When no status file was requested (""), write to /dev/null instead of
## skipping the call, so that cleanup can still happen
if [ -z "$statusFile" ] ; then
  statusFile=/dev/null
fi
status

## Cleanup the directory: remove the intermediate files that belong to this run
## from partsDir and then remove partsDir itself
## Reads:   partsDir, partsPrefix, tempVcfOut
## Returns: 1 (without deleting anything) if partsDir is empty, else the exit
##          status of rmdir (non-zero if other files are still in the directory)
function cleanup
{
  ### Don't exit on any cleanup errors
  set +e

  ## With an empty partsDir the patterns below would point at "/"
  if [ -z "$partsDir" ] ; then
    echo "cleanup: partsDir is empty - not removing anything" >&2
    return 1
  fi

  ## NOTE: -f keeps rm quiet when a pattern matches nothing

  ## Remove all "_bior_annotate_blaster_chunker.pe1457123" type files
  rm -f -- "$partsDir"/_bior_annotate_blaster_chunker.*

  ## Remove all part files (this also covers the "<prefix>.part.*.status" files)
  rm -f -- "$partsDir/$partsPrefix".part.*

  ## Remove the jobIds file:
  rm -f -- "$partsDir/${partsPrefix}_jobIds.txt"

  ## Remove the .temp file if it exists
  if [ -n "$tempVcfOut" ] ; then
    rm -f -- "$tempVcfOut"
  fi

  ## Remove the bior.log file
  rm -f -- "$partsDir"/bior.log*

  ## Remove the concatenator files
  rm -f -- "$partsDir"/_bior_annotate_blaster_concat.*

  ## Remove the directory (fails, and leaves it in place, if anything else is still in it)
  rmdir -- "$partsDir"
}

if [ "true" == "$isCleanup" ] && [ "$isSuccessful" == "true" ] ; then
  cleanup
fi

###touch  $vcfOut.done
