package edu.mayo.bior.catalog.misses;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

import net.sf.samtools.util.BlockCompressedInputStream;

/**
 * Command-line utility that reports BIOR catalog rows (or VCF rows) affected by the
 * bug in the old Java tabix reader.
 * <p>
 * Two modes are supported:
 * <ul>
 *   <li><b>Catalog mode</b> (default): every line of the bgzip'd catalog is read, a tabix
 *       region query is built from the {@code _landmark}, {@code _minBP} and {@code _maxBP}
 *       values found in the line, and the line is written to the output file when the
 *       selected reader returns no hit at all for that region.</li>
 *   <li><b>VCF mode</b> ({@code --vcf}): every data line of the VCF is turned into a region
 *       query, the query is run against the catalog with both the old and the fixed reader,
 *       and the VCF line is written to the output file when the two result lists differ.</li>
 * </ul>
 * The process exits with status 0 when usage is requested and with status 1 on any
 * argument, file or run-time error.
 *
 * @author m054457
 * HTSJDK jar available at:
 * 	https://mvnrepository.com/artifact/com.github.samtools/htsjdk
 */
public class TabixReaderTest {

	/** Sentinel returned by {@link #getMid(String, String, String)} when the requested key is absent. */
	private static final String NOT_FOUND = "{}";

	private final edu.mayo.bior.catalog.misses.TabixReaderBAD  mTabixReaderBad;
	private final htsjdk.tribble.readers.TabixReader     mTabixReaderGood;
	private final String mCatalogPath;
	private final File mVcfFile;
	private final File mOutputFile;



	/** Prints the usage text to stdout and exits with status 0 (explicit help request). */
	private static void printUsage() {
		printUsage(0);
	}

	/**
	 * Prints the usage text to stdout and terminates the JVM.
	 *
	 * @param exitStatus process exit status to terminate with
	 */
	private static void printUsage(int exitStatus) {
		System.out.println(
			"This is a utility provided by the BIOR team to help assess the impact to user-generated\n" +
			"BIOR catalogs due to a bug in a Broad Institute Java-based tabix reader used within the BIOR toolkit\n" +
			"prior to versions 2.4.2 and 4.1.2.\n\n"+
			"By default the utility outputs all lines from of a BIOR catalog that do not return an expected\n" +
			"result. These lines in the catalog would have been incorrectly ommitted from annotations outputted\n" +
			"by some commands in the BIOR toolkit.\n\n" +
			"You can also ensure your catalog is now returning all the expected results, as it should have been, by\n" +
			"running this utility with the optional '--fixed' parameter.\n\n" +
			"For more information on this issue, please see web-site: http://bsiweb.mayo.edu/bior/java-tabix-bug-2016-09\n");

		System.out.println("USAGE: bior_count_catalog_misses --catalog <CATALOG> [--output_file_path <OUTPUT_FILE_PATH>]   [--vcf <VCF_PATH>]  [--fixed]");
		System.out.println(
			"where: \t--catalog <CATALOG>  is required.\n" +
		    "            This is the catalog to check if any misses occurred by scanning the whole catalog line-by-line to see\n" +
		    "            if an overlap operation does NOT return the same line, in which case we know there is a problem with\n" +
		    "            the TabixReader implementation.\n" +
		    "            If --vcf is specified, then it is the VCF file that is crawled line-by-line for problems against the catalog,\n" +
		    "            comparing hits between bad and fixed TabixReader.\n" +
			"       \t--output_file_path <OUTPUT_FILE_PATH>  is optional. \n" +
			"            If provided, the reported catalog rows that did not return an expected result will be written to this file.\n" +
			"            If the file already exists, the program will exit with an error. If not provided, an output file name will \n" +
			"            be generated automatically based on the catalog file name and reported at the conclusion of the program run.\n" +
			"       \t--vcf <VCF_PATH>  is optional\n" +
			"            If specified, this checks each line in the VCF against the catalog using both the bad and fixed\n" +
			"            TabixReader class and dumps out any VCF lines that are different between the two.\n" +
			"       \t--fixed  is optional. \n" +
			"            If --fixed is specified, the new, fixed htsjdk TabixReader will be used to check your catalog so that you can\n" +
			"            verify the previous problem with the Java TabixReader has been resolved for your catalog.  If the --fixed parameter\n" +
			"            is left out, this utility uses the older, broken TabixReader so that you can assess which annotations \n" +
			"            from the specific catalog were incorrectly ommited in the past.  This is ignored if checking a VCF against a catalog.\n");
		System.exit(exitStatus);
	}

	/**
	 * Reports an error on stderr, prints the usage text and exits with status 1 so that
	 * calling scripts can detect the failure.
	 *
	 * @param message the error message to print before the usage text
	 */
	private static void failWithUsage(String message) {
		System.err.println(message);
		printUsage(1);
	}



	/**
	 * Program entry point: parses the command line, validates the catalog, its tabix index,
	 * the output location and the optional VCF, then runs either the VCF comparison or the
	 * catalog scan.
	 *
	 * @param args command-line arguments; see {@link #printUsage()} for the accepted options
	 */
	public static void main(String[] args) {
		if( args.length == 0 ) {
			printUsage();
		}

		boolean useFixed = false;
		String catalogFilePath = null;
		String outputFilePath = null;
		String vcfPath = null;
		for (int i = 0; i < args.length; i++) {
			String eachArg = args[i];
			String nextArg = ((i+1) < args.length)  ?  args[i+1]  :  null;

			if (eachArg.equals("-h") || eachArg.equals("--help") || eachArg.equals("-help")) {
				printUsage();
			} else if (eachArg.equals("-c") || eachArg.equals("--catalog")) {
				catalogFilePath = nextArg;
				i++;   // the value has been consumed
			} else if (eachArg.equals("-o") || eachArg.equals("--output_file_path")) {
				outputFilePath = nextArg;
				i++;
			} else if (eachArg.equals("-v") || eachArg.equals("--vcf")) {
				vcfPath = nextArg;
				i++;
			} else if (eachArg.equals("-f") || eachArg.equals("--fixed")) {
				useFixed = true;
			}
		}

		if (catalogFilePath == null || catalogFilePath.length() == 0) {
			failWithUsage("\nERROR: Catalog file to check not specified. Please specify a catalog.\n");
		} else if (!new File(catalogFilePath).exists()) {
			failWithUsage("\nERROR: Catalog file specified does not exist. Please specify a valid catalog.\n");
		}

		File catalogIndexFile = new File(catalogFilePath + ".tbi");
		if (!catalogIndexFile.exists()) {
			failWithUsage("\nERROR: Catalog tabix index file for the catalog does not exist [" + catalogIndexFile.getAbsolutePath() + "]. " +
			" If this is a catalog that has genomic coordinates in the first three columns, you must create a tabix index in order to check" +
			" it for misses due to the Java tabix issue. If this is not a positional catalog, there is no need to check for misses. " +
			" Non-positional catalogs are not designed to return results using the tools affected by this Java tabix issue.\n");
		}

		// Derive a default output name (in the current directory) from the catalog file name
		if (outputFilePath==null || outputFilePath.length() == 0) {
			String basename = new File(catalogFilePath).getName();
			outputFilePath = basename + (useFixed  ?  "_misses_fixed.txt"  :  "_misses.txt");
		}

		// Never overwrite an existing file or directory
		File outFile = new File(outputFilePath);
		if (outFile.exists()) {
			if (!outFile.isFile()) {
				failWithUsage("\nERROR: Output file is not a file ["+outputFilePath+"]. Please specify an output file value.\n");
			} else {
				failWithUsage("\nERROR: Output file already exists ["+outputFilePath+"]. Please remove and rerun, or specify new value.\n");
			}
		}

		File vcfFile = (vcfPath == null  ?  null  :  new File(vcfPath));
		if( vcfFile != null && ! vcfFile.exists()) {
			failWithUsage("\nERROR: VCF file specified does not exist. Please specify a valid VCF file.\n");
		}

		// A bare file name has no parent: it will be created in the current working directory
		File parentDir = outFile.getParentFile();
		if (parentDir == null) {
			parentDir = new File(".").getAbsoluteFile();
		}
		if (!parentDir.canWrite()) {
			failWithUsage("\nERROR: Output file specified is not writable ["+outputFilePath+"]. Please change permissions on directory/file and rerun, or specify new value.\n");
		}

		try {
			TabixReaderTest tester = new TabixReaderTest(catalogFilePath, outFile, vcfFile);
			if( vcfFile != null ) {
				tester.dumpMissesFromVcf();
			} else {
				tester.dumpMissesFromCatalog(useFixed);
			}
		} catch(Exception e) {
			e.printStackTrace();
			// Signal the failure to the caller instead of exiting with status 0
			System.exit(1);
		}
	}

	/**
	 * Opens both the old and the fixed tabix reader on the given catalog.
	 *
	 * @param catalogPath path to the bgzip'd catalog; its index is expected at {@code catalogPath + ".tbi"}
	 * @param outputFile  file the reported lines are written to
	 * @param vcfFile     VCF to compare against the catalog, or {@code null} for catalog mode
	 * @throws IOException if either reader cannot be opened
	 */
	public TabixReaderTest(String catalogPath, File outputFile, File vcfFile) throws IOException {
		mCatalogPath = catalogPath;
		mOutputFile = outputFile;
		mVcfFile = vcfFile;
		mTabixReaderBad  = new edu.mayo.bior.catalog.misses.TabixReaderBAD(catalogPath);
		mTabixReaderGood = new htsjdk.tribble.readers.TabixReader(catalogPath);
	}


	/**
	 * Scans the catalog line by line and writes every line whose own region query returns
	 * no hit to the output file, followed by a tab and an empty JSON object.
	 * Lines from which no complete {@code _landmark}/{@code _minBP}/{@code _maxBP} triple can
	 * be extracted are skipped (and counted) because no region query can be built for them.
	 *
	 * @param useFixed {@code true} to query with the fixed htsjdk reader, {@code false} for the old one
	 * @throws IOException if the catalog cannot be read or the output file cannot be written
	 */
	private void dumpMissesFromCatalog(boolean useFixed) throws IOException {
		long startMillis = System.currentTimeMillis();
		long numLines = 0;
		long numSkipped = 0;
		long numMisses = 0;

		BufferedWriter outWtr = null;
		BlockCompressedInputStream inStream = null;
		try {
			outWtr = new BufferedWriter(new FileWriter(mOutputFile));
			inStream = new BlockCompressedInputStream(new File(mCatalogPath));

			System.err.println(". = 10k lines,  o=100k lines");
			String line = null;
			while( (line = inStream.readLine()) != null ) {
				numLines++;
				printProgress(numLines);

				String chrom = getMid(line, "_landmark\":\"", "\"");
				String min   = getMid(line, "_minBP\":", ",").trim();
				String max   = getMid(line, "_maxBP\":", ",").trim();

				// Skip if no chrom or coordinates found.  getMid() reports a missing key with the
				// NOT_FOUND sentinel, which must not be mistaken for a real value.
				if( isMissing(chrom) || isMissing(min) || isMissing(max) ) {
					numSkipped++;
					continue;
				}

				String tabixRegionQueryStr = chrom + ":" + min + "-" + max;

				// We only need to check if there was AT LEAST ONE hit.  If none, then dump the line with an empty JSON field at end
				if( ! isAtLeastOneHit(tabixRegionQueryStr, useFixed) ) {
					numMisses++;
					outWtr.write(line + "\t{}\n");
				}
			}
			// Flush here so that a write failure is reported instead of being swallowed on close
			outWtr.flush();
		} finally {
			closeQuietly(inStream);
			closeQuietly(outWtr);
		}

		printSummary(startMillis, numLines, numSkipped, numMisses, false);
	}



	/** Process each line in the VCF.  Compare the result against the catalog for the bad TabixReader vs the good one.
	 *  If there is a difference, then dump that VCF line.
	 *  Header lines (starting with '#') and blank lines are ignored; data lines with fewer than four
	 *  columns or a non-numeric position are skipped and counted.
	 *
	 *  @throws IOException if the VCF or catalog cannot be read or the output file cannot be written */
	private void dumpMissesFromVcf() throws IOException {
		long startMillis = System.currentTimeMillis();
		long numLines = 0;
		long numSkipped = 0;
		long numMisses = 0;

		BufferedWriter outWtr = null;
		BufferedReader inStream = null;
		try {
			outWtr = new BufferedWriter(new FileWriter(mOutputFile));
			outWtr.write("## These are the lines from the VCF file where there was a difference between using the old/bad TabixReader and the new/fixed one.\n");
			outWtr.write("## VCF File: " + mVcfFile.getCanonicalPath() + "\n");

			// Input should be the VCF file
			inStream = openVcfReader();

			System.err.println(". = 10k lines,  o=100k lines");
			String line = null;
			while( (line = inStream.readLine()) != null ) {
				// If header or blank, then skip
				if( line.startsWith("#") || ! isGiven(line) )
					continue;

				numLines++;
				printProgress(numLines);

				// Get the chrom, min, max from the VCF line
				// VCF line:  chrom, pos, id, ref, alt, qual, filter, info
				String[] lineSplit = line.split("\t", -1);
				if( lineSplit.length < 4 ) {
					numSkipped++;
					continue;
				}
				long min;
				try {
					min = Long.parseLong(lineSplit[1].trim());
				} catch(NumberFormatException badPosition) {
					numSkipped++;
					continue;
				}
				String chrom = lineSplit[0];
				String ref   = lineSplit[3];
				// POS is 1-based and the tabix region is inclusive on both ends, so a REF allele of
				// length n covers POS .. POS+n-1.  An empty REF is treated as a single base.
				long max     = min + Math.max(ref.length(), 1) - 1;

				String tabixRegionQueryStr = chrom + ":" + min + "-" + max;

				// Get the results from both the bad and good TabixReader
				List<String> bad = getHitsBad(tabixRegionQueryStr);
				List<String> fixed = getHitsFixed(tabixRegionQueryStr);
				if( ! isListSame(bad, fixed) ) {
					numMisses++;
					outWtr.write(line + "\n");
				}
			}
			// Flush here so that a write failure is reported instead of being swallowed on close
			outWtr.flush();
		} finally {
			closeQuietly(inStream);
			closeQuietly(outWtr);
		}

		printSummary(startMillis, numLines, numSkipped, numMisses, true);
	}

	/** Opens the VCF for line-based reading; names ending in ".gz" or ".bgz" are read as block-compressed. */
	private BufferedReader openVcfReader() throws IOException {
		String vcfName = mVcfFile.getName();
		boolean isCompressedVcf = vcfName.endsWith(".gz") || vcfName.endsWith(".bgz");
		if(isCompressedVcf) {
			return new BufferedReader(new InputStreamReader(new BlockCompressedInputStream(mVcfFile)));
		}
		return new BufferedReader(new FileReader(mVcfFile));
	}

	/** Progress indicator on stderr: '.' every 10k lines, 'o' every 100k lines, the running count every 1M lines. */
	private static void printProgress(long numLines) {
		if(numLines % 1000000 == 0)
			System.err.println(numLines);
		else if( numLines % 100000 == 0 )
			System.err.print("o");
		else if(numLines % 10000 == 0)
			System.err.print(".");
	}

	/**
	 * Prints the end-of-run report (catalog and index details, timing, counts) to stderr.
	 *
	 * @param startMillis value of {@link System#currentTimeMillis()} when the run started
	 * @param numLines    number of data lines processed
	 * @param numSkipped  number of lines for which no region query could be built
	 * @param numMisses   number of lines written to the output file
	 * @param isVcfRun    {@code true} to include the VCF path in the report
	 * @throws IOException if a canonical path cannot be resolved
	 */
	private void printSummary(long startMillis, long numLines, long numSkipped, long numMisses, boolean isVcfRun) throws IOException {
		// Terminate the line of progress characters
		System.err.println();

		double runtime = (System.currentTimeMillis() - startMillis) / 1000.0;
		// Guard against a zero-length run, which would otherwise divide by zero
		long linesPerSecond = (runtime > 0)  ?  (long)((double)numLines / runtime)  :  numLines;

		File catalogFile = new File(mCatalogPath);
		File tabixIndex  = new File(mCatalogPath + ".tbi");
		DecimalFormat withCommas = new DecimalFormat("#,###");

		System.err.println("Catalog path:     " + mCatalogPath);
		System.err.println("Size of catalog:  " + withCommas.format(catalogFile.length()));
		System.err.println("Last modified:    " + new Date(catalogFile.lastModified()));
		System.err.println("Tabix index size: " + withCommas.format(tabixIndex.length()));
		System.err.println("Tabix index last modified: "  + new Date(tabixIndex.lastModified()));
		if( isVcfRun ) {
			System.err.println("VCF file:         " + mVcfFile.getCanonicalPath());
		}
		System.err.println("Time of this run: " + new Date());
		System.err.println("Total runtime (seconds) = " + runtime);
		System.err.println("Output file for MISSES:   " + mOutputFile.getCanonicalPath());
		System.err.println("# lines processed:        " + numLines);
		System.err.println("# lines processed per second: " + linesPerSecond);
		System.err.println("# lines skipped (malformed or no coordinates): " + numSkipped);
		System.err.println("# MISSES:  " + numMisses);
	}

	/** Closes the resource if it is non-null, ignoring any failure; used only for cleanup in finally blocks. */
	private static void closeQuietly(java.io.Closeable resource) {
		if( resource == null )
			return;
		try {
			resource.close();
		} catch(IOException ignored) {
			// Best-effort cleanup: nothing useful can be done if close itself fails
		}
	}

	/** Checks if the two lists have the same number of lines and each line is identical to the other and in the same order */
	private boolean isListSame(List<String> bad, List<String> fixed) {
		// List.equals() compares size, order and element equality
		return bad.equals(fixed);
	}



	/**
	 * Runs the region query with the selected reader and reports whether the first result is a
	 * non-blank line.  Any exception raised by the query is treated as "no hit".
	 *
	 * @param tabixRegionQueryStr region in {@code chrom:min-max} form
	 * @param useFixed            {@code true} to use the fixed htsjdk reader, {@code false} for the old one
	 * @return {@code true} if the query produced at least one non-blank line
	 */
	private boolean isAtLeastOneHit(String tabixRegionQueryStr, boolean useFixed) {
		try {
			if( useFixed ) {
				htsjdk.tribble.readers.TabixReader.Iterator iter = mTabixReaderGood.query(tabixRegionQueryStr);
				return iter != null  &&  isGiven(iter.next());
			} else {
				edu.mayo.bior.catalog.misses.TabixReaderBAD.Iterator iter = mTabixReaderBad.query(tabixRegionQueryStr);
				return iter != null  &&  isGiven(iter.next());
			}
		}catch(Exception e) {
			return false;
		}
	}

	/** Returns every line the old reader yields for the region, in iteration order (empty if none). */
	private List<String> getHitsBad(String tabixRegionQueryStr) throws IOException {
		edu.mayo.bior.catalog.misses.TabixReaderBAD.Iterator iter = mTabixReaderBad.query(tabixRegionQueryStr);
		List<String> results = new ArrayList<String>();
		String line = null;
		while( iter != null  &&  (line = iter.next()) != null) {
			results.add(line);
		}
		return results;
	}

	/** Returns every line the fixed htsjdk reader yields for the region, in iteration order (empty if none). */
	private List<String> getHitsFixed(String tabixRegionQueryStr) throws IOException {
		htsjdk.tribble.readers.TabixReader.Iterator iter = mTabixReaderGood.query(tabixRegionQueryStr);
		List<String> results = new ArrayList<String>();
		String line = null;
		while( iter != null  &&  (line = iter.next()) != null) {
			results.add(line);
		}
		return results;
	}

	/** Returns true if the string is non-null and contains at least one non-whitespace character. */
	private static boolean isGiven(String s) {
		return s != null && s.trim().length() > 0;
	}

	/** Returns true if the value extracted by getMid() is blank or is the NOT_FOUND sentinel. */
	private static boolean isMissing(String value) {
		return ! isGiven(value) || NOT_FOUND.equals(value);
	}


	/**
	 * Extracts the text between the first occurrence of {@code pre} and the next occurrence of
	 * {@code post} after it, with any '}' characters removed.
	 *
	 * @param line the line to search
	 * @param pre  prefix that marks the start of the value
	 * @param post suffix that marks the end of the value
	 * @return the extracted text; the rest of the line if {@code post} is not found
	 *         (ex: where the string to find is at the end of the JSON: "_minBP":123} );
	 *         or "{}" if {@code pre} is not found
	 */
	private static String getMid(String line, String pre, String post) {
		int idx1 = line.indexOf(pre);
		// If prefix not found, then return "{}"
		if( idx1 == -1 )
			return NOT_FOUND;

		int valueStart = idx1 + pre.length();
		int idx2 = line.indexOf(post, valueStart);
		// If the prefix was found, but not the suffix, then return the end part of the line
		if( idx2 == -1 )
			idx2 = line.length();
		return line.substring(valueStart, idx2).replace("}", "");
	}


}

