package edu.mayo.bior.catalog.stats;

import com.jayway.jsonpath.InvalidPathException;
import com.jayway.jsonpath.JsonPath;
import edu.mayo.bior.catalog.CatalogFileUtils;
import edu.mayo.bior.catalog.CatalogFiles;
import edu.mayo.bior.catalog.CatalogFormatException;
import edu.mayo.bior.catalog.ChunkUtils;
import org.apache.commons.io.FileUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.*;

public class StatsBuilder
{
   private static Logger sLogger = LoggerFactory.getLogger(StatsBuilder.class);

   // 5GB. If catalog file bigger than this, stop reading after past chunk
   private static final long FILE_SIZE_THRESHOLD = 5l * 1024 * 1024 * 1024;

   // Default is to read and sample every row
   private static final long ONE_IN_X_SAMPLING_DEFAULT = 1;

   // Write a progress statement to the log every X lines
   private static long LOG_PROGRESS_EVERY_X_LINES = 100000;

   private static final long START_LINE_DEFAULT = 1;

   private static final long NUM_LINES_DEFAULT = 0;

   private long startTime;
   
   /** Set this to true if you want to test large rows counts (overriding the row and count initializations) to values of Integer.MAX_VALUE */
   private static boolean testLargeRowCounts = false;
   private static Properties valuesToInitializeToIntMax;
   private final long LARGE_VALUE = (long)(Integer.MAX_VALUE) + 10;

   static
   {
      setTestLargeRowCountsToDefault();
      setValuesToInitializeToIntMaxToDefault();
   }

   private CatalogStats catalogStats;

   public static void setTestLargeRowCounts(boolean largeRowCounts)
   {
      testLargeRowCounts = largeRowCounts;
   }

   public static void setTestLargeRowCountsToDefault()
   {
      setTestLargeRowCounts(false);
   }

   public static void setValuesToInitializeToIntMax(Properties values)
   {
      valuesToInitializeToIntMax = values;
   }

   public static void setValuesToInitializeToIntMaxToDefault()
   {
      setValuesToInitializeToIntMax(null);
   }

   /**
    * Change the log progress every x lines setting
    */
   public static void setLogProgressEveryXLines(long progressEveryXLines)
   {
      LOG_PROGRESS_EVERY_X_LINES = progressEveryXLines;
   }

   /**
    * Builds stats the given catalog.
    *
    * @param catalogFile The catalog tsv.bgz file
    * @param maxValueCount  Number of values to sample per catalog column
    * @return {@link CatalogStats} containing the statistics and sampled values.
    * @throws IOException
    * @throws CatalogFormatException
    */
   public CatalogStats build(File catalogFile, int maxValueCount) throws IOException, CatalogFormatException
   {
      return build(catalogFile, maxValueCount, ONE_IN_X_SAMPLING_DEFAULT);
   }

   /**
    * Builds stats the given catalog.
    *
    * @param catalogFile    The catalog tsv.bgz file
    * @param maxValueCount     Number of values to sample per catalog column
    * @param oneInXSampling
    * @return {@link CatalogStats} containing the statistics and sampled values.
    * @throws IOException
    * @throws CatalogFormatException
    */
   public CatalogStats build(File catalogFile, int maxValueCount, long oneInXSampling) throws IOException, CatalogFormatException
   {
      return build(catalogFile, maxValueCount, oneInXSampling, START_LINE_DEFAULT, NUM_LINES_DEFAULT);
   }

   public CatalogStats build(File catalogFile, int maxValueCount, long oneInXSampling, long startLine, long numLines)
      throws IOException, CatalogFormatException
   {
      checkFileParam(catalogFile);

      checkProcessCatalogIntParam(maxValueCount, 1, "number values sampled per column");
      checkProcessCatalogIntParam(oneInXSampling, 1, "line sampling frequency");
      checkProcessCatalogIntParam(startLine, 1, "start line");
      checkProcessCatalogIntParam(numLines, 0, "number of lines");

      CatalogFiles catalogFiles = new CatalogFiles(catalogFile);
      List<String> columnNames = getColumns(catalogFiles.getColumnsFile(), true);

      catalogStats = initializeStats(columnNames);
      
      return processCatalog(catalogFile, columnNames, maxValueCount, oneInXSampling, startLine, numLines);
   }

   private CatalogStats initializeStats(List<String> columnNames) {
	   final long INIT_VALUE = testLargeRowCounts ?  LARGE_VALUE  :  0;
	   
	   CatalogStats stats = new CatalogStats();
	   stats.setTotalDataLineCount(INIT_VALUE);
	   stats.setNumLinesSampled(INIT_VALUE);
	   stats.setColumns(columnNames);

	   for (String columnName : columnNames) {
		   CatalogColumnStats colStats = new CatalogColumnStats();
		   colStats.setValueSamplings(getInitValueSamplings(columnName));
		   colStats.setNumEntries(INIT_VALUE);
		   colStats.setNumCharacters((long)INIT_VALUE);
		   
		   colStats.setLineAsciiStats(getInitCharStats());
		   colStats.setTotalAsciiStats(getInitCharStats());
		   
		   stats.setColumnStats(columnName, colStats);
	   }
	   
	   return stats;
   }
   
   private Collection<ValueSampling> getInitValueSamplings(String columnName) {
	   List<ValueSampling> valueSamplingList = new ArrayList<ValueSampling>();
	   if(testLargeRowCounts) {
		   String valToSetToMax = valuesToInitializeToIntMax.getProperty(columnName);
		   if(valToSetToMax != null) {
			   ValueSampling valSample = new ValueSampling();
			   valSample.setValue(valToSetToMax);
			   valSample.setFrequency(LARGE_VALUE);
			   valueSamplingList.add(valSample);
		   }
	   }
	   return valueSamplingList;
   }

   private AsciiCharacterStats getInitCharStats() {
	   AsciiCharacterStats charStats = new AsciiCharacterStats();
	   if(testLargeRowCounts) {
		   // Try just setting the number '2' to the large value
		   int number2 = (int)('2');
		   charStats.counts[number2] = LARGE_VALUE;
	   }
	   return charStats;
   }

   public static long getOneInXSamplingDefault() {
      return ONE_IN_X_SAMPLING_DEFAULT;
   }

   public static long getStartLineDefault()
   {
      return START_LINE_DEFAULT;
   }

   public static long getNumLinesDefault()
   {
      return NUM_LINES_DEFAULT;
   }

   private CatalogStats processCatalog(File catalogFile, List<String> columnNames, int maxNumValues, long oneInXSampling,
                                       long startLine, long numLines)  throws IOException
   {
      sLogger.info("Start line for chunk: " + startLine);
      sLogger.info("Num lines in chunk: " + numLines);
      sLogger.info("Sampling every 1 in " + oneInXSampling + " lines");

      // No point in doing anything is there are no column names
      if (columnNames == null || columnNames.isEmpty()) {
         sLogger.info("Don't have any columns to sample so bailing");
         return null;
      }

      // Set this for later checking to see if you will leave file early
      boolean fileBiggerThanThreshold = false;
      if (catalogFile.length() > FILE_SIZE_THRESHOLD)
      {
         fileBiggerThanThreshold = true;
         sLogger.info(String.format("Catalog file length %d > %d. Can stop reading early if past chunk.",
            catalogFile.length(), FILE_SIZE_THRESHOLD));
      }

      startTime = new Date().getTime();

      // Set default JSON column (will be changed later if necessary)
      int jsonCol = 3;

      BufferedReader catalogRdr = CatalogFileUtils.getBufferedReader(catalogFile.getAbsolutePath());
      String line;
      while ((line = catalogRdr.readLine()) != null) {
         // Skip any header lines
         if (line.startsWith("#")) {
            continue;
         }

         // bail out if past chunk
         if (fileBiggerThanThreshold && ChunkUtils.beyondTargetChunk(catalogStats.getTotalDataLineCount() + 1, startLine, numLines))
         {
            catalogStats.setStoppedReadingAfterChunk(true);
            sLogger.info(String.format("Stopped reading catalog after reading %d lines - past chunk",
               catalogStats.getTotalDataLineCount()));
            break;
         }

         // Count the line before checking if it should be sampled
         catalogStats.incrementTotalDataLineCount();

         logProgressEveryXLines(catalogStats.getTotalDataLineCount());

        // Only process 1 in every X lines.  Skip the rest
         if( ! shouldSample(catalogStats.getTotalDataLineCount(), oneInXSampling, startLine, numLines)) {
            continue;
         }

         catalogStats.incrementNumLinesSampled();

          String[] colArr = line.split("\t", -1);

         // If this is the first line, then determine which column is the json column
         if (catalogStats.getTotalDataLineCount() == 1) {
            jsonCol = getJsonColumn(colArr);
         }

         final String json = colArr[jsonCol];

         for (String columnName : columnNames) {
        	 String value = getJsonValue(json, columnName);
             if( value != null )
            	 updateColumnStats(columnName, value, maxNumValues);
         }
      }

      return catalogStats;
   }

   private String getJsonValue(String json, String columnName) {
	   try {
		   return JsonPath.compile(columnName).read(json).toString();
	   } catch (InvalidPathException ipe) {
		   // Don't do anything - it may just be that the key was not found in this particular row, which is fine
	   }
	   // Key not found, so return null
	   return null;
   }

   private void updateColumnStats(String columnName, String value, int maxNumValues) {
	   CatalogColumnStats colStats = catalogStats.getColumnStats(columnName);
	   colStats.setColumnName(columnName);
	   colStats.incrementNumEntries();

	   updateValueSampling(value, colStats, maxNumValues);

	   updateCharStats(value, colStats);
   }


   private void updateValueSampling(String columnValue, CatalogColumnStats colStats, int maxNumValues) {
	   ValueSampling valSampling = colStats.getValueSampling(columnValue);
	   // If the ValueSample is not already in the list, AND if we are under the value count threshold, then we can add another
	   if( valSampling == null ) {
		   if( colStats.getNumUniqueValues() < maxNumValues) {
			   valSampling = new ValueSampling();
			   valSampling.setValue(columnValue);
			   valSampling.setFrequency(1);
			   colStats.addValueSampling(valSampling);
		   }
	   } else {
		   valSampling.incrementFrequency();
	   }
   }
   
   private void updateCharStats(String value, CatalogColumnStats colStats) {
	   AsciiCharacterStats lineStats  = colStats.getLineAsciiStats();
	   AsciiCharacterStats totalStats = colStats.getTotalAsciiStats();
	   
	   // For each of the ASCII character, set a boolean on whether it was found in the line
	   // Later we'll increment the line count if it was found 
	   // multiple values will only increment the line count by 1 since it simply indicates whether the value was found on that line) 
	   boolean[] isCharFoundInLine = new boolean[lineStats.counts.length];
	   
	   // Got thru each character in the value string
	   for(int i=0; i < value.length(); i++) {
		   colStats.incrementNumCharacters();
		   
		   int asciiCode = (int)(value.charAt(i));
		   if( asciiCode < totalStats.counts.length ) {
			   totalStats.counts[asciiCode]++;
			   isCharFoundInLine[asciiCode] = true; 
		   }
	   }
	   
	   // how many lines did a specific char occur on?
	   for (int i = 0; i < isCharFoundInLine.length; i++) {
		   if (isCharFoundInLine[i] ) 
			   lineStats.counts[i]++;
	   }
   }


   private void checkFileParam(File catalogFile) throws IOException
   {
      if (catalogFile == null)
      {
         throw new IOException("Null catalog file supplied to StatsBuilder.build()");
      }
      if (!catalogFile.exists())
      {
         throw new IOException(String.format("File '%s' does not exist", catalogFile.getPath()));
      }
      if (!catalogFile.canRead())
      {
         throw new IOException(String.format("File '%s' is not readable", catalogFile.getPath()));
      }
   }

   private void checkProcessCatalogIntParam(long param, long threshold, String paramName)
   {
      if (param < threshold)
      {
         String msg = String.format("Programming Error: supplied value %d for %s must be >= %d in StatsBuilder.build()",
                                    param, paramName, threshold);
         throw new RuntimeException(msg);
      }
   }
   private static boolean shouldSample(long lineNumber, long oneInXSampling, long startLine, long numLines)
   {
      // Only process 1 in every X lines. Important to note that the first data line must be sampled so if you are
      // sampling every 100 rows, row 1, 101, 201, etc... would be sampled
      if (((lineNumber - 1) % oneInXSampling) != 0)
      {
         return false;
      }

      return ChunkUtils.inTargetChunk(lineNumber, startLine, numLines);
   }

   private void logProgressEveryXLines(long totalDataLinesInFile)
   {
      if (totalDataLinesInFile % LOG_PROGRESS_EVERY_X_LINES == 0)
      {
         Date now = new Date();
         double elapsedTimeSeconds = ((double) now.getTime() - (double) startTime) / 1000.0;
         DecimalFormat decFormat = new DecimalFormat("#,###,###,##0");
         sLogger.info("Stats - # lines processed " + decFormat.format(totalDataLinesInFile) + "  (" + decFormat.format(elapsedTimeSeconds) + "s)");
      }
   }

   /**
    * Get the 0-based json column.  Usually this is the 4th column (3 in zero-based), but may be the first if it is the only column
    */
   private int getJsonColumn(String[] colArr) {
      if (colArr.length > 3 && colArr[3].startsWith("{") && colArr[3].endsWith("}")) {
         return 3;
      } else {
         return 0;
      }
   }

   /**
    * Helper method that pulls the column names from the catalog's columns.tsv file.
    *
    * @param columnsFile          The columns.tsv file.
    * @param skipGoldenAttributes Skips golden attribute columns (e.g. _landmark)
    * @return List of column names.
    * @throws IOException
    */
   private List<String> getColumns(File columnsFile, boolean skipGoldenAttributes) throws IOException
   {
      List<String> colNames = new ArrayList<String>();
      for (String line : FileUtils.readLines(columnsFile))
      {
         if (line.startsWith("#")) continue; // skip header lines

         if (skipGoldenAttributes && line.startsWith("_")) continue; // skip golden attributes

         String[] colArr = line.split("\t", -1);
         final String columnName = colArr[0];
         if (columnName.trim().length() > 0)
            colNames.add(columnName);
      }
      return colNames;
   }
}
