package edu.mayo.bior.catalog.stats;

import edu.mayo.bior.catalog.CatalogFileUtils;
import edu.mayo.bior.catalog.CatalogFiles;
import edu.mayo.bior.catalog.CatalogFormatException;
import edu.mayo.bior.catalog.ChunkUtils;
import edu.mayo.bior.cli.func.BaseFunctionalTest;
import edu.mayo.pipes.history.ColumnMetaData;
import edu.mayo.pipes.history.ColumnMetaDataOperations;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.apache.commons.io.FileUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.rules.TemporaryFolder;

import java.io.*;
import java.util.*;

import static org.junit.Assert.*;

public class StatsBuilderTest extends BaseFunctionalTest
{
   /**
    * JUnit rule the negative tests use to declare the exception type and message fragments they
    * expect; created with {@code none()}, so a test that declares nothing expects no exception.
    */
   @Rule
   public ExpectedException expectedException = ExpectedException.none();

   /** JUnit rule that supplies the scratch files and folders the tests and helpers write to. */
   @Rule
   public TemporaryFolder temporaryFolder = new TemporaryFolder();

   // Quote characters: build() writes its JSON fixtures with single quotes for readability and
   // replaces them with double quotes before the lines are put into the test catalog.
   private static final String SINGLE_QUOTE = "'";
   private static final String DOUBLE_QUOTE = "\"";

   /**
    * Builds stats for the ClinVar test catalog and checks the data line count, the column list and
    * the per-column numbers of the "chrom" and "clinical_significance" columns.
    */
   @Test
   public void buildClinvar() throws IOException, CatalogFormatException
   {
      final int maxValuesPerColumn = 1000;
      final File clinvarCatalog = new File("src/test/resources/catalogStats/clinvar_catalog/macarthur-lab_xml_txt.tsv.bgz");
      final CatalogStats stats = new StatsBuilder().build(clinvarCatalog, maxValuesPerColumn);

      assertEquals(2, stats.getTotalDataLineCount());
      assertEquals(getColumnNames(clinvarCatalog), stats.getColumns());

      // ---- chrom: expected values are "1" and "2", each seen once ----
      final CatalogColumnStats chrom = stats.getColumnStats("chrom");
      final List<ValueSampling> chromValues = getSortedByValue(chrom.getValueSamplings());
      assertEquals(2, chrom.getNumEntries());
      assertEquals(2, chrom.getNumCharacters());
      assertEquals(1, chrom.getLineAsciiStats().counts[(int) '1']);
      assertEquals(1, chrom.getLineAsciiStats().counts[(int) '2']);
      assertEquals(1, chrom.getTotalAsciiStats().counts[(int) '1']);
      assertEquals(1, chrom.getTotalAsciiStats().counts[(int) '2']);
      assertEquals("1", chromValues.get(0).getValue());
      assertEquals(1, chromValues.get(0).getFrequency());
      assertEquals("2", chromValues.get(1).getValue());
      assertEquals(1, chromValues.get(1).getFrequency());

      // ---- clinical_significance: expected values are "Pathogenic" and "Uncertain significance" ----
      final CatalogColumnStats significance = stats.getColumnStats("clinical_significance");
      final List<ValueSampling> significanceValues = getSortedByValue(significance.getValueSamplings());
      assertEquals(2, significance.getNumEntries());
      assertEquals(32, significance.getNumCharacters());

      // Every distinct character of the two values is checked exactly once, in order of first
      // appearance; a letter that occurs again later is already covered by its expected count.
      // Index i of each expectation array belongs to distinctChars.charAt(i).
      final String distinctChars = "Pathogenic" + "Ur sf";
      final int[] expectedLineCounts  = {1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1};
      final int[] expectedTotalCounts = {1, 3, 2, 1, 1, 2, 3, 5, 5, 4, 1, 1, 1, 1, 1};

      for (int i = 0; i < distinctChars.length(); i++)
      {
         assertEquals(expectedLineCounts[i], significance.getLineAsciiStats().counts[(int) distinctChars.charAt(i)]);
      }
      for (int i = 0; i < distinctChars.length(); i++)
      {
         assertEquals(expectedTotalCounts[i], significance.getTotalAsciiStats().counts[(int) distinctChars.charAt(i)]);
      }

      assertEquals("Pathogenic", significanceValues.get(0).getValue());
      assertEquals(1, significanceValues.get(0).getFrequency());
      assertEquals("Uncertain significance", significanceValues.get(1).getValue());
      assertEquals(1, significanceValues.get(1).getFrequency());
   }

   /**
    * Returns a new list holding the given samplings ordered by comparing their
    * {@code getValue()} results with {@code compareTo}. The input collection is not modified.
    *
    * @param samplings samplings to order
    * @return a sorted copy of {@code samplings}
    */
   private List<ValueSampling> getSortedByValue(Collection<ValueSampling> samplings)
   {
      List<ValueSampling> sorted = new ArrayList<ValueSampling>(samplings);
      Collections.sort(sorted, new Comparator<ValueSampling>()
      {
         @Override
         public int compare(ValueSampling left, ValueSampling right)
         {
            return left.getValue().compareTo(right.getValue());
         }
      });
      return sorted;
   }

   /** Expects a RuntimeException flagged as a programming error when the value count passed to build() is 0. */
   @Test
   public void testBadValueCountParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(RuntimeException.class);
      expectedException.expectMessage("number values sampled per column");
      expectedException.expectMessage("Programming Error");

      final int invalidValueCount = 0;
      final File catalog = new File("src/test/resources/catalogStats/clinvar_catalog/macarthur-lab_xml_txt.tsv.bgz");
      new StatsBuilder().build(catalog, invalidValueCount);
   }

   //checkFileParam(catalogFile);

   /** Expects a RuntimeException mentioning the line sampling frequency when that argument is 0. */
   @Test
   public void testBadSamplingParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(RuntimeException.class);
      expectedException.expectMessage("line sampling frequency");

      final int invalidSampling = 0;
      final File catalog = new File("src/test/resources/catalogStats/clinvar_catalog/macarthur-lab_xml_txt.tsv.bgz");
      new StatsBuilder().build(catalog, 1000, invalidSampling);
   }

   /** Expects a RuntimeException flagged as a programming error when the start line passed to build() is 0. */
   @Test
   public void testBadStartLineParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(RuntimeException.class);
      expectedException.expectMessage("start line");
      expectedException.expectMessage("Programming Error");

      final int invalidStartLine = 0;
      final File catalog = new File("src/test/resources/catalogStats/clinvar_catalog/macarthur-lab_xml_txt.tsv.bgz");
      new StatsBuilder().build(catalog, 1000, 1, invalidStartLine, 0);
   }

   /** Expects a RuntimeException flagged as a programming error when the number of lines passed to build() is -1. */
   @Test
   public void testBadNumLinesParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(RuntimeException.class);
      expectedException.expectMessage("number of lines");
      expectedException.expectMessage("Programming Error");

      final int invalidNumLines = -1;
      final File catalog = new File("src/test/resources/catalogStats/clinvar_catalog/macarthur-lab_xml_txt.tsv.bgz");
      new StatsBuilder().build(catalog, 1000, 1, 1, invalidNumLines);
   }

   /** Expects an IOException with the message "Null catalog file" when build() is given a null catalog file. */
   @Test
   public void testBadFileNullParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(IOException.class);
      expectedException.expectMessage("Null catalog file");

      // All other arguments are the same values the sibling file-validation tests pass.
      new StatsBuilder().build(null, 1000, 1, 1, 0);
   }

   /** Expects an IOException containing "does not exist" when the catalog path points at no file. */
   @Test
   public void testBadFileNotExistsParam() throws IOException, CatalogFormatException
   {
      expectedException.expect(IOException.class);
      expectedException.expectMessage("does not exist");

      final File missingCatalog = new File("blah/filethatdoesntexist");
      new StatsBuilder().build(missingCatalog, 1000, 1, 1, 0);
   }

   /**
    * Expects an IOException containing "is not readable" when the catalog file exists but cannot
    * be read.
    * <p>
    * Not every environment can make a file unreadable for the current user: {@code setReadable(false)}
    * may fail, and a privileged user may still be able to read the file afterwards. In that case the
    * precondition of this test cannot be established, so the test is skipped instead of failing.
    */
   @Test
   public void testBadFileNotReadableParam() throws IOException, CatalogFormatException
   {
      File nonReadableFile = temporaryFolder.newFile();

      // The assumption is deliberately checked BEFORE any expectation is registered on the
      // ExpectedException rule: once an exception is expected, a violated assumption could be
      // matched against that expectation and reported as a failure rather than a skip.
      boolean madeUnreadable = nonReadableFile.setReadable(false) && !nonReadableFile.canRead();
      org.junit.Assume.assumeTrue(madeUnreadable);

      expectedException.expect(IOException.class);
      expectedException.expectMessage("is not readable");
      new StatsBuilder().build(nonReadableFile, 1000, 1, 1, 0);
   }

   /**
    * Builds stats for a six-line catalog written by this test, in which key FIELDn occurs on n lines
    * and always carries the value VALUEn (n = 1..3), sampling every line, and checks the totals
    * and the per-column numbers.
    */
   @Test
   public void build() throws IOException, CatalogFormatException
   {
      // FIELD1 once, FIELD2 twice, FIELD3 three times - in that order.
      final List<String> catalog = new ArrayList<String>();
      for (int n = 1; n <= 3; n++)
      {
         for (int repeat = 0; repeat < n; repeat++)
         {
            catalog.add(("0\t0\t0\t{'FIELD" + n + "':'VALUE" + n + "'}").replace(SINGLE_QUOTE, DOUBLE_QUOTE));
         }
      }

      final List<String> columns = Arrays.asList("FIELD1", "FIELD2", "FIELD3");

      final CatalogStats catalogStats = getStats(catalog, columns, 1000, 1);

      assertEquals(catalog.size(), catalogStats.getTotalDataLineCount());
      assertEquals(columns, catalogStats.getColumns());

      assertSingleValueColumn(catalogStats, "FIELD1", "VALUE1", 1);
      assertSingleValueColumn(catalogStats, "FIELD2", "VALUE2", 2);
      assertSingleValueColumn(catalogStats, "FIELD3", "VALUE3", 3);
   }

   /**
    * Asserts the stats of a column whose only value is {@code value}, seen {@code occurrences} times:
    * entry count, character count, the line and total counts of every character of the value, and
    * the first value sampling. {@code value} must consist of distinct characters.
    */
   private void assertSingleValueColumn(CatalogStats catalogStats, String column, String value, int occurrences)
   {
      CatalogColumnStats columnStats = catalogStats.getColumnStats(column);
      // Fetched before the assertions so that an empty sampling collection surfaces first.
      ValueSampling firstSampling = columnStats.getValueSamplings().iterator().next();

      assertEquals(occurrences, columnStats.getNumEntries());
      assertEquals(value.length() * occurrences, columnStats.getNumCharacters());
      for (char ch : value.toCharArray())
      {
         assertEquals(occurrences, columnStats.getLineAsciiStats().counts[(int) ch]);
      }
      for (char ch : value.toCharArray())
      {
         assertEquals(occurrences, columnStats.getTotalAsciiStats().counts[(int) ch]);
      }
      assertEquals(value, firstSampling.getValue());
      assertEquals(occurrences, firstSampling.getFrequency());
   }

   /**
    * Samples only 1 in every 3 lines of the catalog, skipping all other lines when doing the
    * statistics, and checks that only the sampled lines contribute to the per-column numbers.
    * <p>
    * Value samplings are sorted by value before individual entries are compared, so the test does
    * not depend on the iteration order of the collection returned by {@code getValueSamplings()}.
    */
   @Test
   public void sample1In3Lines() throws IOException, CatalogFormatException
   {

      final List<String> catalog = Arrays.asList(
         concat("1", "100", "100", swapQuotes("{'key1':0.1}")), // sample this line
         concat("1", "101", "101", swapQuotes("{'key2':2.3}")), // skip
         concat("1", "102", "102", swapQuotes("{'key2':3.4}")), // skip
         concat("1", "103", "103", swapQuotes("{'key1':3.88,'key3':5.666}")), // sample this line
         concat("1", "104", "104", swapQuotes("{'key3':7.8}")), // skip
         concat("1", "105", "105", swapQuotes("{'key3':9.0123456789}")) // skip
      );

      final List<String> columns = Arrays.asList("key1", "key2", "key3");

      CatalogStats catalogStats = getStats(catalog, columns, 1000, 3);

      // NOTE: Only two keys were sampled due to line sampling (1 in 3): key1, key3
      assertEquals(6, catalogStats.getTotalDataLineCount());
      assertEquals(2, catalogStats.getNumLinesSampled());
      assertEquals(columns, catalogStats.getColumns());

      // key1 was sampled twice: "0.1" and "3.88"
      CatalogColumnStats colStats = catalogStats.getColumnStats("key1");
      assertEquals(2, colStats.getNumEntries());
      assertEquals(7, colStats.getNumCharacters());
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '0']);
      assertEquals(2, colStats.getLineAsciiStats().counts[(int) '.']);
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '1']);
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '3']);
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '8']);
      //------------
      assertEquals(1, colStats.getTotalAsciiStats().counts[(int) '0']);
      assertEquals(2, colStats.getTotalAsciiStats().counts[(int) '.']);
      assertEquals(1, colStats.getTotalAsciiStats().counts[(int) '1']);
      assertEquals(1, colStats.getTotalAsciiStats().counts[(int) '3']);
      assertEquals(2, colStats.getTotalAsciiStats().counts[(int) '8']);
      //------------
      // Sorted so that index 0 / 1 are well defined ("0.1" sorts before "3.88").
      List<ValueSampling> samples = getSortedByValue(colStats.getValueSamplings());
      assertEquals(2, samples.size());
      assertEquals("0.1", samples.get(0).getValue());
      assertEquals(1, samples.get(0).getFrequency());
      assertEquals("3.88", samples.get(1).getValue());
      assertEquals(1, samples.get(1).getFrequency());

      // key2 was not sampled, so will not have any stats
      colStats = catalogStats.getColumnStats("key2");
      assertEquals(0, colStats.getNumEntries());
      assertEquals(0, colStats.getNumCharacters());
      samples = getSortedByValue(colStats.getValueSamplings());
      assertEquals(0, samples.size());

      // key3 was sampled one time: "5.666"
      colStats = catalogStats.getColumnStats("key3");
      assertEquals(1, colStats.getNumEntries());
      assertEquals(5, colStats.getNumCharacters());
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '5']);
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '.']);
      assertEquals(1, colStats.getLineAsciiStats().counts[(int) '6']);
      //------------
      assertEquals(1, colStats.getTotalAsciiStats().counts[(int) '5']);
      assertEquals(1, colStats.getTotalAsciiStats().counts[(int) '.']);
      assertEquals(3, colStats.getTotalAsciiStats().counts[(int) '6']);
      //------------
      samples = getSortedByValue(colStats.getValueSamplings());
      assertEquals(1, samples.size());
      assertEquals("5.666", samples.get(0).getValue());
      assertEquals(1, samples.get(0).getFrequency());

   }


   /** Builds stats for the dbSNP test catalog and checks the data line count and the column list. */
   @Test
   public void testDbSNP() throws Exception
   {
      final File dbSnpCatalog = new File("src/test/resources/catalogStats/dbsnp_catalog/All_dbSNP.tsv.bgz");
      final CatalogStats stats = new StatsBuilder().build(dbSnpCatalog, 1000);

      assertEquals(100, stats.getTotalDataLineCount());
      assertEquals(getColumnNames(dbSnpCatalog), stats.getColumns());
   }

   /**
    * Runs the subset comparison of {@code testDbSNPSubsetStartLineNumLines} for a series of
    * (start line, number of lines, one-in-X sampling) combinations on the dbSNP test catalog.
    */
   @Test
   public void testDbSNPSubset() throws Exception
   {
      final long defaultSampling = StatsBuilder.getOneInXSamplingDefault();

      // Each row is {startLine, numLines, oneInXSampling}; rows run top to bottom.
      final long[][] cases = {
         {1, 25, defaultSampling},
         {26, 25, defaultSampling},
         {51, 25, defaultSampling},
         {76, 25, defaultSampling},
         {12, 17, defaultSampling},
         {51, 2, defaultSampling},
         {80, 50, defaultSampling},

         // Other sampling rates only work for start lines that satisfy
         // (startLine - 1) % sampling == 0.
         {1, 25, 3},
         {25, 25, 4},
         {51, 25, 5},
         {1, 25, 100},
         {101, 25, 100},
         {13, 17, 3},
         {51, 2, 5},
         {81, 50, 4}
      };

      for (long[] testCase : cases)
      {
         testDbSNPSubsetStartLineNumLines(testCase[0], testCase[1], testCase[2]);
      }
   }

   /**
    * Checks that two ways of restricting the dbSNP test catalog to a line range give the same
    * printed stats for every column: (a) building stats from a catalog that holds only the lines of
    * the range, and (b) building stats from the complete catalog while passing the range to the
    * builder. The "Total Lines in file:" line is excluded from the comparison by writeStats.
    *
    * @param startLine      first line of the range
    * @param numLines       number of lines in the range
    * @param oneInXSampling line sampling rate handed to the builder in both runs
    */
   private void testDbSNPSubsetStartLineNumLines(long startLine, long numLines, long oneInXSampling) throws Exception
   {
      final File dbSnpCatalog = new File("src/test/resources/catalogStats/dbsnp_catalog/All_dbSNP.tsv.bgz");
      final List<String> rangeLines = getBgzipSubset(dbSnpCatalog, startLine, numLines);
      final List<String> columnNames = getColumnNames(dbSnpCatalog);

      // (a) catalog that contains just the range
      final CatalogStats statsOfRangeCatalog = getStats(rangeLines, columnNames, 1000, oneInXSampling);

      // (b) whole catalog, range given to the builder
      final List<String> everyLine = getBgzipSubset(dbSnpCatalog, 1, 0);
      final CatalogStats statsOfRangeArgs = getStats(everyLine, columnNames, 1000, oneInXSampling,
         startLine, numLines);

      final File columnsFile = new CatalogFiles(dbSnpCatalog).getColumnsFile();
      final Map<String, ColumnMetaData> metaDataByColumn = new ColumnMetaDataOperations(columnsFile).load();

      for (String columnName : statsOfRangeCatalog.getColumns())
      {
         assertTrue(filesSameContent(
            writeStats(statsOfRangeCatalog, metaDataByColumn, columnName),
            writeStats(statsOfRangeArgs, metaDataByColumn, columnName)));
      }
   }

   /**
    * Prints the stats of one column to a new temporary file and returns a copy of that file with
    * every line containing "Total Lines in file:" removed (that line would produce a diff).
    *
    * @param stats       stats to print
    * @param metaDataMap column metadata keyed by column name
    * @param column      name of the column to print
    * @return temporary file holding the filtered stats output
    * @throws IOException if a temporary file cannot be created, written or read
    */
   private File writeStats(CatalogStats stats, Map<String, ColumnMetaData> metaDataMap, String column)
      throws IOException
   {
      ColumnMetaData metaData = metaDataMap.get(column);

      File fullStatsFile = temporaryFolder.newFile();
      FileWriter fileWtr = new FileWriter(fullStatsFile);
      try
      {
         PrintWriter statsWriter = new PrintWriter(fileWtr);
         StatsPrinter.printStats(statsWriter, stats, column, metaData);
         // Push everything through the wrapper before the underlying writer is closed.
         statsWriter.flush();
      }
      finally
      {
         // Closed here so the file handle is released even when printing fails.
         fileWtr.close();
      }
      // Go through and remove line from output that has "Total Lines in file:" that will produce a diff.
      // All else should be cool
      return removeLinesWithString(fullStatsFile, "Total Lines in file:");
   }

   /**
    * Copies a text file line by line into a new temporary file, leaving out every line that
    * contains the given string.
    *
    * @param f             file to read
    * @param excludeString lines containing this string are not copied
    * @return the new temporary file
    * @throws IOException if the temporary file cannot be created or either file cannot be accessed
    */
   private File removeLinesWithString(File f, String excludeString) throws IOException
   {
      File fileWithoutString = temporaryFolder.newFile();
      PrintWriter fileWithoutStringWriter = new PrintWriter(new FileWriter(fileWithoutString));
      try
      {
         BufferedReader reader = new BufferedReader(new FileReader(f));
         try
         {
            String line;
            while ((line = reader.readLine()) != null)
            {
               if (!line.contains(excludeString))
               {
                  fileWithoutStringWriter.println(line);
               }
            }
         }
         finally
         {
            reader.close();
         }
      }
      finally
      {
         // Runs even if opening or reading the source fails, so no handle is left open.
         fileWithoutStringWriter.close();
      }
      return fileWithoutString;
   }

   /**
    * Reads the catalog and returns, in file order, the lines for which
    * {@code ChunkUtils.inTargetChunk(lineNumber, startLine, numLines)} is true. Line numbers start
    * at 1.
    *
    * @param catalogData catalog file to read
    * @param startLine   start line passed on to {@code ChunkUtils.inTargetChunk}
    * @param numLines    number of lines passed on to {@code ChunkUtils.inTargetChunk}
    *                    (callers in this class pass 0 to get all lines - presumably 0 means "no limit")
    * @return the selected lines
    * @throws IOException if the catalog cannot be read
    */
   private List<String> getBgzipSubset(File catalogData, long startLine, long numLines) throws IOException
   {
      BufferedReader catalogRdr = CatalogFileUtils.getBufferedReader(catalogData.getAbsolutePath());
      assertNotNull(catalogRdr);
      List<String> linesInRegion = new ArrayList<String>();
      try
      {
         String line;
         long lineCount = 0;
         while ((line = catalogRdr.readLine()) != null)
         {
            lineCount++;
            if (ChunkUtils.inTargetChunk(lineCount, startLine, numLines))
            {
               linesInRegion.add(line);
            }
         }
      }
      finally
      {
         // The reader was previously never closed; release it whether or not reading succeeded.
         catalogRdr.close();
      }
      return linesInRegion;
   }


   /**
    * Tells whether two files have the same content; delegates to {@code FileUtils.contentEquals}.
    *
    * @param f1 first file
    * @param f2 second file
    * @return the result of {@code FileUtils.contentEquals(f1, f2)}
    * @throws IOException if the comparison fails with an I/O error
    */
   private boolean filesSameContent(File f1, File f2) throws IOException
   {
      return FileUtils.contentEquals(f1, f2);
   }

   /**
    * Builds stats for the given catalog lines using the builder's default start line and default
    * number of lines.
    *
    * @param catalogLines   data lines of the catalog
    * @param catalogColumns column names for the catalog's columns file
    * @param maxValues      value count handed to the builder
    * @param oneInXSamping  line sampling rate handed to the builder
    * @return the stats produced by the builder
    */
   private CatalogStats getStats(List<String> catalogLines, List<String> catalogColumns, int maxValues, long oneInXSamping)
      throws IOException, CatalogFormatException
   {
      final long defaultStartLine = StatsBuilder.getStartLineDefault();
      final long defaultNumLines = StatsBuilder.getNumLinesDefault();
      return getStats(catalogLines, catalogColumns, maxValues, oneInXSamping, defaultStartLine, defaultNumLines);
   }

   /**
    * Writes the given lines and column names as a catalog (a ".tsv.bgz" data file plus a
    * ".columns.tsv" file sharing one base name) into a fresh temporary folder and builds stats
    * from it.
    *
    * @param catalogLines   data lines of the catalog
    * @param catalogColumns column names for the catalog's columns file
    * @param maxValues      value count handed to the builder
    * @param oneInXSamping  line sampling rate handed to the builder
    * @param startLine      start line handed to the builder
    * @param numLines       number of lines handed to the builder
    * @return the stats produced by the builder
    */
   private CatalogStats getStats(List<String> catalogLines, List<String> catalogColumns, int maxValues, long oneInXSamping,
                                 long startLine, long numLines)
      throws IOException, CatalogFormatException
   {
      final File catalogDir = temporaryFolder.newFolder();
      final String baseName = "data" + System.currentTimeMillis();

      final File catalogData = new File(catalogDir, baseName + ".tsv.bgz");
      final File catalogColumnsFile = new File(catalogDir, baseName + ".columns.tsv");
      writeTempBGZIP(catalogData, catalogLines);
      writeCatalogColumnsFile(catalogColumnsFile, catalogColumns);

      return new StatsBuilder().build(catalogData, maxValues, oneInXSamping, startLine, numLines);
   }

   /**
    * Writes the column names to the given file with {@code FileUtils.writeLines}, one list
    * element per line.
    *
    * @param f       file to write
    * @param columns column names to write
    * @throws IOException if the file cannot be written
    */
   private void writeCatalogColumnsFile(File f, List<String> columns) throws IOException
   {
      FileUtils.writeLines(f, columns);
   }

   /**
    * Writes a BGZIP file with the given data lines. Each line is followed by the platform line
    * separator and encoded with the platform default charset.
    *
    * @param bgzip file to create
    * @param lines TAB data lines
    * @throws IOException if the file cannot be written
    */
   private void writeTempBGZIP(File bgzip, List<String> lines) throws IOException
   {
      BlockCompressedOutputStream bgzStream = new BlockCompressedOutputStream(bgzip);
      try
      {
         for (String line : lines)
         {
            String s = String.format("%s%n", line);
            bgzStream.write(s.getBytes());
         }
      }
      finally
      {
         // Close in all cases so a failed write does not leave the stream open.
         bgzStream.close();
      }
   }

   /**
    * Reads the columns file that belongs to the catalog and returns the column names in file order,
    * leaving out names that start with "_" (the golden attributes).
    *
    * @param bgzipFile Catalog bgz file
    * @return List of column names (except for golden attrs)
    * @throws CatalogFormatException
    * @throws IOException
    */
   private List<String> getColumnNames(File bgzipFile) throws CatalogFormatException, IOException
   {
      final File columnsFile = new CatalogFiles(bgzipFile).getColumnsFile();
      final List<String> names = new ArrayList<String>();
      for (ColumnMetaData metaData : new ColumnMetaDataOperations(columnsFile).loadAsList())
      {
         final String name = metaData.getColumnName();
         if (name.startsWith("_"))
         {
            continue;
         }
         names.add(name);
      }
      return names;
   }
}