package edu.mayo.bior.cli.cmd;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PushbackInputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Options;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.log4j.Logger;
import org.h2.value.CompareMode;

import edu.mayo.bior.util.FilenameComparatorByNumericPart;
import edu.mayo.cli.CommandLineApp;
import edu.mayo.cli.CommandPlugin;
import edu.mayo.pipes.util.test.PipeTestUtils;

/** Merges multiple vcf files together.  
 *  Can match on file patterns.  For example:  my.*.vcf would match
 *    my.1.vcf
 *    my.2.vcf
 *    ...
 *    my.23.vcf
 *  And would resolve all of those file names and piece them together into one final vcf
 *  
 *  Using the "--nozip" flag will cause it to output plain-text tsv rather than gzip'd tsv.
 *  
 *  Usage:  MergeCommand  [--nozip]  <OutputFile>  <InputVcfFileOrPattern>
 *  
 *  Example:
 *  	MergeCommand  my.vcf.out.gz  my.vcf  your.vcf.*.out.gz
 *  	MergeCommand  --nozip  my.vcf.out.gz  my.vcf.out.1.gz  my.vcf.out.2.gz  my.vcf.out.3.gz
 *  
 * @author Michael Meiners (m054457)
 * 2014-04-04
 *
 */
public class MergeCommand implements CommandPlugin
{
	private static Logger sLogger = Logger.getLogger(MergeCommand.class);

	private static final char OPTION_NO_ZIP = 'n';
	private static final char OPTION_OUTPUT_FILE = 'o';
	private static final char OPTION_KEEP_DUPLICATES = 'k';
	private static final char OPTION_SORT_INPUT_FILES = 's';

	/** Chromosome prefix that is ignored (case-insensitively) when grouping and ordering chromosomes */
	private static final String CHROM_PREFIX = "chr";
	/** Number of bytes read from a file at a time while scanning it for chromosome ranges */
	private static final int READ_BUFFER_SIZE = 1024 * 1024;
	/** Initial capacity of the per-line buffer used while scanning; the buffer grows on demand */
	private static final int INITIAL_LINE_BUFFER_SIZE = 64 * 1024;

	/** No initialization is required for this command. */
	public void init(Properties arg0) throws Exception {
	}

	/** Reads the command line flags and the list of input files, validates the inputs, then merges them.
	 *  @param cmdLine  Parsed command line; the remaining (non-option) arguments are the input vcf paths
	 *  @param options  Not used by this command
	 *  @throws IllegalArgumentException if no input file is given, an input file does not exist,
	 *          or no output file is given
	 */
	public void execute(CommandLine cmdLine, Options options) throws Exception {
		String outfile = cmdLine.getOptionValue(OPTION_OUTPUT_FILE);
		boolean isZipOutput = ! cmdLine.hasOption(OPTION_NO_ZIP);
		boolean isKeepDuplicates = cmdLine.hasOption(OPTION_KEEP_DUPLICATES);
		boolean isSortInputFileList = cmdLine.hasOption(OPTION_SORT_INPUT_FILES);

		sLogger.info("Output file: " + outfile);
		sLogger.info("Should output be zipped?: " + isZipOutput);
		sLogger.info("Sort the input files?: " + isSortInputFileList);

		List<String> inputVcfs = cmdLine.getArgList();
		logList("Input files (unsorted): ", inputVcfs);

		if( isSortInputFileList ) {
			Collections.sort(inputVcfs, new FilenameComparatorByNumericPart());
			logList("Sorted input file list: ", inputVcfs);
		}

		verifyInputFiles(inputVcfs);
		sLogger.info("Input files have been resolved");

		mergeVcfs(inputVcfs, outfile, isZipOutput, isKeepDuplicates);
		sLogger.info("Done.");
	}

	/** Logs the message followed by one indented line per list entry. */
	private void logList(String msg, List<String> inputVcfs) {
		sLogger.info(msg);
		for(String input : inputVcfs)
			sLogger.info("  " + input);
	}

	/** Fails fast if the input list is empty or any listed file does not exist.
	 *  @throws IllegalArgumentException describing the first problem found
	 */
	private static void verifyInputFiles(List<String> inputVcfs) {
		if( inputVcfs == null || inputVcfs.size() == 0 ) {
			throw new IllegalArgumentException("No input vcf files were specified");
		}

		for(String inputVcf : inputVcfs) {
			if( ! new File(inputVcf).exists() ) {
				throw new IllegalArgumentException("Input vcf file does not exist: " + inputVcf);
			}
		}
	}

	/** Merges the data lines of all input files into one output file.
	 *  The merged header is written first.  Each input is then scanned for ranges (runs of lines on
	 *  one chromosome with non-decreasing positions), and for each chromosome the ranges of all
	 *  files are merged line by line in position order.
	 *  @param inputVcfPaths     Paths of the files to merge (plain text or gzip'd)
	 *  @param outputPath        Path of the file to write
	 *  @param isZipOutput       true to gzip the output, false to write plain text
	 *  @param isKeepDuplicates  false to drop a line that is identical to the line written just before it
	 *  @throws IllegalArgumentException if outputPath is null or empty
	 */
	public void mergeVcfs(List<String> inputVcfPaths, String outputPath, boolean isZipOutput, boolean isKeepDuplicates) throws Exception {
		// Report a missing output path clearly instead of failing with a bare NullPointerException
		if( outputPath == null || outputPath.length() == 0 )
			throw new IllegalArgumentException("No output file was specified");

		BufferedWriter fout = null;
		List<InputVcf> inVcfs = null;

		try {
			fout = isZipOutput
					?  new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(outputPath))))
					:  new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath)));

			List<String> headers = getAllHeadersInOrder(inputVcfPaths);
			writeHeaders(headers, fout);

			sLogger.info("\nScanning all files to find chromosome ranges...");
			List<ChromRange> allChromRanges = getAllChromRangesAllFilesSorted(inputVcfPaths);

			sLogger.info("\nNow merging chromosome ranges into the output file...");
			List<String> chromosomes = getChroms(allChromRanges);
			for( String chrom : chromosomes ) {
				List<ChromRange> chromRanges = getRangesForChrom(chrom, allChromRanges);
				sLogger.info("Processing chromosome: " + chrom);
				sLogger.info("   Ranges for chrom: " + chromRanges);
				inVcfs = getInputVcfs(chromRanges);
				openVcfs(inVcfs);
				String lastLine = "";
				while( isAVcfOpenYet(inVcfs) ) {
					// Sort the InputVcf objects so that the next line is the next in the series
					Collections.sort(inVcfs);
					String nextLine = inVcfs.get(0).popLine();
					// Output the line if we are keeping duplicates,
					// OR if discarding duplicates but the current line is different from the last
					if( isKeepDuplicates || ! nextLine.equals(lastLine) ) {
						fout.write(nextLine);
						fout.newLine();
					}
					lastLine = nextLine;
					removeSpentVcfs(inVcfs);
				}
			}
		} catch(Exception e) {
			System.err.println("Error merging vcfs: " + e.getMessage());
			throw e;
		} finally {
			// Nested so that a failure closing one side never prevents the other side from being closed
			try {
				closeAllVcfs(inVcfs);
			} finally {
				if( fout != null )
					fout.close();
			}
		}
	}

	/** Opens every reader in the list, positioning each one on its first line. */
	private void openVcfs(List<InputVcf> inVcfs) throws FileNotFoundException, IOException {
		for(InputVcf vcf : inVcfs) {
			vcf.open();
		}
	}

	/** Removes the readers that have no more lines to offer.  Walks backwards so removal does not shift unvisited entries. */
	private void removeSpentVcfs(List<InputVcf> inVcfs) {
		for(int i = inVcfs.size() - 1; i >=0; i--) {
			if( ! inVcfs.get(i).isOpen() )
				inVcfs.remove(i);
		}
	}

	/** Creates one (not yet opened) reader per range, limited to that range's starting offset and line count. */
	private List<InputVcf> getInputVcfs(List<ChromRange> chromRanges) {
		List<InputVcf> inputVcfList = new ArrayList<InputVcf>();
		for(ChromRange chromRange : chromRanges) {
			InputVcf inVcf = new InputVcf(chromRange.filePath, chromRange.filePositionFirst, chromRange.numLines);
			inputVcfList.add(inVcf);
		}
		return inputVcfList;
	}

	/** Collects the header lines (the leading lines starting with "#") of all input files.
	 *  @return the "##fileformat" line (if any) first, then the distinct "##" lines sorted
	 *          case-insensitively, then the column header line (if any).  When several files carry
	 *          a "##fileformat" or column header line, the one from the last such file is used.
	 */
	private List<String> getAllHeadersInOrder(List<String> inputVcfPaths) throws IOException {
		String fileFormatLine = null;
		String colHeaderLine = null;
		List<String> headerLines = new ArrayList<String>();
		// Tracks the "##" lines already collected so that duplicates are detected without rescanning the list
		HashSet<String> seenHeaders = new HashSet<String>();

		// Get all header lines from all files
		for(String vcfPath : inputVcfPaths) {
			BufferedReader fin = new BufferedReader(new InputStreamReader(openInputStream(vcfPath)));
			try {
				String line = null;
				while( (line = fin.readLine()) != null  &&  line.startsWith("#") ) {
					if( line.toLowerCase().startsWith("##fileformat") )
						fileFormatLine = line;
					else if( line.startsWith("##") ) {
						if( seenHeaders.add(line) )
							headerLines.add(line);
					}
					else
						colHeaderLine = line;
				}
			} finally {
				fin.close();
			}
		}

		// Sort the header lines, then add the "##fileformat" to beginning and "#CHROM" column header line to end
		Collections.sort(headerLines, String.CASE_INSENSITIVE_ORDER);
		if( fileFormatLine != null && fileFormatLine.length() > 0 )
			headerLines.add(0, fileFormatLine);
		if( colHeaderLine != null && colHeaderLine.length() > 0 )
			headerLines.add(colHeaderLine);

		return headerLines;
	}

	/** Writes each header on its own line. */
	private void writeHeaders(List<String> headers, BufferedWriter fout) throws IOException {
		for(String header : headers) {
			fout.write(header);
			fout.newLine();
		}
	}

	/** Returns the distinct chromosomes (lower-cased) in the order in which they first appear in the given ranges. */
	private List<String> getChroms(List<ChromRange> allChromRanges) {
		List<String> chroms = new ArrayList<String>();
		HashSet<String> seenChroms = new HashSet<String>();
		for(ChromRange range : allChromRanges) {
			String chrom = range.chrom.toLowerCase();
			if( seenChroms.add(chrom) )
				chroms.add(chrom);
		}
		return chroms;
	}

	/** Returns the ranges whose chromosome equals the given one, ignoring case, in their original order. */
	private List<ChromRange> getRangesForChrom(String chrom, List<ChromRange> allChromRanges) {
		List<ChromRange> matchingRanges = new ArrayList<ChromRange>();
		for(ChromRange range : allChromRanges) {
			if( chrom.equalsIgnoreCase(range.chrom) )
				matchingRanges.add(range);
		}
		return matchingRanges;
	}

	/** Closes every reader in the list (null is tolerated).
	 *  All readers are attempted even if one fails; the first failure is then rethrown.
	 */
	private void closeAllVcfs(List<InputVcf> inputVcfs) throws IOException {
		if( inputVcfs == null )
			return;
		IOException firstFailure = null;
		for(InputVcf vcf : inputVcfs) {
			try {
				vcf.close();
			} catch(IOException e) {
				if( firstFailure == null )
					firstFailure = e;
			}
		}
		if( firstFailure != null )
			throw firstFailure;
	}

	/** Returns true if at least one reader still has a line to offer. */
	private boolean isAVcfOpenYet(List<InputVcf> inputVcfs) {
		for(InputVcf vcf : inputVcfs) {
			if( vcf.isOpen() )
				return true;
		}
		// None open
		return false;
	}

	/** Checks whether the file starts with the gzip magic number (0x1f 0x8b).
	 *  @return true if it does; false if it does not, or if the file could not be read
	 *          (in which case the problem is reported on stderr)
	 */
	private boolean isZippedInput(String filePath) {
		FileInputStream fin = null;
		try {
			fin = new FileInputStream(filePath);
			// read() yields 0-255, or -1 at end of file, so a file shorter than two bytes never matches
			int firstByte  = fin.read();
			int secondByte = fin.read();
			return firstByte == 0x1f  &&  secondByte == 0x8b;
		} catch(Exception e) {
			System.err.println(e.getMessage());
			return false;
		} finally {
			// Always release the handle: this check runs several times for every input file
			if( fin != null ) {
				try {
					fin.close();
				} catch(IOException ignored) {
					// Nothing useful can be done if closing a read-only probe fails
				}
			}
		}
	}

	/** Opens the file as a stream of (uncompressed) bytes, un-gzipping it if it carries the gzip magic number. */
	private InputStream openInputStream(String filePath) throws IOException {
		boolean isZipped = isZippedInput(filePath);
		FileInputStream fileIn = new FileInputStream(filePath);
		if( ! isZipped )
			return fileIn;
		try {
			return new GZIPInputStream(fileIn);
		} catch(IOException e) {
			// The gzip header could not be read; do not leave the underlying file open
			fileIn.close();
			throw e;
		}
	}

	/** Skips the given number of bytes, or up to the end of the stream if that comes first.
	 *  InputStream.skip() is allowed to skip fewer bytes than requested, so it is retried;
	 *  when it makes no progress a single byte is read to tell "slow" apart from "end of stream".
	 */
	private static void skipBytes(InputStream in, long byteCount) throws IOException {
		long remaining = byteCount;
		while( remaining > 0 ) {
			long skipped = in.skip(remaining);
			if( skipped > 0 ) {
				remaining -= skipped;
			} else if( in.read() == -1 ) {
				return;
			} else {
				remaining--;
			}
		}
	}

	/** Get all chromosome ranges within all files, then sort them */
	List<ChromRange> getAllChromRangesAllFilesSorted(List<String> inputVcfs) throws IOException {
		List<ChromRange> chromRangeList = new ArrayList<ChromRange>();
		for(String vcfPath : inputVcfs) {
			chromRangeList.addAll(getAllChromRangesFromFile(vcfPath));
		}

		Collections.sort(chromRangeList);
		return chromRangeList;
	}

	/** Scans one file and splits its data lines into ranges.
	 *  A new range starts whenever the chromosome changes or the position decreases.
	 *  Empty lines and lines starting with "#" are ignored.  The file is read as bytes so that
	 *  each range can record the byte offset (within the uncompressed content) of its first line.
	 *  @throws IllegalArgumentException if a data line lacks a chromosome and a numeric position
	 */
	private List<ChromRange> getAllChromRangesFromFile(String vcfPath) throws IOException {
		List<ChromRange> chromRangesInFile = new ArrayList<ChromRange>();
		sLogger.info("Finding chromosome ranges within file: " + vcfPath);
		InputStream fin = openInputStream(vcfPath);
		try {
			byte[] fileBuffer = new byte[READ_BUFFER_SIZE];
			byte[] lineBuffer = new byte[INITIAL_LINE_BUFFER_SIZE];
			int lineLen = 0;
			long filePos = 0;
			long fileposLineStart = 0;
			ChromRange chromRange = null;
			int bufferLen = -1;

			while( (bufferLen = fin.read(fileBuffer)) != -1 ) {
				for(int i=0; i < bufferLen; i++) {
					byte nextByte = fileBuffer[i];
					if( nextByte == '\r' || nextByte == '\n' ) {
						chromRange = addLineToRanges(lineBuffer, lineLen, fileposLineStart, vcfPath, chromRange, chromRangesInFile);
						lineLen = 0;
						fileposLineStart = filePos + 1;
					} else {
						// Grow rather than overflow when a line is longer than the buffer
						if( lineLen == lineBuffer.length )
							lineBuffer = Arrays.copyOf(lineBuffer, lineBuffer.length * 2);
						lineBuffer[lineLen++] = nextByte;
					}
					filePos++;
				}
			}

			// The last line may not be terminated by a newline; it gets the same filtering as every other line
			addLineToRanges(lineBuffer, lineLen, fileposLineStart, vcfPath, chromRange, chromRangesInFile);
		} finally {
			fin.close();
		}
		return chromRangesInFile;
	}

	/** Accounts for one scanned line: either extends the current range or starts a new one.
	 *  @param lineBuffer    Buffer holding the bytes of the line
	 *  @param lineLen       Number of valid bytes in lineBuffer
	 *  @param lineStartPos  Byte offset of the start of the line within the uncompressed content
	 *  @param vcfPath       File the line came from
	 *  @param currentRange  Range the previous data line belongs to, or null if there is none yet
	 *  @param rangesInFile  List that receives any newly started range
	 *  @return the range the line was added to; currentRange unchanged if the line was ignored
	 */
	private ChromRange addLineToRanges(byte[] lineBuffer, int lineLen, long lineStartPos, String vcfPath,
			ChromRange currentRange, List<ChromRange> rangesInFile)
	{
		String text = new String(lineBuffer, 0, lineLen);
		if( text.length() == 0 || text.startsWith("#") )
			return currentRange;

		NextLine line = new NextLine(text, lineStartPos);
		ChromRange range = currentRange;
		if( range == null  ||  ! range.chrom.equalsIgnoreCase(line.chrom)  ||  range.startLast > line.start ) {
			sLogger.info("  Range-last: " + range);
			range = new ChromRange(line.chrom, line.start, line.start, vcfPath, line.filePos, 0);
			sLogger.info("  Range-new:  " + range);
			rangesInFile.add(range);
		} else {  // Update the existing ChromRange object with the new last position
			range.startLast = line.start;
		}
		range.numLines++;
		return range;
	}

	/** Removes a leading "chr" (in any letter case) from the chromosome name, if present. */
	private static String stripChromPrefix(String chrom) {
		if( chrom.regionMatches(true, 0, CHROM_PREFIX, 0, CHROM_PREFIX.length()) )
			return chrom.substring(CHROM_PREFIX.length());
		return chrom;
	}

	/** Compare chromosomes the way nature intended:
	 *  Remove "chr" from the beginning of any chrom, then if both are ints, then compare as ints, instead of strings.
	 *  A numeric chromosome always sorts before a non-numeric one, and two non-numeric ones are compared
	 *  as strings ignoring case.  Keeping the numeric names in a group of their own is what makes this a
	 *  consistent ordering: comparing "10" and "1_random" as strings while comparing "2" and "10" as
	 *  numbers would give 2 < 10 < 1_random < 2, which a sort cannot honor.
	 */
	private int compareChrom(String chrom1, String chrom2) {
		String name1 = stripChromPrefix(chrom1);
		String name2 = stripChromPrefix(chrom2);
		boolean isNumeric1 = isInt(name1);
		boolean isNumeric2 = isInt(name2);

		if( isNumeric1 && isNumeric2 )
			return Integer.valueOf(name1).compareTo(Integer.valueOf(name2));
		if( isNumeric1 )
			return -1;
		if( isNumeric2 )
			return 1;
		return name1.compareToIgnoreCase(name2);
	}

	/** Parses a position column.
	 *  @return the position, or null if the text (ignoring surrounding whitespace) is not a whole number
	 */
	private static Long parsePosition(String text) {
		try {
			return Long.valueOf(text.trim());
		} catch(NumberFormatException e) {
			return null;
		}
	}



	// ====================================================================================================================


	/** Reader over one input file (or over one range of lines within it) that always exposes the
	 *  next unread line, so that several readers can be ordered by the line each would produce next.
	 */
	class InputVcf implements Comparable<InputVcf> {
		private String filePath;
		private BufferedReader fin;
		private String currentLine;
		// If this is set to 0 it will read until the end of the file.
		// Else, only read to maxLinesToRead, then close.
		private long maxLinesToRead = 0;
		// Byte offset, within the uncompressed content, at which reading starts
		private long startingFilePos = 0;
		private long numLinesRead = 0;


		/** Reads the whole file, from its first line to its last. */
		public InputVcf(String filePath) {
			this.filePath = filePath;
		}

		/** Reads a range of the file.
		 *  @param filePath         File to read
		 *  @param startingFilePos  Byte offset (within the uncompressed content) of the first line to read
		 *  @param maxLinesToRead   Number of data lines in the range, or 0 to read to the end of the file
		 */
		public InputVcf(String filePath, long startingFilePos, long maxLinesToRead) {
			this.filePath = filePath;
			this.maxLinesToRead = maxLinesToRead;
			this.startingFilePos = startingFilePos;
		}

		/** Orders readers by the line each would produce next: by chromosome, then by position.
		 *  A reader with no line left sorts after every reader that still has one.
		 */
		public int compareTo(InputVcf otherVcf) {
			if( this.currentLine == null && otherVcf.currentLine == null )
				return 0;
			if( this.currentLine == null )
				return 1;
			if( otherVcf.currentLine == null )
				return -1;

			// Split the line by tabs, then sort by first column (chromosome), then by 2nd column (position - numerically)
			String[] partsThis  = this.currentLine.split("\t");
			String[] partsOther = otherVcf.currentLine.split("\t");
			Long positionThis  = partsThis.length  < 2  ?  null  :  parsePosition(partsThis[1]);
			Long positionOther = partsOther.length < 2  ?  null  :  parsePosition(partsOther[1]);

			// If there are not two columns to compare, or if the 2nd column is NOT a whole number, then just do a full string compare on the entire rows
			if( positionThis == null || positionOther == null )
				return this.currentLine.compareToIgnoreCase(otherVcf.currentLine);

			// If the first column (chrom) is NOT the same, then return the lesser of the two
			int chromCompare = compareChrom(partsThis[0], partsOther[0]);
			if( chromCompare != 0 )
				return chromCompare;

			return positionThis.compareTo(positionOther);
		}

		/** Opens the file, moves to the starting offset and loads the first line. */
		public void open() throws FileNotFoundException, IOException {
			InputStream rawIn = openInputStream(filePath);
			try {
				// The offset was measured in bytes, so skip bytes before any character decoding takes place
				skipBytes(rawIn, startingFilePos);
			} catch(IOException e) {
				rawIn.close();
				throw e;
			}
			fin = new BufferedReader(new InputStreamReader(rawIn));
			currentLine = readNextLine();
			numLinesRead++;
			// Nothing to read at all: release the file now, as a spent reader is dropped without being closed
			if( currentLine == null )
				close();
		}

		public void close() throws IOException {
			if( fin != null )
				fin.close();
		}

		/** Reads the next line from the file, or null at the end of the file.
		 *  When reading a range, empty lines and lines starting with "#" are passed over, because the
		 *  scan that measured the range did not count them either.
		 */
		private String readNextLine() throws IOException {
			String line = fin.readLine();
			if( maxLinesToRead != 0 ) {
				while( line != null  &&  (line.length() == 0 || line.startsWith("#")) )
					line = fin.readLine();
			}
			return line;
		}

		/** Pull the next line from the file
		 * @return the line that was current before the call, or null if there was none
		 * @throws IOException */
		public String popLine() throws IOException {
			String line = currentLine;

			// If we've read up to the max or the currentLine is null,
			// Then close the file and set next line to null
			if( (maxLinesToRead != 0  &&  numLinesRead >= maxLinesToRead)  ||  currentLine == null ) {
				close();
				currentLine = null;
			} else if( fin != null ) {
				currentLine = readNextLine();
				numLinesRead++;
				// End of file: release the file now, as a spent reader is dropped without being closed
				if( currentLine == null )
					close();
			}

			return line;
		}

		/** Returns true while this reader still has a line to offer. */
		public boolean isOpen() {
			return currentLine != null;
		}
	}

	/** Mapping between file position and chromosome and start.
	 *  This will allow us to quickly jump to the position within the file where the line is located
	 */
	class ChromRange implements Comparable<ChromRange> {
		public String chrom;
		// The first starting position within the range
		public long startFirst;
		// The last starting position within the range
		public long startLast;
		public String filePath;
		// The file position for the first line within the range
		public long filePositionFirst;
		// The number of lines within this range
		public long numLines;

		public ChromRange(String chrom, long startFirst, long startLast, String filePath, long fileposLineFirst, long numLines) {
			this.chrom = chrom;
			this.startFirst = startFirst;
			this.startLast  = startLast;
			this.filePath   = filePath;
			this.filePositionFirst = fileposLineFirst;
			this.numLines  = numLines;
		}

		public String toString() {
			return "[" + chrom + ":" + startFirst + "-" + startLast + ", " + filePath + ":" + filePositionFirst + " (" + numLines + " lines)]";
		}

		/** Compare by chromosome, then by starting position */
		public int compareTo(ChromRange other) {
			int chromCompare = compareChrom(this.chrom, other.chrom);
			if( chromCompare != 0 )
				return chromCompare;

			return Long.valueOf(this.startFirst).compareTo(Long.valueOf(other.startFirst));
		}
	}

	/** Returns true if the string can be parsed as an int. */
	private boolean isInt(String s) {
		try {
			Integer.parseInt(s);
			return true;
		} catch(Exception e) {
			return false;
		}
	}


	/** The chromosome, position and file offset of one data line found while scanning a file. */
	class NextLine {
		// Byte offset of the start of the line within the uncompressed content
		public long filePos;
		// Upper-cased, with the "chr" removed from the front if it exists
		public String chrom;
		public long start;

		/** Parses the first two tab-delimited columns of the line.
		 *  @throws IllegalArgumentException if the line has fewer than two columns, or if the
		 *          second column is not a whole number (as a NumberFormatException)
		 */
		public NextLine(String line, long filePos) {
			this.filePos = filePos;
			String[] parts = line.split("\t");
			if( parts.length < 2 )
				throw new IllegalArgumentException("Expected at least two tab-delimited columns (chromosome, position) but found: " + line);
			this.chrom = stripChromPrefix(parts[0].trim()).trim().toUpperCase();
			this.start = Long.parseLong(parts[1].trim());
		}

		public String toString() {
			return "[" + chrom + ":" + start + " (file:" + filePos + ")]";
		}
	}
}
