package edu.mayo.bior.catalog.verification;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


import com.google.gson.JsonObject;

import edu.mayo.bior.catalog.GoldenAttribute;

/** Verifies the ref allele in the golden JSON against the reference assembly, and checks that every alt allele is a valid allele and differs from the ref allele. */
public class CatalogVariantVerifier
{
   // SLF4J logger for this class.
   private static final Logger sLogger = LoggerFactory.getLogger(CatalogVariantVerifier.class);
   // Chromosome name for which the constructor skips setting up the reference sequence lookup.
   public static final String UNKNOWN_CHR = "UNKNOWN";

   // Reference sequence lookup; stays null when the constructor does not (or cannot) create it,
   // in which case verify() skips the comparison against the reference assembly.
   private SeqLookup mSequenceLookup = null;
   // Destination for the info, warning and error messages produced by this verifier.
   private MessageLogger mLogger;
   // Running totals updated by the comparison against the reference assembly.
   private long mNumberRefAllelesMatch = 0;
   private long mNumberRefAllelesNotMatch = 0;

   // Golden attribute names, used when building log messages.
   private static final String GOLDEN_REF_ALLELE = GoldenAttribute._refAllele.name();
   private static final String GOLDEN_ALT_ALLELES = GoldenAttribute._altAlleles.name();
   private static final String GOLDEN_STRAND = GoldenAttribute._strand.name();

   //
   // Anchored pattern for an allele: one or more of the uppercase characters in the brackets.
   // An empty string is not a valid allele, but a single nucleotide such as "A" or "G" is.
   // Multi-character sequences are validated by the same pattern, so no per-character loop
   // is needed.
   private static final Pattern DNA_NUCLEOTIDES = Pattern.compile("^[ACGTN]{1,}+$");

   public CatalogVariantVerifier(HumanReferenceInfo humanReferenceInfo,
                                 String chromosomeRefSeqToLoad, MessageLogger logger)
   {
      mLogger = logger;
      if (UNKNOWN_CHR.equals(chromosomeRefSeqToLoad))
      {
         logInfo("Will not verify reference allele values for chromosome " + UNKNOWN_CHR);
         return;
      }
      try
      {
         if (humanReferenceInfo != null && humanReferenceInfo.getChrSizeMap() != null &&
            humanReferenceInfo.getRefseqFileNm() != null)
         {
            mSequenceLookup = new SeqLookup(humanReferenceInfo.getRefseqFileNm());
         }
      }
      catch (IOException e)
      {
         String msg = "Unable to successfully initialize reference sequence lookup utility for chromosome '" +
            chromosomeRefSeqToLoad + "'. Will not be verifying reference allele values.  Exception: " + e.getMessage();
         logWarning(msg);
      }
   }

   /**
    * Runs the allele checks for one catalog row.
    * <p>
    * Rows whose golden JSON is not a variant are ignored. Otherwise the ref and alt alleles are
    * validated, an error is logged when the ref allele is blank, and, when both min and max
    * positions are present, the ref allele length is compared with the position span (skipped
    * for structural alts) and the ref allele is compared with the reference assembly if a
    * sequence lookup is available.
    *
    * @param catalogRowJson    the full catalog row, used for the REF comparison and in messages
    * @param goldenJsonVariant the golden attributes extracted from the row
    */
   public void verify(JsonObject catalogRowJson, CatalogEntryGoldenJson goldenJsonVariant) throws NumberFormatException, IOException
   {
      if (!isVariant(goldenJsonVariant))
      {
         return;
      }

      final String refAllele = goldenJsonVariant.getRefAllele();
      verifyGoldenJsonRefAllele(refAllele, catalogRowJson);
      verifyGoldenJsonAltAllele(goldenJsonVariant.getAltAlleles(), catalogRowJson, refAllele);

      if (StringUtils.isBlank(refAllele))
      {
         logError(GOLDEN_REF_ALLELE + " value is empty for variant for chr: " + goldenJsonVariant.getChr() +
               " position: " + goldenJsonVariant.getMinBP(), VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_MISSING_REF_ALLELE);
         return;
      }

      // The remaining checks need both ends of the position range.
      if (goldenJsonVariant.getMinBP() == null || goldenJsonVariant.getMaxBP() == null)
      {
         return;
      }

      final long jsonGoldenVariantLength = (goldenJsonVariant.getMaxBP() - goldenJsonVariant.getMinBP()) + 1;
      // Ignore the length comparison if this is a structural variant
      final boolean hasStructuralAlt = isAnyAltStructural(goldenJsonVariant.getAltAlleles());
      if (!hasStructuralAlt && refAllele.length() != jsonGoldenVariantLength)
      {
         logError(String.format("%s length [%d] is not equal to calculated length [%d]. Json: '%s'",
                                GOLDEN_REF_ALLELE, refAllele.length(),
                                jsonGoldenVariantLength, catalogRowJson.toString()),
                  VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_REF_ALLELE_LENGTH_NOT_SAME_AS_CALCULATED_LENGTH);
      }

      if (mSequenceLookup == null)
      {
         return;
      }
      verifyRefAlleleMatchesRefAssembly(goldenJsonVariant, refAllele);
   }

   /**
    * Reports whether any alt allele is written in the bracketed structural-variant form, such as
    * {@code <DEL>}, {@code <INS>}, {@code <DUP>} or {@code <DEL:ME:ALU>}.
    *
    * @param alts alt alleles to inspect; may be null and may contain null entries
    * @return true if at least one non-null alt is structural; false otherwise, including when
    *         the list is null or empty
    */
   private boolean isAnyAltStructural(List<String> alts) {
      // A missing alt list is tolerated elsewhere in this class, so treat it as "no structural alt"
      // rather than failing with a NullPointerException.
      if (alts == null) {
         return false;
      }
      for (String alt : alts) {
         if (alt != null && isAltStructural(alt)) {
            return true;
         }
      }
      return false;
   }
   
   /**
    * Reports whether a single alt allele contains both a {@code '<'} and a {@code '>'} character.
    *
    * @param alt the alt allele to inspect; must not be null
    * @return true if both bracket characters occur somewhere in {@code alt}
    */
   private boolean isAltStructural(String alt) {
      final boolean hasOpeningBracket = alt.indexOf('<') >= 0;
      final boolean hasClosingBracket = alt.indexOf('>') >= 0;
      return hasOpeningBracket && hasClosingBracket;
   }
   
   /**
    * Looks up the reference assembly sequence for the variant's position range and compares it
    * with the golden ref allele, updating the match / not-match counters.
    * <p>
    * A lookup that throws is logged as an error and then handled like an empty result, which is
    * itself logged as an error and counted as a non-match. Variants with a structural alt are
    * counted as matching without comparing sequences, because their ref may be a single
    * nucleotide while the range is longer.
    *
    * @param goldenJsonVariant golden attributes supplying chromosome, positions and alt alleles
    * @param refAllele         the golden ref allele to compare
    */
   private void verifyRefAlleleMatchesRefAssembly(CatalogEntryGoldenJson goldenJsonVariant, String refAllele) {
      String assemblySequence = null;
      try {
         assemblySequence = mSequenceLookup.getRefSeq(goldenJsonVariant.getChr(), goldenJsonVariant.getMinBP(), goldenJsonVariant.getMaxBP());
      } catch (Exception e) {
         logError(e.getMessage(), VerifyErrorCodes.REF_SEQUENCE_LOOKUP_FAILED);
      }

      // Nothing came back from the reference assembly: report it and count it as a non-match.
      if (VerifyUtils.isEmpty(assemblySequence)) {
         logError("Allele lookup against reference sequence returned empty result for chr=" +
               goldenJsonVariant.getChr() + " minBP=" + goldenJsonVariant.getMinBP() + " maxBP=" + goldenJsonVariant.getMaxBP(),
               VerifyErrorCodes.REF_SEQUENCE_NOT_FOUND);
         mNumberRefAllelesNotMatch++;
         return;
      }

      // Structural variants are accepted as-is; everything else must agree with the assembly.
      if (isAnyAltStructural(goldenJsonVariant.getAltAlleles())
            || isRefAlleleMatch(refAllele, assemblySequence, goldenJsonVariant.getChr(), goldenJsonVariant.getMinBP(), goldenJsonVariant.getMaxBP())) {
         mNumberRefAllelesMatch++;
         return;
      }

      mNumberRefAllelesNotMatch++;
      // Mismatches on chromosome M are reported under their own error code.
      final int code = VerifyUtils.isChrM(goldenJsonVariant.getChr())
            ? VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_REF_ALLELE_DOES_NOT_MATCH_REF_SEQUENCE_IN_CHROM_M
            : VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_REF_ALLELE_DOES_NOT_MATCH_REF_SEQUENCE;
      logError(GOLDEN_REF_ALLELE + " [" + refAllele + "] does not match reference sequence value [" +
            assemblySequence + "] for:  chr=" + goldenJsonVariant.getChr() + " position=" + goldenJsonVariant.getMinBP(),
            code);
   }
   
   
   /**
    * Decides whether the golden ref allele matches the sequence from the reference assembly.
    * <p>
    * The comparison ignores case throughout, and an 'N' (unknown base) in either sequence matches
    * any base at that position. Sequences of different lengths never match. A warning is logged
    * whenever either sequence contains an 'N', since that may signify a misalignment.
    *
    * @param refAllele          the golden ref allele
    * @param refFromRefAssembly the sequence returned by the reference assembly lookup
    * @param chr                chromosome, used only in the warning message
    * @param min                start position, used only in the warning message
    * @param max                end position, used only in the warning message
    * @return true if the two sequences agree at every position, allowing for case and 'N'
    */
   private boolean isRefAlleleMatch(String refAllele, String refFromRefAssembly, String chr, long min, long max) {
      // Warn if there are any N's in the refs as this may signify that the sequence is not expected
      if (refAllele.toUpperCase().contains("N") || refFromRefAssembly.toUpperCase().contains("N")) {
         logWarning("An 'N' was detected in one of the reference sequences, which may signify a misalignment of the ref allele to the reference assembly.  "
               + "(Ref='" + refAllele + "',  refAssemblySequence='" + refFromRefAssembly + "')  " + chr + ":" + min + "-" + max);
      }

      // Fast path: identical apart from case.
      if (refAllele.equalsIgnoreCase(refFromRefAssembly)) {
         return true;
      }

      if (refAllele.length() != refFromRefAssembly.length()) {
         return false;
      }

      // Position-by-position comparison where 'N' acts as a wildcard. Both characters are
      // upper-cased first so that this path is case-insensitive like the checks above;
      // previously a lower-case base or 'n' caused a false mismatch here.
      for (int i = 0; i < refAllele.length(); i++) {
         final char alleleBase = Character.toUpperCase(refAllele.charAt(i));
         final char assemblyBase = Character.toUpperCase(refFromRefAssembly.charAt(i));
         final boolean isUnknown = alleleBase == 'N' || assemblyBase == 'N';
         if (!isUnknown && alleleBase != assemblyBase) {
            return false;
         }
      }
      return true;
   }

   /**
    * @return the number of variants counted so far as matching the reference sequence
    */
   public long getNumberRefAllelesMatchReferenceSequence()
   {
      return this.mNumberRefAllelesMatch;
   }

   /**
    * @return the number of variants counted so far as not matching the reference sequence,
    *         including those whose reference lookup returned an empty result
    */
   public long getNumberRefAllelesNotMatchReferenceSequence()
   {
      return this.mNumberRefAllelesNotMatch;
   }

   /**
    * Delegates to the golden JSON entry to decide whether the row describes a variant.
    *
    * @param goldenJson the golden attributes of the row; must not be null
    * @return the result of {@code goldenJson.isVariant()}
    */
   private boolean isVariant(CatalogEntryGoldenJson goldenJson)
   {
      final boolean variant = goldenJson.isVariant();
      return variant;
   }

   /**
    * Validates the golden ref allele and compares it with the row's own REF value.
    * <p>
    * An error is logged when the golden ref allele is null or is not a valid nucleotide
    * sequence. If the row has a {@code "REF"} member (or, failing that, a {@code "ref"} member)
    * holding a primitive value, a warning is logged when it differs from the golden ref allele.
    * A member that is JSON null, an object or an array is not compared.
    *
    * @param goldenRefAllele the golden ref allele; may be null
    * @param catalogRowJson  the full catalog row
    */
   private void verifyGoldenJsonRefAllele(String goldenRefAllele, JsonObject catalogRowJson)
   {
      // Check for null before validating so a missing ref allele is reported as invalid
      // instead of raising a NullPointerException inside the validator.
      if (goldenRefAllele == null || !validDNANucleotides(goldenRefAllele))
      {
         // TODO - Should this be a WARNING?
         logError(GOLDEN_REF_ALLELE + " [" + goldenRefAllele + "] contains an invalid nucleotide value. Json: " +
                    catalogRowJson.toString(),
            VerifyErrorCodes.REF_ALLELE_CONTAINS_INVALID_NUCLEOTIDE);
      }

      // "REF" takes precedence over "ref" when both are present.
      String refKey = null;
      if (catalogRowJson.get("REF") != null)
      {
         refKey = "REF";
      }
      else if (catalogRowJson.get("ref") != null)
      {
         refKey = "ref";
      }

      // Only primitive members are read as a string: getAsString() throws on JSON null and on
      // objects, which would otherwise abort verification of the row.
      String ref = null;
      if (refKey != null && catalogRowJson.get(refKey).isJsonPrimitive())
      {
         ref = catalogRowJson.get(refKey).getAsString();
      }

      if (ref != null && !ref.equals(goldenRefAllele))
      {
         logWarning(GOLDEN_REF_ALLELE + " [" + goldenRefAllele + "] is not equal to the REF value [" + ref + "]");
      }
   }

   /**
    * Validates each golden alt allele of a row.
    * <p>
    * For every alt: an empty value is logged as an error and skipped; a non-structural alt that
    * is not a valid nucleotide sequence is logged as an error; an alt equal to the golden ref
    * allele is logged as an error; and an alt that already occurred earlier in the list is
    * logged as a warning. A null list is accepted and produces no messages.
    *
    * @param altAlleleList   the golden alt alleles; may be null
    * @param catalogRowJson  the full catalog row, used in messages
    * @param goldenRefAllele the golden ref allele; may be null
    */
   private void verifyGoldenJsonAltAllele(List<String> altAlleleList, JsonObject catalogRowJson, String goldenRefAllele)
   {
      if (altAlleleList == null)
      {
         return;
      }

      final Set<String> seenAlts = new HashSet<String>();
      for (String alt : altAlleleList)
      {
         if (alt.isEmpty())
         {
            logError("One of the " + GOLDEN_ALT_ALLELES + " values [" + alt + "] is empty value. json: " +
               catalogRowJson.toString(), VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_ALT_ALLELES_EMPTY);
            // None of the remaining checks are meaningful for an empty value.
            continue;
         }

         // TODO - Should this be a WARNING?
         // Structural alts (<DEL>, <INS>, etc) are exempt from the nucleotide check; any other
         //   alt must consist only of valid nucleotides (A,C,G,T,N).
         final boolean isPlainSequence = !isAltStructural(alt);
         if (isPlainSequence && !validDNANucleotides(alt))
         {
            logError("One of the " + GOLDEN_ALT_ALLELES + " [" + alt + "] contains an invalid nucleotide value. Json: " +
                  catalogRowJson.toString(), VerifyErrorCodes.ALT_ALLELE_CONTAINS_INVALID_NUCLEOTIDE);
         }

         // alt is non-null here, so this is false whenever the golden ref allele is null.
         if (alt.equals(goldenRefAllele))
         {
            logError("One of the " + GOLDEN_ALT_ALLELES + " [" + alt + "] is equal to the " + GOLDEN_REF_ALLELE + ". Json: " +
                  catalogRowJson.toString(),
               VerifyErrorCodes.JSON_GOLDEN_ATTRIBUTE_ALT_ALLELE_EQUALS_REF_ALLELE);
         }

         // add() returns false when the value was already in the set, i.e. the alt is a duplicate.
         final boolean firstOccurrence = seenAlts.add(alt);
         if (!firstOccurrence)
         {
            logWarning("One of the " + GOLDEN_ALT_ALLELES + " values [" + alt +
                  "] is repeated multiple times. Json: " + catalogRowJson.toString());
         }
      }
   }

   /**
    * Reports whether a string is a valid nucleotide sequence according to
    * {@code DNA_NUCLEOTIDES}: one or more of the uppercase characters A, C, G, T or N and
    * nothing else.
    *
    * @param dnaSeq the sequence to check; may be null
    * @return true if the whole of {@code dnaSeq} is a valid sequence; false for null, for the
    *         empty string, and for any input containing another character
    */
   static boolean validDNANucleotides(String dnaSeq)
   {
      if (dnaSeq == null)
      {
         return false;
      }
      // matches() requires the entire input to fit the pattern. find() would also accept a
      // sequence followed by a trailing line terminator, because '$' matches just before it.
      return DNA_NUCLEOTIDES.matcher(dnaSeq).matches();
   }

   /**
    * Forwards an informational message to the message logger.
    *
    * @param msg the message text
    */
   private void logInfo(String msg)
   {
      this.mLogger.logInfo(msg);
   }

   /**
    * Forwards a warning message to the message logger.
    *
    * @param msg the message text
    */
   private void logWarning(String msg)
   {
      this.mLogger.logWarning(msg);
   }

   /**
    * Forwards an error message and its error code to the message logger.
    *
    * @param msg  the message text
    * @param code the error code associated with the message
    */
   private void logError(String msg, int code)
   {
      this.mLogger.logError(msg, code);
   }

}
