package edu.mayo.bior.pipeline.VEP;

import java.util.Arrays;
import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;

import org.apache.log4j.Logger;

/** Compares VEP CSQ SequenceOntology consequence terms by severity.
 *  <p>
 *  Each recognized term has an integer rank (higher means more severe).  Terms are looked up
 *  WITHOUT case sensitivity.  A term may be a compound of several terms joined by "&amp;"
 *  (ex: <code>intron_variant&amp;NMD_transcript_variant</code>), in which case the most severe
 *  member decides the ordering.  Unrecognized terms rank below every recognized term.
 *  <p>
 *  Sorting with this comparator places the MOST severe term first.
 *  <p>
 *  See: http://useast.ensembl.org/info/genome/variation/predicted_data.html?redirect=no */
public class VepCsqSeqOntComparator implements Comparator<String>{
	// Severity rank per term.  Allow keys to be compared WITHOUT case sensitivity.
	// NOTE: because of the case-insensitive ordering, this map cannot be queried with a null key.
	private static final Map<String, Integer> sSeqMap = new TreeMap<String, Integer>(String.CASE_INSENSITIVE_ORDER);

	private static final Logger sLogger = Logger.getLogger(VepCsqSeqOntComparator.class);

	// Ranks are assigned in descending order starting at 100, following the order of the table at:
	//   http://useast.ensembl.org/info/genome/variation/predicted_data.html?redirect=no
	// Entries marked "(NOT ON WEBSITE)" are alternate spellings that were added by hand.
	// NOTE(review): each alternate spelling gets its OWN rank, so it compares as slightly less
	//   severe than the term it abbreviates - confirm whether synonyms should share a rank.
	static {
		int val = 100;
		sSeqMap.put("transcript_ablation", 			val--);
		sSeqMap.put("splice_acceptor_variant", 		val--);
		sSeqMap.put("splice_donor_variant", 		val--);
		sSeqMap.put("stop_gained", 					val--);
		sSeqMap.put("frameshift_variant", 			val--);
		sSeqMap.put("stop_lost", 					val--);
		sSeqMap.put("initiator_codon_variant", 		val--);
		sSeqMap.put("transcript_amplification", 	val--);
		sSeqMap.put("inframe_insertion", 			val--);
		sSeqMap.put("inframe_deletion", 			val--);
		sSeqMap.put("missense_variant", 			val--);
		sSeqMap.put("splice_region_variant", 		val--);
		sSeqMap.put("incomplete_terminal_codon_variant", val--);
		sSeqMap.put("stop_retained_variant", 		val--);
		sSeqMap.put("synonymous_variant", 			val--);
		sSeqMap.put("coding_sequence_variant", 		val--);
		sSeqMap.put("mature_miRNA_variant", 		val--);
		sSeqMap.put("5_prime_UTR_variant", 			val--);
		sSeqMap.put("3_prime_UTR_variant", 			val--);
		sSeqMap.put("non_coding_transcript_exon_variant", val--);
		// Is "non_coding_exon_variant" the same as "non_coding_transcript_exon_variant" above?  Just a different VEP version?
		sSeqMap.put("non_coding_exon_variant", 		val--);	// (NOT ON WEBSITE)
		// Add the variation "nc" for "non_coding"
		sSeqMap.put("nc_transcript_exon_variant", 	val--); // (NOT ON WEBSITE)
		sSeqMap.put("nc_exon_variant", 				val--);	// (NOT ON WEBSITE)
		sSeqMap.put("intron_variant", 				val--);
		sSeqMap.put("NMD_transcript_variant", 		val--);
		sSeqMap.put("non_coding_transcript_variant", val--);
		// Add the variation "nc" for "non_coding"
		sSeqMap.put("nc_transcript_variant", 		val--);	// (NOT ON WEBSITE)
		sSeqMap.put("upstream_gene_variant", 		val--);
		sSeqMap.put("downstream_gene_variant", 		val--);
		sSeqMap.put("TFBS_ablation", 				val--);
		sSeqMap.put("TFBS_amplification", 			val--);
		sSeqMap.put("TF_binding_site_variant", 		val--);
		sSeqMap.put("regulatory_region_ablation", 	val--);
		sSeqMap.put("regulatory_region_amplification", val--);
		sSeqMap.put("regulatory_region_variant", 	val--);
		sSeqMap.put("feature_elongation", 			val--);
		sSeqMap.put("feature_truncation", 			val--);
		sSeqMap.put("intergenic_variant", 			val--);
	}

	/** Order two SequenceOntology strings so that the more severe one sorts first.
	 *  Each argument may be a single term or several terms joined by "&amp;"; null is treated as "".
	 *  @param seqOnt1  First SequenceOntology string
	 *  @param seqOnt2  Second SequenceOntology string
	 *  @return  Negative if seqOnt1 is more severe than seqOnt2, positive if less severe,
	 *           0 if both have the same severity score (this includes two unrecognized strings) */
	public int compare(String seqOnt1, String seqOnt2) {
		String mostSevere1 = getMostSevere(seqOnt1);
		String mostSevere2 = getMostSevere(seqOnt2);

		int sevScore1 = getScore(mostSevere1);
		int sevScore2 = getScore(mostSevere2);

		// Reversed on purpose: the higher score must sort first.
		// Scores from the table are within 0..100, so the subtraction cannot overflow.
		return sevScore2 - sevScore1;
	}

	/** Look up the severity score of a single SequenceOntology term (case-insensitive).
	 *  An unrecognized (or null) term is reported on stderr and in the log, and scores 0.
	 *  @param mostSevere1  A single term (NOT a "&amp;"-joined compound)
	 *  @return  The term's rank from the severity table, or 0 if it is not in the table */
	protected int getScore(String mostSevere1) {
		// The case-insensitive map cannot be queried with a null key, so treat null as unrecognized
		Integer score = (mostSevere1 == null)  ?  null  :  sSeqMap.get( mostSevere1 );
		if( score == null ) {
			score = 0;
			final String MSG = "Warning: VEP CSQ SequenceOntology not recognized: " + mostSevere1;
			System.err.println(MSG);
			sLogger.warn(MSG);
		}
		return score;
	}

	/** A sequenceOntology string could consist of multiple types.  For example:
	 *  	intron_variant&amp;nc_transcript_variant
	 *		intron_variant&amp;NMD_transcript_variant
	 *  This picks the member with the highest severity score; on a tie the earliest member wins.
	 *  Terms are returned exactly as given (no "nc_" to "non_coding_" conversion is done here;
	 *  the severity table contains both spellings).
	 *  @param seqOnt  A single term or several terms joined by "&amp;" (may be null)
	 *  @return  The most severe member; seqOnt itself if it contains no "&amp;";
	 *           "" if seqOnt is null or consists only of "&amp;" separators */
	protected String getMostSevere(String seqOnt) {
		if( seqOnt == null )
			return "";

		if( ! seqOnt.contains("&") )
			return seqOnt;

		String[] terms = seqOnt.split("&");
		// split() drops trailing empty strings, so "&" or "&&" yields NO terms at all
		if( terms.length == 0 )
			return "";

		// Single pass: each term is scored (and, if unrecognized, warned about) exactly once.
		// Strict ">" keeps the earliest term when scores tie.
		String mostSev = terms[0];
		int highScore = getScore(mostSev);
		for( int i = 1; i < terms.length; i++ ) {
			int score = getScore(terms[i]);
			if( score > highScore ) {
				highScore = score;
				mostSev = terms[i];
			}
		}

		return mostSev;
	}

}
