#!/bin/bash

# This script interactively generates a run_info configuration file for the Mayo ChIP-Seq analysis tool.
# The run_info file is written into the directory given with -o, or to the path given with -r.
# See the usage text for additional details.

# Print the command-line help text to stdout.
# The option list mirrors the flags handled by the argument parser in this file.
usage() {
	cat <<'EOF'

Usage: generate_run_info.sh -o <output_dir> [options]
Interactively generate a run_info file for the Mayo ChIP-Seq analysis tool.

Mandatory arguments to long options are mandatory for short options too.

  -h, --help                     Display this usage text
  -o, --output-dir <dir>         Desired output directory (required)
  -m, --mem-info <file>          Path to memory info file (Default: output_directory/mem_info.txt)
  -r, --run-info <file>          Desired run_info file name (Default: output_directory/run_info.txt)
  -t, --tool-info <file>         Path to tool info file (Default: output_directory/tool_info.txt)
  -v, --verbose                  Display all command processing

Report problems to moore.raymond@mayo.edu or kalmbach.michael@mayo.edu
EOF
}

# Prompt the user until an acceptable answer is typed; the answer is left in
# the global VAR.
#
# Arguments:
#   $1 - prompt text (backslash escapes such as \n are expanded)
#   $2 - help text printed when the user answers 'h' or 'help'
#   $3 - when non-empty, an empty answer is accepted; when empty, the prompt
#        repeats until something is typed
#   $4 - 'file' or 'dir' to require that the answer names an existing path
# Globals written: VAR, message, details, required, variable_type
# Exits: 0 when the user answers 'q' or 'quit';
#        1 when standard input ends before an answer is given.
read_variable() {
	message=$1
	# Kept global on purpose: gather_user_provided_genes prints $details after calling this function.
	details=$2
	required=$3
	variable_type=$4

	while :
	do
		while :
		do
			# -r keeps backslashes in the answer literal. A failed read with nothing
			# captured means end of input; stop instead of re-prompting forever.
			if ! read -r -p "$(echo -e "$message: ")" VAR && [[ -z "$VAR" ]]
			then
				echo "ERROR: End of input reached while waiting for an answer." >&2
				exit 1
			fi

			case "$VAR" in
				h | help )  echo -e "Details:\n$details" ;;
				q | quit )  echo "Exiting."; exit 0 ;;
				"" ) if [[ -n "$required" ]] ; then break ; fi ;;
				* ) break ;;
			esac
		done

		case "$variable_type" in
			file | dir )
				if [[ ! -e "$VAR" ]]
				then
					echo "$VAR does not exist. Please retry or quit."
				else
					break
				fi
				;;
			* ) break ;;
		esac
	done
}

# Append a single KEY=VALUE line to the run_info file.
#
# Arguments:
#   $1 - key name
#   $2 - value
# Globals read: RUN_INFO (path of the file being appended to)
write_run_info () {
	local key=$1
	local value=$2
	# The path is quoted so that a run_info location containing spaces still works.
	printf '%s=%s\n' "$key" "$value" >> "$RUN_INFO"
}

# Interactively collect single-end sample file names.
#
# Each answer must name an existing entry under $INPUT_DIR. Accepted names are
# stored, space separated and in entry order, in the global SAMPLES.
# Special answers: 'd'/'done' finishes, 'h'/'help' prints help,
# 'l'/'ls'/'list' lists $INPUT_DIR. End of input is treated like 'done'.
#
# Globals read:    INPUT_DIR
# Globals written: SAMPLES
collect_samples() {
	local details="This sample list will be compared against the reference files to provide ChIP-Seq results."
	local -a chosen=()
	local answer

	# Initialize sample list
	SAMPLES=

	while :
	do
		if ! read -r -p "$(echo -e "\nTo see a list of your sample directory, type 'l'\nWhen finished with your samples, press 'd' for done.\nPlease enter your sample names:\n> ")" answer && [[ -z "$answer" ]]
		then
			break
		fi

		# Command words restart the loop so they are never treated as sample names.
		case "$answer" in
			d | done ) break ;;
			h | help ) echo -e "Details:\n$details"; continue ;;
			l | ls | list ) ls -- "$INPUT_DIR"; continue ;;
			q | quit ) echo "Please type 'done' instead"; continue ;;
			"" ) continue ;;
		esac

		if [[ ! -e "$INPUT_DIR/$answer" ]]
		then
			echo "$INPUT_DIR/$answer does not exist. Please retry or quit."
		else
			chosen+=("$answer")
		fi
	done

	SAMPLES="${chosen[*]}"
}

# Interactively collect paired-end sample file names.
#
# Answers alternate between read 1 and read 2 of a sample; each must name an
# existing entry under $INPUT_DIR. The results are stored, space separated and
# in entry order, in END1_SAMPLES and END2_SAMPLES.
# Special answers: 'd'/'done' finishes, 'h'/'help' prints help,
# 'l'/'ls'/'list' lists $INPUT_DIR. End of input is treated like 'done'.
#
# Globals read:    INPUT_DIR
# Globals written: END1_SAMPLES, END2_SAMPLES, sample_read
# Exits: 1 when the number of read 1 files differs from the number of read 2 files.
collect_paired_samples() {
	local details="This sample list will be compared against the reference files to provide ChIP-Seq results."
	local -a first_reads=()
	local -a second_reads=()
	local answer

	# Initialize sample list
	END1_SAMPLES=
	END2_SAMPLES=
	sample_read="1"

	while :
	do
		if ! read -r -p "$(echo -e "\nTo see a list of your sample directory, type 'l'\nWhen finished with your samples, press 'd' for done.\n\nPlease enter the file name for read $sample_read for this sample:\n> ")" answer && [[ -z "$answer" ]]
		then
			break
		fi

		# Command words and empty answers restart the loop so they can never be
		# recorded as a file or flip which read is expected next.
		case "$answer" in
			d | done ) break ;;
			h | help ) echo -e "Details:\n$details"; continue ;;
			l | ls | list ) ls -- "$INPUT_DIR"; continue ;;
			q | quit ) echo "Please type 'done' instead"; continue ;;
			"" ) continue ;;
		esac

		if [[ ! -e "$INPUT_DIR/$answer" ]]
		then
			echo "$INPUT_DIR/$answer does not exist. Please retry or quit."
		elif [[ "$sample_read" == "1" ]]
		then
			first_reads+=("$answer")
			sample_read="2"
		else
			second_reads+=("$answer")
			sample_read="1"
		fi
	done

	END1_SAMPLES="${first_reads[*]}"
	END2_SAMPLES="${second_reads[*]}"

	# Every read 1 file needs a matching read 2 file.
	if (( ${#first_reads[@]} != ${#second_reads[@]} ))
	then
		echo "Not all pairs were specified. Please check your directories and try again." >&2
		echo "Read 1 samples: $END1_SAMPLES" >&2
		echo "Read 2 samples: $END2_SAMPLES" >&2
		exit 1
	fi
}

# Interactively pair each IP sample with its control (input) sample.
#
# Answers alternate between an IP sample and its control; each must exactly
# match one of the names in END1_SAMPLES. The results are stored, space
# separated and in entry order, in IP_SAMPLES and INPUT_SAMPLES.
# Special answers: 'd'/'done' finishes, 'h'/'help' prints help,
# 'l'/'ls'/'list' prints the known sample names. End of input is treated like 'done'.
#
# Globals read:    END1_SAMPLES
# Globals written: IP_SAMPLES, INPUT_SAMPLES
# Exits: 1 when the number of IP samples differs from the number of controls.
collect_ip_and_input_samples() {
	local details="This list identifies the control that IP samples should be compared against."
	local IP_OR_CONTROL="IP"
	local -a known=()
	local -a ip_names=()
	local -a control_names=()
	local answer candidate is_known

	# Initialize sample list
	IP_SAMPLES=
	INPUT_SAMPLES=

	# Split the space-separated sample list once, without glob expansion.
	read -r -a known <<< "$END1_SAMPLES"

	while :
	do
		if ! read -r -p "$(echo -e "\nTo see a list of your sample directory, type 'l'\nWhen finished with your samples, press 'd' for done.\n\nPlease enter the file name for your $IP_OR_CONTROL sample:\n> ")" answer && [[ -z "$answer" ]]
		then
			break
		fi

		case "$answer" in
			d | done ) break ;;
			h | help ) echo -e "Details:\n$details"; continue ;;
			l | ls | list ) printf '%s\n' "${known[@]}"; continue ;;
			q | quit ) echo "Please type 'done' instead"; continue ;;
			"" ) continue ;;
		esac

		# Require an exact match; a substring or regex hit is not a valid sample name.
		is_known=false
		for candidate in "${known[@]}"
		do
			if [[ "$candidate" == "$answer" ]]
			then
				is_known=true
				break
			fi
		done

		if [[ "$is_known" != true ]]
		then
			echo "$answer does not exist in the sample list. Please retry or type 'done'."
		elif [[ "$IP_OR_CONTROL" == "IP" ]]
		then
			ip_names+=("$answer")
			IP_OR_CONTROL="control"
		else
			control_names+=("$answer")
			IP_OR_CONTROL="IP"
		fi
	done

	IP_SAMPLES="${ip_names[*]}"
	INPUT_SAMPLES="${control_names[*]}"

	# Every IP sample needs a matching control.
	if (( ${#ip_names[@]} != ${#control_names[@]} ))
	then
		echo "Not all pairs were specified. Please check your directories and try again." >&2
		echo "IP samples: $IP_SAMPLES" >&2
		echo "INPUT/control samples: $INPUT_SAMPLES" >&2
		exit 1
	fi
}

# Ask for a filter type until one valid for the current sequence type is
# entered, then record it as FILTER_TYPE in the run_info file.
#
# Arguments:
#   $1 - text describing the supported filter types, shown in the prompt
# Globals read: SEQUENCE_TYPE ('se' accepts U1/U2; 'pe' accepts U02/U12/U22),
#               VAR (set by read_variable)
# Returns: 0 once a filter type was written; 1 when SEQUENCE_TYPE is neither
#          'se' nor 'pe' (no answer could ever be accepted in that case).
get_filter_type() {
	local supported_filter_types=$1

	case "$SEQUENCE_TYPE" in
		se | pe ) ;;
		* )
			echo "ERROR: Sequence type '$SEQUENCE_TYPE' not recognized; expected 'se' or 'pe'." >&2
			return 1
			;;
	esac

	while :
	do
		read_variable "\nSupported filter types are:\n$supported_filter_types\nPlease specify the desired filter type"	"This determines the desired filter type."

		# Match on the sequence-type/answer pair so each type only accepts its own filters.
		case "$SEQUENCE_TYPE:$VAR" in
			se:U1 | se:U2 | pe:U02 | pe:U12 | pe:U22 )
				write_run_info "FILTER_TYPE" "$VAR"
				return 0
				;;
		esac

		echo -e "\nERROR: Filter type $VAR not recognized. Please use one of the supported filter types."
	done
}

# Ask which peak caller to use and record the choice as PEAK_CALLER in the
# run_info file.
#
# 'm' selects MACS2 and asks a follow-up question about IDR analysis
# (macs2idr for 'y'/'yes', macs2noidr otherwise); 's' selects SICER. Any other
# answer prints an error and asks again.
#
# Globals read:    VAR (set by read_variable)
# Globals written: PEAK_CALLER
get_peak_caller() {
	while :
	do
		read_variable "Please select your desired peak caller -- MACS2 (m) or SICER (s)" "This determines the method used to call peaks in your data."
		case "$VAR" in
			m )
				read_variable "Would you also like to run IDR Analysis? (y/n)" "This determines whether IDR analysis will also be run."
				case "$VAR" in
					y | yes ) PEAK_CALLER=macs2idr ;;
					* ) PEAK_CALLER=macs2noidr ;;
				esac
				break
				;;
			s )
				PEAK_CALLER=sicer
				# Leave the loop here as well; otherwise SICER could never be selected.
				break
				;;
			* )
				echo -e "\nERROR: Peak caller $VAR not recognized. Please use one of the supported peak callers."
				;;
		esac
	done

	write_run_info "PEAK_CALLER" "$PEAK_CALLER"
}

# Ask for the directory holding user-provided gene lists, then collect the
# names of the gene list files inside it.
#
# Each file answer must name an existing entry under the chosen directory.
# Special answers: 'd'/'done' finishes, 'h'/'help' prints help,
# 'l'/'ls'/'list' lists the directory. End of input is treated like 'done'.
#
# Globals read:    VAR (set by read_variable)
# Globals written: CEAS_GENE_DIR, CEAS_GENE_LIST (space separated, entry order)
gather_user_provided_genes() {
	local gene_help="This directory should hold text files describing the gene lists of interest."
	local -a gene_files=()
	local answer

	read_variable "Please provide the directory with your gene list(s)." "$gene_help" "" "dir"
	CEAS_GENE_DIR=$VAR
	CEAS_GENE_LIST=

	while :
	do
		if ! read -r -p "$(echo -e "\nTo see a list of your gene directory, type 'l'\nWhen finished entering your gene lists, press 'd' for done.\n\nPlease enter the file name for this gene list:\n> ")" answer && [[ -z "$answer" ]]
		then
			break
		fi

		# Command words restart the loop so they are never treated as file names.
		case "$answer" in
			d | done ) break ;;
			h | help ) echo -e "Details:\n$gene_help"; continue ;;
			l | ls | list ) ls -- "$CEAS_GENE_DIR"; continue ;;
			q | quit ) echo "Please type 'done' instead"; continue ;;
			"" ) continue ;;
		esac

		if [[ ! -e "$CEAS_GENE_DIR/$answer" ]]
		then
			echo "$CEAS_GENE_DIR/$answer does not exist. Please retry or quit."
		else
			gene_files+=("$answer")
		fi
	done

	CEAS_GENE_LIST="${gene_files[*]}"
}

# Parse arguments from input parameters
MEMORY_INFO_FILE=
OUTPUT_DIR=
RUN_INFO=
TOOL_INFO_FILE=
# NOTE(review): VERBOSE is recorded here, but nothing visible in this file reads it.
VERBOSE=false

# Loop on the argument count so that every branch either consumes arguments or
# exits; an option that is not consumed would otherwise be examined forever.
while [[ $# -gt 0 ]] ; do
	case "$1" in
		-h | --help ) usage; exit ;;
		-v | --verbose ) VERBOSE=true; shift ;;
		-m | --mem-info | -o | --output-dir | -r | --run-info | -t | --tool-info )
			# These options take a value; 'shift 2' fails without one.
			if [[ $# -lt 2 ]]
			then
				echo "ERROR: Option $1 requires a value." >&2
				usage >&2
				exit 1
			fi
			case "$1" in
				-m | --mem-info ) MEMORY_INFO_FILE=$2 ;;
				-o | --output-dir ) OUTPUT_DIR=$2 ;;
				-r | --run-info ) RUN_INFO=$2 ;;
				-t | --tool-info ) TOOL_INFO_FILE=$2 ;;
			esac
			shift 2
			;;
		* )
			echo "ERROR: Unrecognized option: $1" >&2
			usage >&2
			exit 1
			;;
	esac
done

# Without an output directory none of the default paths below can be built, so stop here.
if [[ -z "$OUTPUT_DIR" ]]
then
	echo "The -o option is required. Please specify an output directory." >&2
	exit 1
fi

# Offer to create the output directory when it is missing.
if [[ ! -e "$OUTPUT_DIR" ]]
then
	read -r -p "$(echo -e "WARNING: $OUTPUT_DIR does not exist.\nWould you like to create it? ")" VAR
	case "$VAR" in
		y | yes )
			if ! mkdir -p -- "$OUTPUT_DIR"
			then
				echo "Failed to create $OUTPUT_DIR. Please check your permissions and try again." >&2
				exit 1
			fi
			;;
		* ) echo "Exiting."; exit 0 ;;
	esac
fi

# $OUTPUT_DIR is expected to exist by now. Any info-file path that is still
# empty falls back to its default location inside that directory.
: "${MEMORY_INFO_FILE:=$OUTPUT_DIR/mem_info.txt}"
: "${RUN_INFO:=$OUTPUT_DIR/run_info.txt}"
: "${TOOL_INFO_FILE:=$OUTPUT_DIR/tool_info.txt}"

# Stop when something already exists at the run_info path.
[[ ! -e $RUN_INFO ]] || {
	echo "The file at $RUN_INFO already exists. Please specify a different file name or delete the existing file."
	exit 1
}

echo -e "\n*** Your run_info file will be stored in $RUN_INFO ***\n"

# Introduction:

echo -e "##### GENERAL QUESTIONS #####\nMost of these fields are used in the final report."
echo -e "At any time, you have the option to ask for help (h) or quit (q).\n" 

# Input required variables. Each step will append to the run_info file.
read_variable "Project name (no spaces)" "\tThis variable is used for file names that gather data across all samples and within the final report.\n\tExample: ChipSeqProjectABCD"
write_run_info "PROJECT_NAME" "$VAR"

read_variable "Who is this analysis being run for? (lastname_firstname)" "\tThis variable refers to the Principal Investigator for this project.\n\tExample: Smith_John"
write_run_info "PI" "$VAR"

read_variable "What genome build are you using? (hg19, etc.)" "\tThis variable describes the genome build used as a reference.\n\tExamples: hg19, mm10"
write_run_info "GENOMEBUILD" "$VAR"

read_variable "If available, please specify a unique run identifier. Otherwise, specify 'none'" "\tThis variable is used to build up file names to distinguish individual runs.\n\tExample: Mayo_specific_identifier"
write_run_info "RUNID" "$VAR"

# Ask again on an unrecognized answer so that USEREMAIL and SAMPLENAME are
# always written to the run_info file.
while :
do
	read_variable "Will this analysis be run in standalone (s) or on a cluster (c)?" "\tThis question determines whether additional cluster-specific properties are needed."
	case "$VAR" in
		c | cluster )
			read_variable "Who should be notified when analysis is complete? (email address)" "\tThis value is only used for jobs submitted to a cluster.\n\tExample: user@somewhere.com"
			write_run_info "USEREMAIL" "$VAR"

			read_variable "Sample Base Name (no spaces)" "\tThis variable is used for job names submitted to the grid.\n\tExample: TreatedWithXYZ"
			write_run_info "SAMPLENAME" "$VAR"
			break
			;;
		s | standalone )
			write_run_info "USEREMAIL" "none"
			write_run_info "SAMPLENAME" "none"
			break
			;;
		* )
			echo "Please answer 's' (standalone) or 'c' (cluster)."
			;;
	esac
done

read_variable "Would you like to use $OUTPUT_DIR as the location for your analysis output? (y/n)" "\tThis directory will hold all results files for your analysis. Please ensure that you have sufficient space and access privileges before running your analysis."
case "$VAR" in
	# Record the directory itself, not the literal "y"/"yes" answer.
	y | yes ) write_run_info "WORK_DIR" "$OUTPUT_DIR" ;;
	* ) 
		read_variable "Please specify your desired output directory." "\tThis directory will hold all results files for your analysis. Please ensure that you have sufficient space and access privileges before running your analysis." "" "dir"
		write_run_info "WORK_DIR" "$VAR"
		;;
esac


echo -e "##### SAMPLE INFORMATION #####\nData specific to your samples."
echo -e "At any time, you have the option to ask for help (h) or quit (q).\n" 

read_variable "Where are your samples located? (/path/to/samples/)" "\tThis path identifies the location of sample(s) for this analysis.\n\tExample: /path/to/samples/" "" "dir"
# Store this value for later
INPUT_DIR=$VAR
write_run_info "SEQ_DIR" "$VAR"

read_variable "What is the file extension on these samples? (fastq.gz, fastq, etc)" "\tThis helps identify which samples to use.\n\tExample: fastq.gz"
write_run_info "SEQ_SUFFIX" "$VAR"

# Ask until the answer is 'pe' or 'se' (any letter case); every later step in
# this section branches on the lower-case value stored in SEQUENCE_TYPE.
while :
do
	read_variable "Are these samples paired-end (pe) or single-end (se)?" "\tThis value identifies whether to use single- or paired-end analysis.\n\tExample: pe"
	SEQUENCE_TYPE=$(printf '%s\n' "$VAR" | tr '[:upper:]' '[:lower:]')
	case "$SEQUENCE_TYPE" in
		se | pe ) break ;;
		* ) echo "Please answer 'pe' (paired-end) or 'se' (single-end)." ;;
	esac
done
# The run_info file stores the sequence type in upper case.
write_run_info "SEQ_TYPE" "$(printf '%s\n' "$SEQUENCE_TYPE" | tr '[:lower:]' '[:upper:]')"

case "$SEQUENCE_TYPE" in
	se )
		collect_samples
		write_run_info "END1_SEQ" "$SAMPLES"
		# The IP/control step below validates its answers against END1_SAMPLES.
		END1_SAMPLES=$SAMPLES
		get_filter_type "\tU1 (unique match)\n\tU2 (primary alignment)"
	;;
	pe )
		collect_paired_samples
		write_run_info "END1_SEQ" "$END1_SAMPLES"
		write_run_info "END2_SEQ" "$END2_SAMPLES"
		get_filter_type "\tU22 (both ends unique)\n\tU12 (one or both ends unique)\n\tU02 (primary alignments)"
	;;
esac

echo "You have entered the following samples: "
echo $END1_SAMPLES | tr ' ' '\n'
echo ""
collect_ip_and_input_samples
write_run_info "IP_FILE" "$IP_SAMPLES"
write_run_info "INPUT_FILE" "$INPUT_SAMPLES"

echo -e "##### ANALYSIS INFORMATION #####\nOptions specific to your analysis."
echo -e "At any time, you have the option to ask for help (h) or quit (q).\n" 

get_peak_caller

# For the questions marked "optional", an empty answer selects the default value.
read_variable "What is the minimum peak neighbor distance desired? (Default: 10000)" "This determines the minimum distance between two values to call as a true peak. This value should be large enough to avoid duplicate calls for a single region and small enough to provide meaningful results." "optional"
if [[ -z "$VAR" ]] 
then
	VAR="10000"
fi
write_run_info "PKGENE_NEIGHDIST" "$VAR"

read_variable "Would you like to run CEAS Analysis? (y/n)" "This determines whether to perform CEAS analysis."
case "$VAR" in
	y | yes )
	 	CEAS_ANALYSIS="Yes" ;;
	* ) 
		CEAS_ANALYSIS="No" ;;
esac
write_run_info "RUN_CEAS" "$CEAS_ANALYSIS"

CEAS_OPTIONS="--bg --sizes=1000,2000,3000 --bisizes=2000,4000 --span=2000 --pf-res=50 --rel-dist=2000"
read_variable "The default options for CEAS are: $CEAS_OPTIONS. Would you like to change them? (y/n)" "Specifying 'yes' to this step will modify the CEAS options." 
case "$VAR" in
	y | yes )
	 	read_variable "Please specify your desired CEAS options." "These values will be passed directly to the CEAS application." 
	 	CEAS_OPTIONS=$VAR
		;;
	* ) 
		 ;;
esac
# Key written without the stray leading '>' so it follows the same form as MEME_ARGS.
write_run_info "CEAS_ARGS" "$CEAS_OPTIONS"

CEAS_PEAK_CUTOFF="10"
read_variable "The default value for the CEAS peak cutoff is 10. Specify a different value or press 'enter' to continue." "The peak cutoff value is -log10(pvalue) or -log10(qvalue)" "optional"
if [[ -z "$VAR" ]] 
then
	VAR=$CEAS_PEAK_CUTOFF
fi
write_run_info "CEAS_PEAK_CUTOFF" "$VAR"

read_variable "Do you have any user-provided gene files? (type 'n' if unknown)" "CEAS offers the option of filtering to specific genes of interest. These can be specified here."
case "$VAR" in
	y | yes )
	 	gather_user_provided_genes
		;;
	* ) ;;
esac
write_run_info "CEAS_GENE_DIR" "$CEAS_GENE_DIR"
write_run_info "CEAS_GENE_LIST" "$CEAS_GENE_LIST"

read_variable "Would you like to run MEME Analysis? (y/n)" "This determines whether to perform MEME analysis."
case "$VAR" in
	y | yes )
	 	MEME_ANALYSIS="Yes" 
		;;
	* ) 
		MEME_ANALYSIS="No" ;;
esac
write_run_info "RUN_MEME" "$MEME_ANALYSIS"

MEME_OPTIONS="-dna -mod zoops -nmotifs 5 -minw 10 -maxw 20 -maxsize 999999999 -revcomp"
read_variable "The default options for MEME are: $MEME_OPTIONS. Would you like to change them? (y/n)" "Specifying 'yes' to this step will modify the MEME options." 
case "$VAR" in
	y | yes )
	 	read_variable "Please specify your desired MEME options." "These values will be passed directly to the MEME application." 
	 	MEME_OPTIONS=$VAR
		;;
	* ) ;;
esac
write_run_info "MEME_ARGS" "$MEME_OPTIONS"

MEME_PEAK_SIZE="200"
read_variable "The default value for the MEME peak size is $MEME_PEAK_SIZE. Specify a different value or press 'enter' to continue" "Specifies the peak size used for scanning motif. Example: for 200 bp, the peak center +/- 100 bp regions will be used (for a total size of 200bp)." "optional"
if [[ -z "$VAR" ]] 
then
	VAR=$MEME_PEAK_SIZE
fi
write_run_info "MEME_PEAK_SIZE" "$VAR"

MEME_PEAK_CUTOFF="0.1"
read_variable "The default value for the MEME peak cutoff is $MEME_PEAK_CUTOFF. Specify a different value or press 'enter' to continue" "Specifies the peak cutoff used to parse top peaks for motif finding.\n\tIf specified as an integer of >1, then this represents the number of top peaks based on the p-value.\n\tIf this number is bigger than the total number of peaks, then all the peaks will be used.\n\tIf specified between 0.001 and 1, then this represents the percentage of top peaks based on the p-value (0.5 means top 50% of the peaks)." "optional"
if [[ -z "$VAR" ]] 
then
	VAR=$MEME_PEAK_CUTOFF
fi
# Write the answer (or the default substituted above), not always the default.
write_run_info "MEME_PEAK_CUTOFF" "$VAR"

read_variable "Would you like to run Mayo's Gene Ontology Analysis? (y/n)" "This determines whether to perform Mayo's Gene Ontology analysis."
case "$VAR" in
	y | yes )
	 	GOM_ANALYSIS="Yes" 
		;;
	* ) 
		GOM_ANALYSIS="No" ;;
esac
write_run_info "RUN_GOM" "$GOM_ANALYSIS"

GOM_REG_DOM="(5000 1000 100000 5000)"
read_variable "The default value for the gene regulatory domain is $GOM_REG_DOM. Specify a different value, press 'h' for formatting help, or press 'enter' to continue" "Defines the gene regulatory domain, values are separated by single space. Basal regulatory domain: from upstream 5000 bp to downstream 1000 bp; extended regulatory domain: extend upstream by an additional 100000 bp and downstream by an additional 5000 bp." "optional"
if [[ -z "$VAR" ]] 
then
	VAR=$GOM_REG_DOM
fi
write_run_info "REG_DOM" "$VAR"

ANNO_METHOD="qvalue"
read_variable "The default value for the raw p-value adjustment is $ANNO_METHOD. Specify a different value or press 'enter' to continue" "Specifies how the raw p value needs to be adjusted in GO annotation." "optional"
if [[ -z "$VAR" ]] 
then
	VAR=$ANNO_METHOD
fi
# Written under its own key so it no longer adds a second MEME_PEAK_CUTOFF line.
# NOTE(review): the key name is taken from the ANNO_METHOD variable; confirm it
# against whatever reads the run_info file.
write_run_info "ANNO_METHOD" "$VAR"

write_run_info "MEMORY_INFO" "$MEMORY_INFO_FILE"
write_run_info "TOOL_INFO" "$TOOL_INFO_FILE"

echo "Processing is complete. Your run_info file is available at $RUN_INFO"