#!/bin/bash
## Raymond Moore
## ChipSeq 2.0
## 6.12.13
# Make [[ ... == ... ]] comparisons case-insensitive, so an answer of "Y"
# is accepted wherever the prompts below compare against "y".
shopt -s nocasematch

# Script-wide state:
#   SKIP   - 0 at start; skipToLogs sets it to 1, and the sections guarded
#            by SKIP are then bypassed.
#   MACS   - 1 by default; set to 0 when the user says this is not a Macs2 run.
#   PAIRED - 1 by default; set to 0 when the user says this is not paired-end.
#   CWD    - working directory at start-up, shown in the WORK_DIR prompt.
SKIP=0
MACS=1
PAIRED=1
CWD=$(pwd)


# skipToLogs
#   Asks "Skip to Logs?" and stores the answer in the global APT.
#     "y" or an empty answer -> sets the global SKIP to 1 and returns.
#     any other answer       -> terminates the script with exit status 1.
skipToLogs() {
	echo ""
	read -p "Skip to Logs?" APT
	case "$APT" in
		y|"")
			SKIP=1
			;;
		*)
			exit 1
			;;
	esac
}



# --- Interactive setup ---------------------------------------------------
# Every question treats "y" or a bare Enter as "yes"; anything else is "no".

# Confirm that the current directory ($CWD) is the work directory; quit if not.
read -p "$CWD is a WORK_DIR? " OPT
if [[ $OPT != "y" && $OPT != "" ]]; then
	echo -e "Incorrect Directory: Quit!\n"
	exit 1
fi

# Peak caller: a "no" clears MACS, i.e. the run is treated as Sicer.
read -p "Is this a Macs2 run? " OPT
if [[ $OPT != "y" && $OPT != "" ]]; then
	echo -e "  So this must be Sicer!\n"
	MACS=0
fi

# Library layout: a "no" clears PAIRED, i.e. the run is treated as single-end.
read -p "Is this a Paired-End run? " OPT
if [[ $OPT != "y" && $OPT != "" ]]; then
	echo -e "  So this must be Single-End Sequence!\n"
	PAIRED=0
fi

######
### Look for Expected Directories; this is a dependency step.
######
# Verify that the directories the later sections read from exist in the
# current directory. Real directory tests (-d) are used rather than
# substring-matching the output of `ls`: that approach also accepted any
# file whose name merely contained the word (e.g. "catalogs.txt" for logs/)
# and, with nocasematch on, any letter case.

# requireDir DIR LABEL
#   Returns 0 when DIR is an existing directory. Otherwise prints
#   "Missing LABEL" in bold red, followed by a blank line, and returns 1.
requireDir() {
	if [[ -d "$1" ]]; then
		return 0
	fi
	echo -e "\e[1;31mMissing $2\e[0m\n"
	return 1
}

MISSINGDIR=0

# A missing delivery/ is only reported; it does not set MISSINGDIR.
# NOTE(review): later sections read delivery/... - confirm that this check
# is meant to stay a warning rather than trigger skipToLogs.
requireDir delivery "delivery/"

requireDir fastqc "fastqc/" || MISSINGDIR=1
requireDir logs "logs/" || MISSINGDIR=1
# mapout/ is checked once; it used to be tested (and reported) twice.
requireDir mapout "mapout/" || MISSINGDIR=1

# Only the output directory of the selected peak caller is required.
if [[ $MACS -eq 1 ]]; then
	requireDir macs2out "macs2out DIR" || MISSINGDIR=1
else
	requireDir sicerout "sicerout DIR" || MISSINGDIR=1
fi

if [[ $MISSINGDIR -eq 1 ]]; then
	skipToLogs
else
	echo -e "\tAll Expected Directories are here!\n"
fi


######
### Attempt to List Samples by Name
######
# List the samples found in fastqc/ unless the user already asked to skip.
# A sample name is the part of each fastqc/ entry before its first ".";
# the names are de-duplicated, counted, and printed tab-indented.
if [[ $SKIP -ne 1 ]]; then
	SAMPLECNT=$(ls fastqc/ | cut -d. -f1 | sort | uniq | wc -l)
	SAMPLENBAMES=$(ls fastqc/ | cut -d. -f1 | sort | uniq | sed 's/^/\t/')
	printf 'I have %s Samples:\n' "$SAMPLECNT"
	echo -e "$SAMPLENBAMES\n"
fi



######
### Look @ FASTQC dir
######
# Show how many entries fastqc/ holds and let the user confirm the number.
# On a "no", list the entries (tab-indented) and offer to skip to the logs.
if [[ $SKIP -ne 1 ]]; then
	FASTQCCNT=$(ls fastqc/ | wc -l)
	read -p "Do you expect $FASTQCCNT dirs in fastqc/? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls fastqc/ | sed 's/^/\t/'
		skipToLogs
	fi
fi



######
### Look @ Alignment dir
######
## Able to use *E.U* because of the paired-end > U filter naming convention.
# checkAlignment
#   Interactive review of the alignment outputs. In order, it shows and asks
#   the user to confirm:
#     - mapout/*E.U*.sorted.bam count and sizes   (paired-end runs only)
#     - mapout/*.s1.bam count and sizes
#     - line counts of mapout/*.size.txt          (paired-end Macs2 runs only)
#     - mapout/*.raw.bdg.gz count and sizes
#     - mapout/*norm.wig.gz count (sizes are printed without a question)
#     - optionally, per-chromosome entry counts inside each norm.wig.gz
#     - mapout/*norm.tdf count and sizes
#     - sizes of the copies in delivery/mapout/*norm.tdf
#   "y" or Enter counts as yes. Any other answer calls skipToLogs, which
#   either exits the script or sets SKIP=1; in the latter case this function
#   returns at once so that no further alignment questions are asked.
#   Globals read: PAIRED, MACS. Variables assigned here (OPT, the *CNT
#   counters, WIGFH) are left global, as they were in the top-level code.
checkAlignment() {
	if [[ $PAIRED -eq 1 ]]; then
		MAPCNT=$(ls mapout/*E.U*.sorted.bam | wc -l)
		read -p "Do you expect $MAPCNT sorted bams in mapout/? " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then
			ls mapout/*E.U*.sorted.bam | sed 's/^/\t/'
			skipToLogs
			return
		fi
		echo "Here are the relative sizes:"
		du -hs mapout/*E.U*.sorted.bam | sed 's/^/\t/'
		read -p "Are these sizes ok? " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi
	fi

	MAP2CNT=$(ls mapout/*.s1.bam | wc -l)
	read -p "Do you expect $MAP2CNT filtered/deduplicated bams in mapout/? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls mapout/*.s1.bam | sed 's/^/\t/'
		skipToLogs
		return
	fi

	echo "Here are the relative sizes (should be half!):"
	du -hs mapout/*.s1.bam | sed 's/^/\t/'
	read -p "Are these sizes ok? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi

	### in alignment, an additional file is created to input into Macs2, only for PE alignment
	if [[ $PAIRED -eq 1 && $MACS -eq 1 ]]; then
		echo "Since this is Macs2 Run, here are the # of Insert Sizes (~300+):"
		# wc output is left-trimmed and its spaces squeezed into tabs before indenting.
		wc -l mapout/*.size.txt | awk '{sub(/^[ \t]+/, ""); print}' | tr -s " " "\t" | sed 's/^/\t/'
		read -p "Are these counts ok? " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi
	fi

	### generated before:wig
	BDGCNT=$(ls mapout/*.raw.bdg.gz | wc -l)
	read -p "Do you expect $BDGCNT Beta Graph Files [.bdg.gz] in mapout/? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls mapout/*.raw.bdg.gz | sed 's/^/\t/'
		skipToLogs
		return
	fi

	echo "Here are the relative sizes (< 1b):"
	du -hs mapout/*.raw.bdg.gz | sed 's/^/\t/'
	read -p "Are these sizes ok? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi

	### generated after:bdg
	### this is the index file for UCSC genome browser.
	WIGCNT=$(ls mapout/*norm.wig.gz | wc -l)
	read -p "Do you expect $WIGCNT WIG files in mapout/? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls mapout/*norm.wig.gz | sed 's/^/\t/'
		skipToLogs
		return
	fi

	# The WIG sizes are displayed for information only; no confirmation is asked.
	echo "Here are the relative sizes (< 1b):"
	du -hs mapout/*norm.wig.gz | sed 's/^/\t/'

	# Optional deep check. Only an explicit "y" starts it. The test used to be
	# negated, so "y" skipped the dig while "n" ran it; a bare Enter still
	# skips the dig, exactly as it did before.
	read -p "Dig into the norm.wig.gz files this time? " OPT
	if [[ $OPT == "y" ]]; then
		### Special WIG file check for empty entries in a particular chromosome
		for WIGFH in mapout/*norm.wig.gz; do
			echo -e "\t$WIGFH\n\t\tCHR  -> # Entries"
			# The perl one-liner records the line number and name of every
			# "=chr..." header, then prints for each header the line-number
			# distance to the next header (or to the last line of the file).
			zcat "$WIGFH" | \
			perl -e '@h;while(<>){if($_=~/=(chr\S+)/){push(@h,$.);push(@h,$1)}} push(@h,$.); for($i=0;$i<$#h;$i+=2){print "\t\t$h[$i+1] -> ".($h[$i+2]-$h[$i])."\n"}'
			read -p "Are all of the entries present? " OPT
			if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi
		done
	fi

	### generated after:wig
	TDFCNT=$(ls mapout/*norm.tdf | wc -l)
	read -p "Do you expect $TDFCNT TDF files in mapout/? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls mapout/*norm.tdf | sed 's/^/\t/'
		skipToLogs
		return
	fi

	echo "Here are the relative sizes (< 1b):"
	du -hs mapout/*norm.tdf | sed 's/^/\t/'
	read -p "Are these sizes ok? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi

	echo "Did the files copy to delivery correctly? (matches above):"
	du -hs delivery/mapout/*norm.tdf | sed 's/^/\t/'
	read -p "Are these files ok? " OPT
	#expand this to fix copy issues in line (8M difference still seems to be ok...weird, but greater than not.)
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi
}

# Run the alignment review only while SKIP is still 0, i.e. the user has not
# asked to skip ahead to the logs.
if [[ $SKIP == 0 ]]; then
	checkAlignment
fi ##end align



# #### this script can generate new tdf file
# IGVTOOLS=/data2/bsi/staff_analysis/m105265/packages/IGVTools
# IGV_GENOME=/data2/bsi/staff_analysis/m105265/packages/IGVTools/genomes/mm10.genome
# #IGV_GENOME=/data2/bsi/staff_analysis/m105265/packages/IGVTools/genomes/hg19.genome
# for files in *norm.wig.gz
# do
# FNAME=$(basename $files .wig.gz)
# pigz -d -k $files
# ${IGVTOOLS}"/"igvtools toTDF ${FNAME}.wig ${FNAME}.tdf ${IGV_GENOME}
# rm ${FNAME}.wig
# done




######
### Look @ Peak Calling dir
######
# Peak-calling review. "y" or Enter counts as yes at every question. Any other
# answer calls skipToLogs, which either exits the script or sets SKIP=1; the
# review function then returns immediately so no further questions (including
# the remaining loop iterations) are asked. Variables assigned inside the
# functions stay global, as they were in the top-level code.

# checkMacs2Results
#   Reviews delivery/macs2out/: number of analyses, line counts per file, and
#   a preview (columns 2 and 9 of the first 6 lines) of each *peak_vs_gene.xls.
checkMacs2Results() {
	echo -e "\n Looking into Macs2 Results"
	# Count everything except *.bdg and *.r files and divide by 5 (presumably
	# five such files per analysis - confirm). The dots are escaped so that
	# only real ".bdg"/".r" suffixes are excluded, not any name ending in "r".
	MACSETCNT=$(ls delivery/macs2out/* | grep -v '\.bdg$' | grep -v '\.r$' | wc -l | awk '{print $1/5}')
	read -p "Do you expect $MACSETCNT analyses? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls delivery/macs2out/* | cut -d. -f1 | sort | uniq | sed 's/^/\t/'
		skipToLogs
		return
	fi

	echo "Here are the number of records per file (< 1 row):"
	wc -l delivery/macs2out/* | awk '{sub(/^[ \t]+/, ""); print}' | tr -s " " "\t" | sed 's/^/\t/'
	read -p "Are these sizes ok? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi

	echo "Look inside the Peak vs Gene files:"
	for PG in delivery/macs2out/*peak_vs_gene.xls; do
		echo "$PG "
		head -6 "$PG" | cut -f 2,9 | sed 's/^/\t/'
		read -p "Is this acceptable? " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then
			# Show a wider slice (columns 2 and 9-18, first 8 lines) before offering to skip.
			head -8 "$PG" | cut -f 2,9-18 | sed 's/^/\t/'
			skipToLogs
			return
		fi
	done
}

# checkSicerResults
#   Reviews delivery/sicerout/: number of *_peak_vs_gene.xls files, their line
#   counts, a preview of each, and the per-sample file listing.
checkSicerResults() {
	echo -e "\n Looking into Sicer Results"

	# Counted in delivery/sicerout/ so the number describes the same directory
	# as the listing shown on a "no" (the count used to be taken from sicerout/).
	SICECNT=$(ls delivery/sicerout/*_peak_vs_gene.xls | wc -l)
	read -p "Do you expect $SICECNT analyses? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then
		ls delivery/sicerout/* | cut -d. -f1 | sort | uniq | sed 's/^/\t/'
		skipToLogs
		return
	fi

	echo "Here are the number of records per _peak_vs_gene.xls file (Non-Empty):"
	wc -l delivery/sicerout/*_peak_vs_gene.xls
	read -p "Are these sizes ok? " OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi

	echo "Look inside the Peak vs Gene files:"
	for PG in delivery/sicerout/*peak_vs_gene.xls; do
		echo "$PG "
		head -6 "$PG" | cut -f 2,9 | sed 's/^/\t/'
		read -p "Is this acceptable? " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then
			head -8 "$PG" | cut -f 2,9-18 | sed 's/^/\t/'
			skipToLogs
			return
		fi
	done

	# Sample name = text before the first "." of each entry in delivery/sicerout/.
	DELIVEREDSAMPLES=($(ls delivery/sicerout/ | cut -d'.' -f1 | sort | uniq))
	for SAMP in "${DELIVEREDSAMPLES[@]}"; do
		echo -e "\e[1;35m $SAMP \e[0m"
		ls delivery/sicerout/"$SAMP"* | cut -d'/' -f3 | sed 's/^/\t/'
		read -p "Are all 4 Files present? (*xls,*bed,*summary,*summary-FDR1E-2) " OPT
		if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; return; fi
	done
}

# Run the review for the selected peak caller only while SKIP is still 0,
# i.e. the user has not asked to skip ahead to the logs.
if [[ $SKIP == 0 ]]; then
	if [[ $MACS -eq 1 ]]; then
		checkMacs2Results
	else
		checkSicerResults
	fi
fi ##end peak caller


########
### Look into Delivery Directory more
########
# Confirm the top-level deliverables. This section is not guarded by SKIP.
# The matching path(s) are printed in green between "> <" markers, then the
# user confirms ("y" or Enter = yes).
# The IGV question is asked only when the HTML report was confirmed: after a
# "no", skipToLogs either exits or sets SKIP=1, and the remaining delivery
# question is then skipped as well instead of still being asked.
DELIV01=$(ls delivery/*html)
echo -e "\t\e[1;32m > $DELIV01 < \e[0m"
read -p "Do I have my HTML Report?" OPT
if [[ $OPT != "y" && $OPT != "" ]]; then
	skipToLogs
else
	DELIV02=$(ls delivery/igv*)
	echo -e "\t\e[1;32m > $DELIV02 < \e[0m"
	read -p "Do I have my IGV Session?" OPT
	if [[ $OPT != "y" && $OPT != "" ]]; then skipToLogs; fi
fi


######
### Look @ Logs, if there is an issue
######
# Optional log review; "y" or Enter starts it.
#   - LASTRECORD: space-separated fields 10-12 of the last `ls -ltr logs/` line.
#   - LASTINDEX:  dot-separated field 6 of the last name in `ls -tr logs/`;
#                 only logs whose name contains it are looked at.
#   - LOGSETS:    unique values of dot-separated field 4 of those log paths.
# For every log set the matching files' sizes are shown and the user chooses
# whether to dig in. For each file of a chosen set, the lines not starting
# with "+" are printed, then (after a confirmation) the lines matching
# "error" or "abort" regardless of case. Answering anything but "y"/Enter to
# either "Continue" question prints the log's path and exits with status 1.
read -p "Do you want to investigate the logs?" OPT
if [[ $OPT == "y" || $OPT == "" ]]; then  ### maybe take auto out?
	LASTRECORD=$(ls -ltr logs/ | tail -1 | cut -f10-12 -d' ')
	echo -e "\tLast Record: $LASTRECORD"
	LASTINDEX=$(ls -tr logs/ | tail -1 | cut -d'.' -f6)
	echo -e "\tGrabbing Only logs indexed by: $LASTINDEX"

	LOGSETS=($(ls logs/*$LASTINDEX* | cut -d'.' -f4 | sort | uniq))
	for logset in "${LOGSETS[@]}"; do
		du -hs logs/*$logset*$LASTINDEX* | sed 's/^/\t/'
		read -p "Dig into logs for: $logset?" APT
		# Not a yes: move on to the next log set.
		if [[ $APT != "y" && $APT != "" ]]; then
			continue
		fi

		CWLOGS=($(ls logs/*$logset*$LASTINDEX*))
		for logfile in "${CWLOGS[@]}"; do
			echo -e "\t>> $logfile"
			grep -v '^+' $logfile | sed 's/^/\t/'
			read -p "Continue? Look for the term 'error'." BPT
			if [[ $BPT != "y" && $BPT != "" ]]; then
				echo -e "\t$logfile"
				exit 1
			fi
			echo ""
			grep -i 'error\|abort' $logfile | sed 's/^/\t/'
			read -p "Continue to next log?" BPT
			if [[ $BPT != "y" && $BPT != "" ]]; then
				echo -e "\t$logfile"
				exit 1
			fi
		done
	done
fi



