#!/bin/bash

#set programs and paths:

# ---------------------------------------------------------------------------
# Argument handling and derived read lengths.
#   $1 - work directory (optional; defaults to the current directory)
#   $2 - R1 read length (required), e.g. 250
# Reads R1PAIRED_READ_TYPE from the environment.
# ---------------------------------------------------------------------------
workDir=$1
[ "$workDir" == "" ] && workDir=$(pwd)

# Validate the read length BEFORE touching the disk, so that a bad invocation
# neither wipes an existing workspace nor reports success to the caller.
read_r1_length=$2
if [ -z "${read_r1_length}" ]; then
	echo "please set read length (i.e. 250)" >&2
	exit 1
fi

# Start from an empty workspace; every intermediate file is written below it.
if [ -e "$workDir/workspace" ]; then
	rm -r -- "$workDir/workspace" || exit 1
fi
mkdir -p -- "$workDir/workspace/imtornado" || exit 1
# Abort when the directory cannot be entered; otherwise every later step would
# write its files into whatever directory the script was started from.
cd -- "$workDir/workspace/imtornado" || exit 1

# R2 trim length is 80% of R1 and the Trimmomatic MINLEN is 75% of R1.
# bc returns decimals (e.g. 200.0), so the fractional part is cut off.
read_r2_length=$(echo "$read_r1_length*0.8" | bc)
read_r2_length=${read_r2_length%.*}
read_min_length=$(echo "$read_r1_length*0.75" | bc)
read_min_length=${read_min_length%.*}
# Minimum sequence length used when OTUing the paired set (R1 + R2).
read_paired_length=$(echo "$read_r1_length+$read_r2_length" | bc)

# Read type 1 uses a fixed minimum paired length instead of R1 + R2.
if [ "$R1PAIRED_READ_TYPE" -eq 1 ]; then
	read_paired_length=200
fi

sub_QC(){
#Filter with Trimmomatic
# Quality-filters one FASTQ file and converts the surviving reads to FASTA.
# Globals read: x (path of the input FASTQ, set by the caller's loop),
#               read_min_length, R1PAIRED_TRIMMOMATIC, R1PAIRED_TRIMMOMATIC_para,
#               R1PAIRED_PYTHON, R1PAIRED_SOURCE_DIR
# Writes (cwd): <base>.trimmomatic.fastq, <base>.trimmomatic.fasta,
#               <base>.trim.fasta; Trimmomatic's stderr is appended to
#               trimmomatic.log
# The R1PAIRED_* tool variables are expanded unquoted on purpose: they may hold
# a command plus its options (assumption - confirm against the launcher).
	local fastqname extension fastqbase
	fastqname=$(basename -- "$x")
	echo "proessing $x $(date +%T)"
	extension=${x##*.} # get file extension
	if [ "$extension" == gz ]; then
		extension="fastq.gz"
	fi
	# Strip the extension so every output file shares the sample's base name.
	fastqbase=${fastqname%."$extension"}
	if [ "$R1PAIRED_TRIMMOMATIC_para" == "" ]; then
		$R1PAIRED_TRIMMOMATIC -phred33 "$x" "${fastqbase}.trimmomatic.fastq" LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:"$read_min_length" 2>> trimmomatic.log
	else
		# NOTE(review): the custom parameters are handed over as ONE argument;
		# confirm this is intended when several trimming steps are configured.
		$R1PAIRED_TRIMMOMATIC -phred33 "$x" "${fastqbase}.trimmomatic.fastq" "${R1PAIRED_TRIMMOMATIC_para}" 2>> trimmomatic.log
	fi
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_fastq2fasta.py" "${fastqbase}.trimmomatic.fastq" "${fastqbase}.trimmomatic.fasta"
#echo "Remove ambigs..."
#no check for polys... let OTUing take care of this
#there has to be a faster way to do this.
	# The trailing /1 or /2 mate suffix is removed from each line on the fly.
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_remove_ambigs.py" <(sed 's/\/[12]$//g' "${fastqbase}.trimmomatic.fasta") "${fastqbase}.trim.fasta"
}

sub_merge(){
# Renames and trims the per-sample *R1.trim.fasta reads via
# tornado_rename_ids.py and checks that its four outputs were produced.
# Globals read: read_r1_length, read_r2_length, R1PAIRED_PYTHON,
#               R1PAIRED_SOURCE_DIR
# Expected outputs (cwd): test_R1.fasta, test_R2.fasta, test.groups,
#                         test.common.accnos
# A missing/empty test.common.accnos aborts the script; the other three only
# produce a warning on stderr.

echo "sub_merge..."
#Rename read IDs according to sample ID (QIIME convention).
#Trim read lengths to specified numbers
#do not discard orphans
#output the common ids to PREFIX.common.accnos
#...some codes here....

#Rename and trim
	# Remove leftovers first so the size checks below cannot be satisfied by
	# files from an earlier run.
	rm -f -- test_R1.fasta test_R2.fasta test.groups test.common.accnos
	echo "merge reads... "$(date +%T)
	# The glob is left unquoted on purpose: it must expand to all R1 files.
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_rename_ids.py" --trim_r1 "$read_r1_length" --trim_r2 "$read_r2_length" "test" *R1.trim.fasta
#output=test_R1.fasta, test_R2.fasta, test.groups, test.common.accnos
	[ -s "test_R1.fasta" ] || echo "File test_R1.fasta not found or zero sized." 1>&2
	[ -s "test_R2.fasta" ] || echo "File test_R2.fasta not found or zero sized." 1>&2
	[ -s "test.groups" ] || echo "File test.groups not found or zero sized." 1>&2
	if [ ! -s "test.common.accnos" ]; then
		# Use the caller's die helper when one exists; this file does not define
		# one, so fall back to an explicit message and a non-zero exit instead
		# of a "command not found" that lets the pipeline continue.
		if command -v die >/dev/null 2>&1; then
			die 1 "File test.common.accnos not found or zero sized."
		else
			echo "File test.common.accnos not found or zero sized." 1>&2
			exit 1
		fi
	fi
}

sub2_concatenate_read_pairs(){
# Keeps only the reads of one mate file whose ids are listed in
# test.common.accnos and rewrites them with one sequence line per record.
#   $1     - mate label (R1 or R2)
#   reads  - test.common.accnos, test_<mate>.fasta
#   writes - test_<mate>.common.fasta, test_<mate>.common.flat.fasta
# Only used for R1+R2 OTUing.
	local mate=$1
	echo "sub2_concatenate_read_pairs ${mate} $(date +%T)"

	# Step 1: pick the common ids (blank lines of the id list are dropped).
	echo "Pick common ids... ${mate} $(date +%T)"
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py \
		<(sed '/^\s*$/d' test.common.accnos) \
		"test_${mate}.fasta" "test_${mate}.common.fasta"

	# Step 2: flatten. awk buffers the sequence lines of a record and emits
	# them when the next header (or the end of input) is reached; that prints
	# one empty line ahead of the first header, which `read` swallows.
	# (Rewritten earlier because very large files produced empty output.)
	echo "Flatten read files...${mate} $(date +%T)"
	awk '
		/^>/ { print pre; print; pre = ""; next }
		     { pre = pre $0 }
		END  { print pre }
	' < "test_${mate}.common.fasta" \
		| { read; cat; } > "test_${mate}.common.flat.fasta"
}
sub_concatenate_read_pairs(){ #for paired only
# Builds the paired read set from the flattened R1 and R2 files.
# Globals read: R1PAIRED_READ_TYPE, R1PAIRED_JAVA, R1PAIRED_PANDASEQ,
#               R1PAIRED_SOURCE_DIR
# Writes (cwd): test_paired.fasta and test_padded.fasta
#   read type 1 - mates are merged by pandaseq; padded is a copy of paired
#   read type 2 - mates are joined end to end; padded has 8 N's between them
# Any other read type prints an error on stderr and exits with status 255
# (the status bash produced for the former `exit -1`).
#Strip out the R2 reads from their IDs (this step is skipped by Jeff)
#and concatenate
	local mate
	echo "sub_concatenate_read_pairs: "$(date +%T)
	sub2_concatenate_read_pairs R1
	sub2_concatenate_read_pairs R2
	# The variable is quoted and the test's own diagnostics are silenced: an
	# unset or non-numeric value simply falls through to the error branch.
	if [ "$R1PAIRED_READ_TYPE" -eq 1 ] 2>/dev/null; then
		echo "R1PAIRED_READ_TYPE=$R1PAIRED_READ_TYPE"
		for mate in R1 R2; do
			$R1PAIRED_JAVA -jar "${R1PAIRED_SOURCE_DIR}/scripts/MicrobiomeRetrieveFastq.jar" ./ "test_${mate}.common.flat.fasta" > "test_${mate}_pandaseq.fastq"
		done
		$R1PAIRED_PANDASEQ -f test_R1_pandaseq.fastq -r test_R2_pandaseq.fastq -o 5 2>test_pandaseq.log | sed 's/_original_id=/ original_id=/' > test_paired.fasta
		cp test_paired.fasta test_padded.fasta
	elif [ "$R1PAIRED_READ_TYPE" -eq 2 ] 2>/dev/null; then
		# paste puts the R1 and R2 line side by side. For header lines the
		# first half of the joined line is kept.
		# NOTE(review): this presumes both files carry identical headers in the
		# same order - confirm.
		paste test_R1.common.flat.fasta test_R2.common.flat.fasta | awk '{if(index($0,">")==1){s=substr($0, 1,length($0)/2);print s}else{print $1$2}}' > test_paired.fasta
#also concatenate with a gap
		paste test_R1.common.flat.fasta test_R2.common.flat.fasta | awk '{if(index($0,">")==1){s=substr($0, 1,length($0)/2);print s}else{print $1"NNNNNNNN"$2}}' > test_padded.fasta
	else
		echo "something wrong: R1PAIRED_READ_TYPE=${R1PAIRED_READ_TYPE}" 1>&2
		exit 255
	fi
}


sub2_extract_paired_names(){
# Paired set only: extracts the original read id recorded in each OTU header,
# picks those reads from the gap-padded set and renames them. The result is
# used later for taxonomy and alignment.
#   reads  - test_paired.otus1.5.fasta, test_padded.fasta
#   writes - test_paired.original.accnos, test_padded.otus.fasta,
#            test_padded.otus2.fasta
	local tool_dir=${R1PAIRED_SOURCE_DIR}/external

	# Header -> original id: drop '>', turn blanks into tabs, cut everything
	# from the first ';', collapse "<tab>...=" into a bare tab, keep column 2.
	grep ">" test_paired.otus1.5.fasta \
		| tr -d '>' \
		| tr ' ' '\t' \
		| sed -e 's/;.*//' -e 's/\t.*=/\t/' \
		| cut -f 2 \
		> test_paired.original.accnos

	# Fetch exactly those reads from the padded (gapped) read set.
	$R1PAIRED_PYTHON $tool_dir/tornado_read_picker.py test_paired.original.accnos test_padded.fasta test_padded.otus.fasta

	# Rename again; the original author assumes the record order is preserved.
	# The trailing 1 is passed to the renamer unchanged (only the fasta name
	# format changes, per the original note).
	$R1PAIRED_PYTHON $tool_dir/tornado_otu_renamer.py test_padded.otus.fasta test_padded.otus2.fasta 1
}


sub_process_OTU(){
# Runs the OTU steps for one read set: dereplicate (with size annotation and a
# minimum length), drop singletons, cluster, then rename the OTU ids.
#   $1     - read set label: R1, R2, or paired
#   $2     - minimum sequence length handed to -minseqlength
#   reads  - test_<set>.fasta
#   writes - test_<set>.derep.fasta, .derep2.fasta (+ .log), .otus.fasta
#            (+ .log), .otus2.fasta; for "paired" also test_paired.otus1.5.fasta
#            plus whatever sub2_extract_paired_names produces
	local set_name=$1
	local min_len=$2
	local stem="test_${set_name}"
	echo "sub_process_OTU: ${set_name} read_length=${min_len} $(date +%T)"

	# Dereplicate, annotating each unique sequence with its abundance.
	$R1PAIRED_USEARCH -derep_fulllength "${stem}.fasta" -fastaout "${stem}.derep.fasta" -sizeout -minseqlength $min_len

	# -minsize 2 removes the singletons before clustering.
	$R1PAIRED_USEARCH -sortbysize "${stem}.derep.fasta" -fastaout "${stem}.derep2.fasta" -minsize 2 > "${stem}.derep2.log"
	$R1PAIRED_USEARCH -cluster_otus "${stem}.derep2.fasta" -otus "${stem}.otus.fasta" > "${stem}.otus.log"

	# The paired set needs an extra renamed copy from which the original read
	# ids are extracted.
	if [ "$set_name" == "paired" ]; then
		$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_otu_renamer.py test_paired.otus.fasta test_paired.otus1.5.fasta 1
		sub2_extract_paired_names
	fi

	# Rename the OTU ids of this set.
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_otu_renamer.py "${stem}.otus.fasta" "${stem}.otus2.fasta" 1
}


sub2_prepare_paired_to_align(){
# Paired set: derives, from the headers of test_paired.otus3.fasta, a
# two-column map and the list of ids to look up for the separate alignments.
#   writes - test_paired.align.map  (header column 2 <tab> header column 1)
#            test.to_align.accnos   (first column of the map)
	# Same header clean-up as used for the original-id extraction, followed by
	# swapping the two resulting columns.
	grep ">" test_paired.otus3.fasta \
		| tr -d '>' \
		| tr ' ' '\t' \
		| sed -e 's/;.*//' -e 's/\t.*=/\t/' \
		| awk 'BEGIN { OFS = "\t" } { print $2, $1 }' \
		> test_paired.align.map

	# The ids to search for in the read files.
	cut -f 1 test_paired.align.map > test.to_align.accnos
}



sub_get_taxonomy_representatives(){
# Classifies the OTU representatives of one read set with mothur against the
# GG99 database (once with and once without support values), then removes
# every OTU whose taxonomy line lacks "k__" from the OTU file.
#   $1 - read set label: R1, R2 or paired
# Globals read: R1PAIRED_MOTHUR, R1PAIRED_MOTHUR_THREADS, R1PAIRED_PYTHON,
#               R1PAIRED_SOURCE_DIR
# Writes (cwd): test_<set>.otus2.probs.GG99.wang.taxonomy,
#               test_<set>.otus2.nonprobs.GG99.wang.taxonomy,
#               bad_taxonomy_<set>.accnos, test_<set>.otus3.fasta
	local RX=$1 #RX=R1, R2 or paired
	# For the paired set mothur classifies the padded fasta; kept local so the
	# name does not leak into the rest of the script.
	local RXX=$RX
	# Both mothur runs honour the configured thread count; 4 (the value that
	# used to be hard-coded in the second run) is the fallback when unset.
	local threads=${R1PAIRED_MOTHUR_THREADS:-4}
	local db=${R1PAIRED_SOURCE_DIR}/external/database
	echo "sub_get_taxonomy_representatives: "$RX" "$(date +%T)
#get the taxonomy of the representatives
	if [ "$RXX" == "paired" ];then
		RXX="padded"
	fi
	$R1PAIRED_MOTHUR "#set.logfile(name=test_$RX.otus2.GG99.wang.probs.log);classify.seqs(fasta=test_$RXX.otus2.fasta, taxonomy=${db}/GG99.taxonomy, template=${db}/GG99.fna, processors=${threads}, iters=1000)" > /dev/null

#copy the taxonomy files to the results (I used mv)
	# Both runs write the same output name, so the first result must be moved
	# away before the second run.
	mv "test_$RXX.otus2.GG99.wang.taxonomy" "test_$RX.otus2.probs.GG99.wang.taxonomy"

#do this again, with no support values
	# NOTE(review): this log name uses "test." where the first run uses
	# "test_"; left unchanged in case something collects it by name.
	$R1PAIRED_MOTHUR "#set.logfile(name=test.$RX.otus2.GG99.wang.nonprobs.log);classify.seqs(fasta=test_$RXX.otus2.fasta, taxonomy=${db}/GG99.taxonomy, template=${db}/GG99.fna, probs=false, processors=${threads})" > /dev/null
#this step makes some differences (see file size listed)

#copy the taxonomy files to the results (I used mv to change names, this step is not in original script)
	mv "test_$RXX.otus2.GG99.wang.taxonomy" "test_$RX.otus2.nonprobs.GG99.wang.taxonomy"

echo "Filter bad reads..."
#find the ones that are obviously not 16S
	grep -v "k__" "test_$RX.otus2.nonprobs.GG99.wang.taxonomy" | cut -f 1 > "bad_taxonomy_$RX.accnos"

#And remove those reads from the otu file
#checked file size, correct!
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py" "bad_taxonomy_$RX.accnos" "test_$RX.otus2.fasta" "test_$RX.otus3.fasta"
	if [ "$RX" == "paired" ]; then
		sub2_prepare_paired_to_align
	fi
}

sub_rename_to_OTU_ID(){
# Paired set: pulls the reads listed in test.to_align.accnos out of the R1 and
# R2 read files and renames their ids using test_paired.align.map.
#   reads  - test.to_align.accnos, test_paired.align.map, test_R1.fasta,
#            test_R2.fasta
#   writes - test_paired.R1.fasta, test_paired.R2.fasta,
#            test_paired.R1.otus3.fasta, test_paired.R2.otus3.fasta
	echo "sub_rename_to_OTU_ID: $(date +%T)"
	local RX=$1 # accepted for parity with the other steps; not used below
	local mate

	# Get those ids from the original files, R1 first, then R2.
	for mate in R1 R2; do
		$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py test.to_align.accnos "test_${mate}.fasta" "test_paired.${mate}.fasta"
	done

	# Rename the ids to the OTU ids (outputs were once named *.to_align.fasta).
	for mate in R1 R2; do
		$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_custom_read_renamer.py test_paired.align.map "test_paired.${mate}.fasta" "test_paired.${mate}.otus3.fasta"
	done
}

sub_process_to_align_STK(){
# Aligns the OTU sequences of one set with cmalign, converts the Stockholm
# output to FASTA and lists the reads whose score column is negative.
#   $1     - set label: R1, R2, paired.R1, paired.R2
#            (paired.R1, paired.R2 may not be necessary, per the original note)
#   reads  - test_<set>.otus3.fasta and the seed.16s.reference_model.cm model
#   writes - test_<set>.scores, test_<set>.otus3.aligned.stk,
#            test_<set>.otus3.log, test_<set>.otus3.aligned.fasta,
#            bad_align_<set>.accnos
	local set_name=$1
	local stem="test_${set_name}"
	echo "sub_process_to_align_STK: ${set_name} $(date +%T)"

	# Align the reads; there may still be some bad reads among them. cmalign's
	# standard output is kept as a log file.
	$R1PAIRED_CMALIGN --cpu $R1PAIRED_CMALIGN_THREADS -g --notrunc --sub --dnaout --noprob \
		--sfile "${stem}.scores" -o "${stem}.otus3.aligned.stk" \
		${R1PAIRED_SOURCE_DIR}/external/database/seed.16s.reference_model.cm \
		"${stem}.otus3.fasta" > "${stem}.otus3.log"

	# Convert stk to fasta.
	echo "Convert to fasta ${set_name}"
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_stk2fasta.py "${stem}.otus3.aligned.stk" "${stem}.otus3.aligned.fasta"

	# Badly aligned reads: skip every line containing '#', then report column 2
	# of each row whose column 7 is below zero.
	echo "Filter bad reads"
	grep -v '#' "${stem}.scores" | awk '$7 < 0 { print $2 }' > "bad_align_${set_name}.accnos"
}

sub_remove_bad_from_OTU(){
# Removes the reads listed in bad_align_<set>.accnos from the OTU file of one
# set and stores the outcome under its final result name.
#   $1     - set label: R1, R2, paired
#   reads  - bad_align_<set>.accnos, test_<set>.otus3.fasta
#   writes - test_<set>.otus.final.result.fasta
	local set_name=$1
	local cleaned="test_${set_name}.otus.final.fasta"
	echo "sub_remove_bad_from_OTU: ${set_name} $(date +%T)"

	# Remove them from the OTU file.
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py "bad_align_${set_name}.accnos" "test_${set_name}.otus3.fasta" "$cleaned"

	# The final OTUs are themselves a result, so the file is renamed (not
	# copied) to the result name.
	mv "$cleaned" "test_${set_name}.otus.final.result.fasta"
}

sub_remove_bad_from_alignment(){
# Cleans the alignment of one set in three passes: drop the reads listed in
# bad_align_<set>.accnos, remove gapped columns, then normalise characters.
#   $1     - set label: R1, R2, paired.R1, paired.R2
#   reads  - bad_align_<set>.accnos, test_<set>.otus3.aligned.fasta
#   writes - test_<set>.otus3.aligned.clean.fasta, .clean2.fasta, .clean3.fasta
	local set_name=$1
	local aligned="test_${set_name}.otus3.aligned"
	echo "sub_remove_bad_from_alignment: ${set_name} $(date +%T)"

	# Pass 1: remove the bad reads from the alignment.
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py "bad_align_${set_name}.accnos" "${aligned}.fasta" "${aligned}.clean.fasta"

	# Pass 2: remove gapped columns from the alignment.
	echo "Remove gapped columns"
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_remove_gaps.py "${aligned}.clean.fasta" "${aligned}.clean2.fasta"

	# Pass 3: periods become dashes and letters become uppercase. This applies
	# to every line of the file, header lines included.
	tr '.' '-' < "${aligned}.clean2.fasta" | tr 'a-z' 'A-Z' > "${aligned}.clean3.fasta"
}

sub_join_paired_alignment(){
# Paired set: produces the gap-free per-mate OTU results and joins the cleaned
# R1 and R2 alignments into a single alignment.
#   reads  - test_paired.R1/R2.otus3.aligned.clean3.fasta
#   writes - test_paired.R1/R2.otus.final.result.fasta,
#            test.otus3.aligned.common.accnos,
#            test_paired.R1/R2.otus3.aligned.common.fasta,
#            test_paired.R1/R2.otus3.aligned.flat.fasta,
#            test_paired.otus3.aligned.fasta
# (Copying the alignments to ../result was skipped by Jeff.)
	local mate
	local tool_dir=${R1PAIRED_SOURCE_DIR}/external
	echo "sub_join_paired_alignment: $(date +%T)"

	# Independent OTUs: delete every '-' from the cleaned alignments.
	for mate in R1 R2; do
		cat "test_paired.${mate}.otus3.aligned.clean3.fasta" | tr -d '-' > "test_paired.${mate}.otus.final.result.fasta"
	done

	# Join the paired alignments; first find the common reads, just in case.
	$R1PAIRED_PYTHON $tool_dir/tornado_common_reads.py test_paired.R1.otus3.aligned.clean3.fasta test_paired.R2.otus3.aligned.clean3.fasta test.otus3.aligned.common.accnos
	for mate in R1 R2; do
		$R1PAIRED_PYTHON $tool_dir/tornado_read_picker.py test.otus3.aligned.common.accnos "test_paired.${mate}.otus3.aligned.clean3.fasta" "test_paired.${mate}.otus3.aligned.common.fasta"
	done

	# Flatten: one sequence line per record. awk emits the buffered sequence
	# when the next header arrives, which prints an empty line ahead of the
	# first header; `read` swallows it.
	for mate in R1 R2; do
		awk '
			/^>/ { print pre; print; pre = ""; next }
			     { pre = pre $0 }
			END  { print pre }
		' < "test_paired.${mate}.otus3.aligned.common.fasta" \
			| { read; cat; } > "test_paired.${mate}.otus3.aligned.flat.fasta"
	done

	# Concatenate the mates line by line (stripping R2 from the ids was skipped
	# by Jeff). Header lines keep the first half of the pasted line.
	# NOTE(review): presumes identical headers in both files - confirm.
	# Output was formerly named test_paired.aligned.flat.fasta.
	echo "Concatenate read pairs"
	paste test_paired.R1.otus3.aligned.flat.fasta test_paired.R2.otus3.aligned.flat.fasta \
		| awk '{if(index($0,">")==1){s=substr($0, 1,length($0)/2);print s}else{print $1$2}}' \
		> test_paired.otus3.aligned.fasta
}

sub_do_FastTree(){
# Builds the tree for one set with FastTree, then maps that set's reads onto
# its final OTUs with usearch, working through the reads in chunks.
#   $1 - set label: R1, R2, paired
# Globals read: R1PAIRED_FASTTREEMP, R1PAIRED_FASTTREE_para, R1PAIRED_USEARCH,
#               R1PAIRED_USEARCH_THREADS
# Reads (cwd):  test_<set>.otus3.aligned.fasta, test_<set>.fasta,
#               test_<set>.otus.final.result.fasta
# Writes (cwd): test_<set>.tree (+ .tree.log), test_<set>.fasta.<n>,
#               test_<set>.uc.<n> (+ .log), test_<set>.uc
	local RX=$1 #RX=R1, R2, paired
	# Chunk count and loop index stay local so they cannot leak into, or be
	# clobbered by, the rest of the script.
	local fidx i
	echo "sub_do_FastTree: "$RX" "$(date +%T)
#now we can tree
# module load fasttree/2.1.7
	# R1PAIRED_FASTTREE_para is unquoted on purpose: it carries several options
	# (formerly hard-coded as -nt -gtr -gamma -spr 4 -mlacc 2 -slownni).
	$R1PAIRED_FASTTREEMP ${R1PAIRED_FASTTREE_para} -out "test_${RX}.tree" "test_${RX}.otus3.aligned.fasta" 2> "test_${RX}.tree.log"

#copy the trees to the results directory
#...skipped..by Jeff

#now map the reads into the OTU buckets
#split the fastas into per 100000,
	# A new chunk starts once a chunk already holds 100000 headers. The finished
	# chunk is closed before the next one is opened, so awk never holds more
	# than one output file open regardless of input size. awk prints the number
	# of chunks at the end.
	fidx=$(cat "test_${RX}.fasta" | awk -v RX="$RX" '
		BEGIN { fidx = 1; cnt = 0; out = "test_" RX ".fasta." fidx }
		index($0, ">") == 1 {
			cnt++
			if (cnt > 100000) {
				close(out)
				cnt = 1
				fidx++
				out = "test_" RX ".fasta." fidx
			}
		}
		{ print > out }
		END { print fidx }
	')
	# Start the combined .uc file from scratch; the loop appends to it.
	rm -f -- "test_${RX}.uc"
	for ((i = 1; i <= fidx; i++))
	do
		$R1PAIRED_USEARCH -threads $R1PAIRED_USEARCH_THREADS -usearch_global "test_${RX}.fasta.${i}" -db "test_${RX}.otus.final.result.fasta" -strand plus -id 0.97 -uc "test_${RX}.uc.${i}" > "test_${RX}.uc.${i}.log"
		cat "test_${RX}.uc.${i}" >> "test_${RX}.uc"
	done
}

#used patricio's output

sub_make_biom(){
# Parses the usearch hits of one set into OTU clusters, builds the BIOM table
# and, when a `biom` command is available, converts it to a text table.
#   $1 - set label: R1, R2, paired
# Globals read: R1PAIRED_PYTHON, R1PAIRED_SOURCE_DIR, R1PAIRED_QIIME
# Reads:        test_<set>.otus.final.result.fasta, test_<set>.uc,
#               test_<set>.otus2.nonprobs.GG99.wang.taxonomy, ../../mapping.txt
# Writes (cwd): test_<set>.otus.txt, test_<set>.failures.txt, test_<set>.biom,
#               test_<set>.biom.table (only when biom is available)
	local RX=$1 #RX=R1, R2, paired
	local ver
	echo "sub_make_biom: "$RX" "$(date +%T)
#parse the OTU clusters
#checked file size, correct
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_parse_otu_clusters.py" "test_${RX}.otus.final.result.fasta" "test_${RX}.uc" "test_${RX}.otus.txt" "test_${RX}.failures.txt"
#output=*.otus.txt, *.failures.txt


#finally, make the biom tables
	echo "Making OTU tables..."
#checked file size, correct
	$R1PAIRED_PYTHON "${R1PAIRED_SOURCE_DIR}/external/tornado_make_biom_table.py" "test_${RX}.otus.txt" ../../mapping.txt "test_${RX}.otus2.nonprobs.GG99.wang.taxonomy" "test_${RX}.biom"

	# Sourcing the QIIME activation script is what may put `biom` in reach.
	if [ -f "${R1PAIRED_QIIME}/activate.sh" ];then
		source "${R1PAIRED_QIIME}/activate.sh"
	fi
	# command -v is a shell builtin: unlike `which` it needs no external
	# program and also recognises functions and aliases named biom.
	if ! command -v biom >/dev/null 2>&1; then
		echo "WARNING: ${R1PAIRED_QIIME}/activate.sh does not exist, test_${RX}.biom.table cannot be generated, please check the QIIME path in tool.info."
	else
		rm -f -- "test_${RX}.biom.table"
		# The convert flag differs between biom 1.x and 2.x, so select it by
		# looking for "version 2" in the version banner.
		ver=$(biom --version 2>/dev/null | grep -o "version 2")
		if [ "$ver" == "" ];then
			biom convert -i "test_${RX}.biom" -o "test_${RX}.biom.table" -b --header-key taxonomy --output-metadata-id "taxonomy" # for biom ver 1.x
		else
			biom convert -i "test_${RX}.biom" -o "test_${RX}.biom.table" --to-tsv --header-key taxonomy --output-metadata-id "taxonomy" # for biom ver 2.x
		fi
	fi
}

# ---------------------------------------------------------------------------
# Main pipeline. Runs in the workspace directory entered above; each step is
# one of the sub_* functions defined earlier in this file.
# ---------------------------------------------------------------------------
#Filter with Trimmomatic
rm -f -- samtofastq.log trimmomatic.log
# sub_QC takes its input path from the global $x. Iterate over the glob itself
# instead of parsing `ls`, so paths containing blanks are not split apart.
for x in "${R1PAIRED_WORK_DIR}"/SOFTLINKS/*.*
do
	# An unmatched glob stays literal; skip it (dangling links are still
	# passed on, as before).
	[ -e "$x" ] || [ -L "$x" ] || continue
	#this loop cannot in a function, or too many files open
	sub_QC
done
# Summarise the Trimmomatic log as one tab-separated row per sample.
echo -e "Sample_ID\tTotal_input_reads\tQC_passed_reads (%)" > QC.log.txt
awk 'NR%3>0' trimmomatic.log | awk '{if(NR%2==0){split(pre,a," ");print a[7]" "$0};pre=$0}' | sed 's/.trimmomatic.fastq Input Reads: /\t/' | sed 's/ Surviving: /\t/' | sed 's/ Dropped: /\t/' | cut -f 1,2,3 >> QC.log.txt

# Read type 0 runs the R1 steps only; every other value also runs the R2 and
# paired steps. Decide once so each stage below reads as a plain list.
if [ "$R1PAIRED_READ_TYPE" -eq 0 ]; then
	with_pairs=0
else
	with_pairs=1
fi

sub_merge
if [ "$with_pairs" -eq 1 ]; then
	sub_concatenate_read_pairs
fi

sub_process_OTU R1 "$read_r1_length"
if [ "$with_pairs" -eq 1 ]; then
	sub_process_OTU R2 "$read_r2_length"
	sub_process_OTU paired "$read_paired_length"
fi

sub_get_taxonomy_representatives R1
if [ "$with_pairs" -eq 1 ]; then
	sub_get_taxonomy_representatives R2
	sub_get_taxonomy_representatives paired
	sub_rename_to_OTU_ID
fi

sub_process_to_align_STK R1
if [ "$with_pairs" -eq 1 ]; then
	sub_process_to_align_STK R2
	sub_process_to_align_STK paired.R1
	sub_process_to_align_STK paired.R2
	# A paired OTU is bad when either of its mates aligned badly.
	cat bad_align_paired.R1.accnos bad_align_paired.R2.accnos | sort -u > bad_align_paired.accnos
fi

### these are fast steps
sub_remove_bad_from_OTU R1
if [ "$with_pairs" -eq 1 ]; then
	sub_remove_bad_from_OTU R2
	sub_remove_bad_from_OTU paired
fi

### these are fast steps
sub_remove_bad_from_alignment R1
if [ "$with_pairs" -eq 1 ]; then
	sub_remove_bad_from_alignment R2
	sub_remove_bad_from_alignment paired.R1
	sub_remove_bad_from_alignment paired.R2
	sub_join_paired_alignment
fi

sub_do_FastTree R1
if [ "$with_pairs" -eq 1 ]; then
	sub_do_FastTree R2
	sub_do_FastTree paired
fi

sub_make_biom R1
if [ "$with_pairs" -eq 1 ]; then
	sub_make_biom R2
	sub_make_biom paired
fi

cd ../../
echo "End of IMtornado"$(date +%T)

