#!/bin/bash

echo "start R1Paired calculation..."

# Positional arguments:
#   $1 - work directory; ${workDir}/workspace must already exist because the
#        output directory is created inside it with a plain mkdir
#   $2 - read length (kept in READ_LENGTH for the minimum-length filter that is
#        applied to the dereplicated single reads later in this script)
# A missing argument is a usage error, so exit non-zero: a bare `exit` would
# report the status of the preceding echo, i.e. success.
workDir=$1
if [ -z "${workDir}" ]; then
	echo "please set work directory"
	exit 1
fi

READ_LENGTH=$2
if [ -z "${READ_LENGTH}" ]; then
	echo "please set read length (i.e. 250)"
	exit 1
fi

# Always start from an empty output directory. Every later step uses paths
# relative to it (and ../imtornado), so stop here if it cannot be recreated or
# entered instead of running the pipeline in the caller's current directory.
r1paired_dir="${workDir}/workspace/R1Paired"
if [ -e "${r1paired_dir}" ]; then
	rm -r -- "${r1paired_dir}" || { echo "ERROR: cannot remove ${r1paired_dir}" >&2; exit 1; }
fi
mkdir -- "${r1paired_dir}" || { echo "ERROR: cannot create ${r1paired_dir}" >&2; exit 1; }
cd -- "${r1paired_dir}" || { echo "ERROR: cannot enter ${r1paired_dir}" >&2; exit 1; }
# Aliases defined further down in this script are only expanded in a
# non-interactive shell when this option is on.
shopt -s expand_aliases
echo "step1:get R1 of paired OTU representatives(use paired OTU fasta names including OTU ID). replace paired read to R1 in test_paired.otus.final.result.fasta"

# Print one ID per header line of the paired OTU fasta: strip '>', turn spaces
# into tabs, cut the line at the first ';', collapse everything between the
# first tab and the last '=' and keep the second tab-separated field.
# Presumably that field is the representative read ID - confirm against the
# imtornado header format.
get_read_ID() {
	grep ">" ../imtornado/test_paired.otus.final.result.fasta \
		| tr -d '>' \
		| tr ' ' '\t' \
		| sed 's/;.*// ; s/\t.*=/\t/' \
		| cut -f 2
}

# Write to stdout the R1 reads whose IDs are listed by get_read_ID.
pick_reads() {
	$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py \
		<(get_read_ID) \
		../imtornado/test_R1.fasta \
		/dev/stdout
}

# Filter: join the sequence lines of every fasta record read from stdin into a
# single line and drop empty lines, so each record is exactly header + sequence.
flatten_fasta() {
	awk '
		index($0, ">") == 1 { print seq; print; seq = ""; next }
		{ seq = seq "" $0 }
		END { print seq }
	' | sed '/^$/d'
}

# Put the flattened picked R1 reads (column 1) next to the flattened paired OTU
# records (column 2), line by line.
paste_fasta() {
	paste \
		<(pick_reads | flatten_fasta) \
		<(flatten_fasta < ../imtornado/test_paired.otus.final.result.fasta)
}

# Keep the paired OTU header (column 2) but the R1 sequence (column 1).
paste_fasta \
	| awk -F"\t" 'index($0, ">") == 1 { print $2; next } { print $1 }' \
	> test_R1Paired_otus.final.fasta

echo "step2:get R1 reads of single(bad R2) and failed paired(i.e. non Bacteria, chimera) reads,to make sure R1 of the mapped paired reads must be excluded"

# Print, for every "H" record of the paired .uc file, column 9 truncated at the
# first space (the ID of a paired read that mapped to an OTU).
get_paired_mapped_fasta_ID() {
	grep -P "^H\t" ../imtornado/test_paired.uc | cut -f 9 | cut -d" " -f 1
}

# Keep only the R1 reads that have an "H" record in the R1 .uc file.
#added Sept/19/2017
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py \
	<(cat ../imtornado/test_R1.uc | grep -P "^H\t" | cut -f 9) \
	../imtornado/test_R1.fasta \
	test_R1_good.fasta

# From those, remove every read whose paired version already mapped.
get_paired_mapped_fasta_ID \
	| $R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py \
		/dev/stdin \
		test_R1_good.fasta \
		test_R1Single.fasta

echo "step3:get R1 single reads (that mapped to bacteria but not in mapped paired reads)"
echo "first, map R1 single reads from step2 to R1 paired OTU"

# Split test_R1Single.fasta into chunk files test_R1Single.fasta.1, .2, ...
# holding at most 100000 records each; fidx receives the index of the last
# chunk. The redirection target is parenthesised because an unparenthesised
# concatenation after '>' is a syntax error in some awk implementations, and
# each finished chunk is closed so open files do not accumulate.
fidx=$(cat test_R1Single.fasta | awk '
	BEGIN { fidx = 1; cnt = 0 }
	{
		if (index($0, ">") == 1) { cnt++ }
		if (cnt > 100000) {
			close("test_R1Single.fasta." fidx)
			cnt = 1
			fidx++
		}
		print > ("test_R1Single.fasta." fidx)
	}
	END { print fidx }
')

# Search every chunk against the R1 paired OTUs and collect only the hit ("H")
# records of all chunks in test_R1Single2Paired.uc.
[ -f "test_R1Single2Paired.uc" ] && rm test_R1Single2Paired.uc
for ((i = 1; i <= fidx; i++)); do
	$R1PAIRED_USEARCH -threads $R1PAIRED_USEARCH_THREADS \
		-usearch_global "test_R1Single.fasta.$i" \
		-db test_R1Paired_otus.final.fasta \
		-strand plus -id 0.97 \
		-uc "test_R1Single2Paired.uc.$i" \
		> "test_R1Single2Paired.uc.$i.log"
	cat "test_R1Single2Paired.uc.$i" | grep -P "^H\t" >> test_R1Single2Paired.uc
done

#second, get unmapped reads
# Print column 9 (truncated at the first space) of every record in
# test_R1Single2Paired.uc, i.e. the IDs of single reads that hit a paired OTU.
get_single_map2paired_read_ID() {
	cut -f 9 test_R1Single2Paired.uc | cut -d" " -f 1
}

# Remove those reads from the single-read set; what is left did not map.
get_single_map2paired_read_ID \
	| $R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py \
		/dev/stdin \
		test_R1Single.fasta \
		test_R1Single2Paired_unmapped.fasta

###get R1 OTUs from the single reads left over by step3
#third, get R1 OTU from single R1 (step3,second)
# Former single-command variant, kept for reference:
#$USEARCH64 -derep_fulllength test_R1Single2Paired_unmapped.fasta -output test_R1Single_derep.fasta -sizeout -minseqlength $READ_LENGTH

# Dereplicate first, then apply the minimum length in awk: a record is printed
# only when the summed length of its sequence lines is at least READ_LENGTH.
$R1PAIRED_USEARCH \
	-derep_fulllength test_R1Single2Paired_unmapped.fasta \
	-fastaout test_R1Single_derep.no.minlen.fasta \
	-sizeout
awk -v minlen="$READ_LENGTH" '
	index($0, ">") == 1 {
		if (len >= minlen) { print fas }
		len = 0
		fas = $0
		next
	}
	{
		len = len + length($0)
		fas = fas "\n" $0
	}
	END { if (len >= minlen) { print fas } }
' test_R1Single_derep.no.minlen.fasta > test_R1Single_derep.fasta

# Reference-based chimera removal (only support strand-plus), then keep
# clusters of size >= 2 sorted by size.
$R1PAIRED_USEARCH \
	-uchime_ref test_R1Single_derep.fasta \
	-db ${R1PAIRED_SOURCE_DIR}/external/database/gold.fa \
	-uchimeout unmapped.uchime \
	-nonchimeras test_R1Single_nonchimeras.fasta \
	-strand plus
$R1PAIRED_USEARCH \
	-sortbysize test_R1Single_nonchimeras.fasta \
	-fastaout test_R1Single_sorted.fasta \
	-minsize 2 \
	> test_R1Single_sorted.log

# New OTU numbering continues after the last header of the R1 paired OTU fasta
# (first space-separated token of that header, without '>', plus one).
last_paired_otu_id=$(grep ">" test_R1Paired_otus.final.fasta | tail -n 1 | cut -d" " -f 1 | sed 's/>//')
start_index=$(echo "${last_paired_otu_id} + 1" | bc)

$R1PAIRED_USEARCH \
	-cluster_otus test_R1Single_sorted.fasta \
	-otus test_R1Single_sorted.raw.name.otus.fasta
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_otu_renamer.py \
	test_R1Single_sorted.raw.name.otus.fasta \
	test_R1Single_otus.fasta \
	$start_index

#fourth, get OTUs (include aligned/stk format for fast tree) that belong to bacteria taxonomy and match 16S secondary structure (STK)
# Classify the single-read OTUs against GG99 with mothur (probs=false) and
# rename the resulting taxonomy file so its name records the "nonprobs" mode.
gg99_db="${R1PAIRED_SOURCE_DIR}/external/database"
mothur_batch="#set.logfile(name=test_R1Single_otus.nonprobs.log);classify.seqs(fasta=test_R1Single_otus.fasta, taxonomy=${gg99_db}/GG99.taxonomy, template=${gg99_db}/GG99.fna, probs=false,processors=$R1PAIRED_MOTHUR_THREADS)"
$R1PAIRED_MOTHUR "$mothur_batch" > /dev/null
mv test_R1Single_otus.GG99.wang.taxonomy test_R1Single_otus.nonprobs.GG99.wang.taxonomy

# Earlier, disabled filter: keep k__Bacteria and k__Archaea, and bootstrap values must be more than 20 for bacteria and 70 for archaea out of 100
#cat  test_R1Single_otus.nonprobs.GG99.wang.taxonomy | awk '{split($2,a,";");split(a[1],b,"[()]");if(!(b[1]=="k__Bacteria"&&b[2]>20||b[1]=="k__Archaea"&&b[2]>70)){print $1}}' > bad_taxonomy.accnos
#$PYTHON $BIN/tornado_read_remover.py bad_taxonomy.accnos test_R1Single_otus.fasta test_R1Single_otus.bac.tax.fasta
# Active filter: drop every OTU whose taxonomy line contains no "k__" at all.
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_remover.py \
	<(grep -v "k__" test_R1Single_otus.nonprobs.GG99.wang.taxonomy | cut -f 1) \
	test_R1Single_otus.fasta \
	test_R1Single_otus.bac.tax.fasta

# Nothing left to add to the paired OTUs: report it and end the script here.
bac_tax_line_count=$(cat test_R1Single_otus.bac.tax.fasta | wc -l)
if [ $bac_tax_line_count -eq 0 ]; then
	echo "test_R1Single_otus.bac.tax.fasta has no sequences. R2 quality is very good!"
	exit
fi
# Align the R1 paired OTUs and the R1 single OTUs in ONE cmalign run, filter the
# single OTUs by alignment score, and write the gap-trimmed alignment used
# further down for tree building.
#this is tricky, when do cmalign, must put R1 of paired and R1 of non-paired together, otherwise tornado_remove_gaps.py cannot merge sequences from different cmalign!!!why?

# Paired OTUs come first in the combined file; the head/tail split performed
# later in this script relies on that order.
cat test_R1Paired_otus.final.fasta test_R1Single_otus.bac.tax.fasta > test_R1PairedSingle_otus.bac.tax.fasta

# Writes the alignment to the .stk file, the per-sequence score table to the
# .stk.scores file (--sfile) and cmalign's stdout to the .stk.log file.
$R1PAIRED_CMALIGN --cpu 1 -g --notrunc --sub --dnaout --noprob --sfile test_R1PairedSingle_otus.bac.tax.stk.scores -o test_R1PairedSingle_otus.bac.tax.stk ${R1PAIRED_SOURCE_DIR}/external/database/seed.16s.reference_model.cm test_R1PairedSingle_otus.bac.tax.fasta > test_R1PairedSingle_otus.bac.tax.stk.log
# Convert the Stockholm alignment to fasta with one sequence line per record.
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_stk2fasta.py test_R1PairedSingle_otus.bac.tax.stk /dev/stdout | flatten_fasta > test_R1PairedSingle_otus.bac.tax.stk.fasta
#get R1Paired OTU(all these should be high score) and R1Single high score OTU
# The alias body is double-quoted, so $start_index is substituted when the alias
# is DEFINED. It drops every score line containing '#', then prints column 2:
# always when column 2 < start_index (a paired OTU), and for column 2 >=
# start_index (a single OTU) only when column 7 is >= 0.
# NOTE(review): column 7 is presumably the cmalign bit score - confirm against
# the --sfile format of the cmalign version in use.
alias get_stk_high_score="sed '/#/d' test_R1PairedSingle_otus.bac.tax.stk.scores | awk -v start_index=$start_index '{if(\$2>=$start_index){if(\$7>=0)print \$2}else{print \$2}}'"
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py <(get_stk_high_score) test_R1PairedSingle_otus.bac.tax.stk.fasta test_R1PairedSingle_otus.clean.fasta
# After gap removal, turn '.' into '-', upper-case the text and flatten again so
# every record is exactly two lines (header + sequence).
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_remove_gaps.py test_R1PairedSingle_otus.clean.fasta /dev/stdout | tr '.' '-' | tr 'a-z' 'A-Z' | flatten_fasta > test_R1PairedSingle_otus.fasttree.fasta


# Split the combined alignment back into its paired part (top) and single part
# (bottom). Every record of the flattened alignment is two lines, so the paired
# part is the first 2 * <number of paired OTUs> lines.
#
# The line count is read with `wc -l < file`: parsing `wc -l file` with
# cut -d" " -f1 yields an empty string on systems whose wc pads the number with
# leading blanks. Arithmetic expansion also normalises any such padding.
# (R1RairedSingle_lines is a historical misspelling; the name is kept because
# the diagnostic output below prints it.)
R1Paired_OTUs=$(grep -c ">" test_R1Paired_otus.final.fasta)
R1Paired_lines=$((2 * R1Paired_OTUs))
R1RairedSingle_lines=$(($(wc -l < test_R1PairedSingle_otus.fasttree.fasta)))
R1Single_lines=$((R1RairedSingle_lines - R1Paired_lines))

echo "OTU start_index (number of identical reads, size>=2) ="$start_index
echo "R1Paired_lines=$R1Paired_lines"
echo "R1RairedSingle_lines=$R1RairedSingle_lines"
echo "R1Single_lines=$R1Single_lines"

# A negative count means the combined alignment holds fewer lines than the
# paired OTUs alone would need; `tail -n` with a negative number would then
# return unrelated lines, so warn and produce an empty single part instead.
if [ "$R1Single_lines" -lt 0 ]; then
	echo "WARNING: test_R1PairedSingle_otus.fasttree.fasta has fewer lines ($R1RairedSingle_lines) than expected for the paired OTUs ($R1Paired_lines)" >&2
	R1Single_lines=0
fi

head -n "$R1Paired_lines" test_R1PairedSingle_otus.fasttree.fasta > test_R1Paired_otus.fasttree.fasta
tail -n "$R1Single_lines" test_R1PairedSingle_otus.fasttree.fasta > test_R1Single_otus.fasttree.fasta

###this is tedious, but it keeps the fasta IDs with their original names
# Pick, from the unaligned single OTU fasta, the records whose IDs survived in
# the single part of the alignment.
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py \
	<(cat test_R1Single_otus.fasttree.fasta | grep "^>" | sed 's/^>//') \
	test_R1Single_otus.bac.tax.fasta \
	test_R1Single_otus.final.fasta

# Split the unmapped single reads into chunk files
# test_R1Single2Paired_unmapped.fasta.1, .2, ... of at most 100000 records
# each; fidx receives the index of the last chunk. The redirection target is
# parenthesised because an unparenthesised concatenation after '>' is a syntax
# error in some awk implementations, and each finished chunk is closed so open
# files do not accumulate.
fidx=$(cat test_R1Single2Paired_unmapped.fasta | awk '
	BEGIN { fidx = 1; cnt = 0 }
	{
		if (index($0, ">") == 1) { cnt++ }
		if (cnt > 100000) {
			close("test_R1Single2Paired_unmapped.fasta." fidx)
			cnt = 1
			fidx++
		}
		print > ("test_R1Single2Paired_unmapped.fasta." fidx)
	}
	END { print fidx }
')

# Search every chunk against the final single OTUs. All record types are
# appended here; only "H" records are kept when the .uc files are merged later
# in this script.
[ -f "test_R1Single2Single.uc" ] && rm test_R1Single2Single.uc
for ((i = 1; i <= fidx; i++)); do
	$R1PAIRED_USEARCH -threads $R1PAIRED_USEARCH_THREADS \
		-usearch_global "test_R1Single2Paired_unmapped.fasta.$i" \
		-db test_R1Single_otus.final.fasta \
		-strand plus -id 0.97 \
		-uc "test_R1Single2Paired_unmapped.uc.$i" \
		> "test_R1Single2Paired_unmapped.uc.$i.log"
	cat "test_R1Single2Paired_unmapped.uc.$i" >> test_R1Single2Single.uc
done

#merge fasta, OTU, uc and tax
# Paired reads that mapped to an OTU. mapped=de novo mapped, this is tricky!
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_read_picker.py \
	<(get_paired_mapped_fasta_ID) \
	../imtornado/test_paired.fasta \
	test_paired_mapped.fasta
# removed on purpose: cat test_paired_mapped.fasta test_R1Single.fasta > test_PairedSingle.fasta # this includes bad reads, differ from test_PairedSingle.uc!

# OTU sequences: paired OTUs followed by the single-read OTUs.
cat \
	../imtornado/test_paired.otus.final.result.fasta \
	test_R1Single_otus.final.fasta \
	> test_PairedSingle_otus.final.fasta

# Read-to-OTU assignments: only the "H" records of the three .uc files.
grep -h -P "^H\t" \
	../imtornado/test_paired.uc \
	test_R1Single2Paired.uc \
	test_R1Single2Single.uc \
	> test_PairedSingle.uc

# Taxonomy: paired OTUs followed by the single-read OTUs.
cat \
	../imtornado/test_paired.otus2.nonprobs.GG99.wang.taxonomy \
	test_R1Single_otus.nonprobs.GG99.wang.taxonomy \
	> test_PairedSingle_otus.nonprobs.GG99.wang.taxonomy

# Build the alignment for the tree: R1 and R2 alignments of the paired OTUs are
# joined side by side, then the shorter single-read (R1 only) alignments are
# appended and right-padded with '-' to the same width.
#this part is tricky, I need to reformat the fasta
# Alias that prints the R2 alignment as emitted by SortFastaByID.jar.
# NOTE(review): the paste below only works if that output has the same record
# order and two-lines-per-record layout as test_R1Paired_otus.fasttree.fasta -
# confirm against the jar.
alias test_R2Paired_otus.fasttree.fasta="$R1PAIRED_JAVA -jar ${R1PAIRED_SOURCE_DIR}/scripts/SortFastaByID.jar ../imtornado/test_paired.R2.otus3.aligned.flat.fasta"
# Header lines: the pasted line is "<R1 header><TAB><R2 header>"; keeping the
# first half of it yields one header (assumes both headers are identical - TODO
# confirm). Sequence lines: the first two whitespace-separated fields (R1 and R2
# sequence) are concatenated.
paste test_R1Paired_otus.fasttree.fasta <(test_R2Paired_otus.fasttree.fasta) | awk '{if(index($0,">")==1){s=substr($0, 1,length($0)/2);print s}else{print $1$2}}' > test_long_R1_paired.otus.fasttree.fasta


alias paired_append_short_R1="cat test_long_R1_paired.otus.fasttree.fasta test_R1Single_otus.fasttree.fasta"
# Target width = length of the last line of the joined paired alignment.
aligned_length=`tail -1 test_long_R1_paired.otus.fasttree.fasta | awk '{print length($0)}'`
# Every non-header line shorter than the target width gets '-' appended until it
# reaches it; header lines and lines already that long pass through unchanged.
paired_append_short_R1 | awk -F"\t" -v len=$aligned_length '{gap="";if(index($0,">")!=1){for(i=0;i<(len-length($0));i++){gap=gap"-"}};print $0""gap}' > test_PairedSingle_otus.fasttree.fasta
###make biom
# Turn the merged .uc assignments into an OTU cluster table, then into a biom
# file annotated with the merged taxonomy and ../../mapping.txt.
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_parse_otu_clusters.py \
	test_PairedSingle_otus.final.fasta \
	test_PairedSingle.uc \
	test_PairedSingle.uc.txt \
	test_PairedSingle.uc.unmapped.ids
$R1PAIRED_PYTHON ${R1PAIRED_SOURCE_DIR}/external/tornado_make_biom_table.py \
	test_PairedSingle.uc.txt \
	../../mapping.txt \
	test_PairedSingle_otus.nonprobs.GG99.wang.taxonomy \
	test_PairedSingle.biom

# Optionally activate the QIIME environment, then convert the biom file to a
# tab-separated table if a `biom` command is available.
if [ -f "${R1PAIRED_QIIME}/activate.sh" ]; then
	source "${R1PAIRED_QIIME}/activate.sh"
fi

# `command -v` is a shell builtin: unlike the external `which` it is always
# present and also sees functions or aliases defined by the sourced script.
hasbiom=$(command -v biom 2>/dev/null)
if [ -z "${hasbiom}" ]; then
	if [ -f "${R1PAIRED_QIIME}/activate.sh" ]; then
		# activate.sh was sourced but did not provide biom: say so instead of
		# claiming that the file is missing.
		echo "WARNING: biom command not found after sourcing ${R1PAIRED_QIIME}/activate.sh, test_PairedSingle.biom.table cannot be generated, please check the QIIME path in tool.info."
	else
		echo "WARNING: ${R1PAIRED_QIIME}/activate.sh does not exist, test_PairedSingle.biom.table cannot be generated, please check the QIIME path in tool.info."
	fi
else
	[ -e "test_PairedSingle.biom.table" ] && rm test_PairedSingle.biom.table
	# The convert flags differ between biom 1.x (-b) and 2.x (--to-tsv); 2.x is
	# recognised by "version 2" in the --version output.
	ver=$(biom --version 2>/dev/null | grep -o "version 2")
	if [ -z "$ver" ]; then
		biom convert -i test_PairedSingle.biom -o test_PairedSingle.biom.table -b --header-key taxonomy --output-metadata-id "taxonomy" # for biom ver 1.x
	else
		biom convert -i test_PairedSingle.biom -o test_PairedSingle.biom.table --to-tsv --header-key taxonomy --output-metadata-id "taxonomy" # for biom ver 2.x
	fi
fi
# Build the tree from the combined alignment. ${R1PAIRED_FASTTREE_para} is left
# unquoted on purpose so that it can carry several options; stderr of the tree
# builder is kept in the .log file.
$R1PAIRED_FASTTREEMP ${R1PAIRED_FASTTREE_para} \
	-out test_PairedSingle.tree \
	test_PairedSingle_otus.fasttree.fasta \
	2> test_PairedSingle.tree.log

echo "End of R1Paired"

