#' Run the group-comparison microbiome pipeline end to end
#'
#' Sources the statistics code, loads (or builds) the unrarefied and rarefied
#' data/distance objects, restricts them to the samples of interest, and then
#' runs each analysis section in its own sub-directory of `resdir`:
#' Summary, Alpha_diversity, Beta_diversity, Taxa_diversity, Predictive_model,
#' Function_diversity and Subtype_analysis. All analysis functions called here
#' (`load_data`, `subset_data`, `perform_*`, `generate_*`, ...) are expected to
#' be provided by the sourced `stat.file` / `load_package()`.
#'
#' Side effects: writes `Data.RData` into `datdir` when the data are built from
#' raw files, writes `Data.wk.RData` and the section directories into `resdir`,
#' and changes the working directory while running (restored on exit).
#'
#' @param resdir Directory that receives all results; created if missing.
#' @param datdir Directory holding the input files; also the working directory
#'   while data are loaded.
#' @param codedir Directory containing `stat.file`.
#' @param stat.file R script that is `source()`d before anything else.
#' @param otu.file,map.file,tree.file Input file names (relative to `datdir`)
#'   passed to `load_data()` when `R.obj` is `NULL`.
#' @param seq.file Currently unused by this function.
#' @param ko.file,cog.file,ko.ann.file Optional file names (relative to
#'   `datdir`) passed to `load_data()`; `NULL` is passed through as `NULL`.
#' @param R.obj Name of an `.RData` file (resolved from `datdir`) holding
#'   `data.obj`, `data.obj.rff`, `dist.obj` and `dist.obj.rff`. If `NULL` the
#'   objects are built from the raw files and saved to `Data.RData`.
#' @param rm.var.ind Optional column indices to drop from the metadata.
#' @param num.var Optional names of metadata columns to convert to numeric; when
#'   given, every other metadata column is converted to a factor.
#' @param grp.name Metadata column used for sample selection and for the tests.
#' @param grp.name.c Metadata column used as grouping variable in the plots,
#'   the LEfSe steps and the random-forest model.
#' @param adj.name Covariate name(s) for the summary table and the tests.
#' @param subject Optional subject identifier; switches the alpha test to the
#'   `'lme'` model with a random intercept and is passed as `strata` to the
#'   PERMANOVA test and as `subject` to the taxa differential analysis.
#' @param strata Optional stratification variable passed to the plotting functions.
#' @param cluster.meta.info Metadata column(s) passed as `meta.info` to the
#'   heatmap and clustering plots.
#' @param grp.level.use Levels of `grp.name` to keep (also the factor level
#'   order). `NULL` keeps all non-missing levels.
#' @param selection Optional R expression, given as a string, evaluated inside
#'   the metadata to select samples.
#' @param filter.dep Minimum total read count for a sample to be kept when the
#'   data are built from raw files.
#' @param prev,minp Filtering thresholds forwarded to the plotting, testing and
#'   prediction functions.
#' @param rarefy.dep Depth passed to `load_data(rff=TRUE, dep=)` and to the
#'   alpha-diversity functions.
#' @param alpha.measure Currently unused by this function.
#' @param rarefy.iter Number of rarefaction iterations for the alpha analysis.
#' @param beta.measure,beta.rarefy,ord.method,omni.dist.name Currently unused
#'   by this function.
#' @param RF.taxa.level,nBoot,boruta.level Forwarded to `predictionRF()`.
#' @param diff.method Method for `perform_differential_analysis()`; also used
#'   to build the `ann` labels of the outputs.
#' @param transformation,normalization,winsor.qt Currently unused by this function.
#' @param mt.method,cutoff Multiple-testing method and cutoff forwarded to the
#'   differential-analysis, visualisation and LEfSe steps.
#' @param test.taxa.level Taxonomic levels tested for differential abundance.
#' @param test.kegg.level,test.cog.level Functional levels to analyse; `NULL`
#'   skips the corresponding part of the functional section.
#' @param vis.taxa.level Taxonomic levels shown in the differential-abundance plots.
#' @param cluster.method,cluster.dist.name,cluster.stat Forwarded to
#'   `perform_cluster_analysis()`.
#' @return The value returned by the final `perform_cluster_analysis()` call.
grp_cmp_func <- function (
		
		# Files and directory
		resdir="/data2/microbiome/jeff/hybriddenovo/mytest/workspace/R1PairedAnalysis", 
		datdir="/data2/microbiome/jeff/hybriddenovo/mytest/workspace/R1PairedAnalysis",
		codedir="/data2/microbiome/jeff/im_tornado/junR/", #"~/Dropbox/To_Stephen/Demo/Code", 
		stat.file='Stats.R',
		otu.file='test_paired.biom',
		map.file='mapping.txt',
		tree.file='test_paired.tree', 
		seq.file='test_paired_otus.final.fasta',
		ko.file=NULL,#'metagenome_predictions_ko.biom', 
		cog.file=NULL,#'metagenome_predictions_cog.biom',
		ko.ann.file=NULL,#'kegg.map.RData',
		R.obj='Data.RData',
		
		# Variable of interest, variable to adjust, level to use, etc
		rm.var.ind=NULL,
		num.var=NULL,
		
		grp.name="Sample_Type",
		grp.name.c="Sample_Type",
		adj.name='batch',
		subject=NULL,
		strata=NULL,
		cluster.meta.info="Sample_Type",
		
		grp.level.use=c('Breast', 'Skin_Tissue'),
		selection=NULL,

		# Filter criteria
		filter.dep=2000,
		prev=0.10,
		minp=0.002,
		
		# Rarefy criteria
		rarefy.dep=10000,
		
		# Alpha diversity parameter
		alpha.measure=c('Observed', 'Chao1', 'Shannon', 'InvSimpson'),
		rarefy.iter=5, 
		
		# Beta diversity parameter
		beta.measure=c('UniFrac', 'GUniFrac', 'WUniFrac', 'BC'),
		beta.rarefy=TRUE,
		ord.method='cmd',
		omni.dist.name=c('UniFrac', 'GUniFrac', 'WUniFrac', 'BC'),
		
		# Predictive model parameter
		RF.taxa.level='Genus',
		nBoot=100,
		boruta.level='Tentative',
		
		# Taxa and Function diversity parameter, Multiple test correction
		diff.method='perm',
		
		transformation='sqrt',
		normalization='GMPR',
		winsor.qt=0.95,
		mt.method='fdr',
		cutoff=0.1,
		test.taxa.level=c('Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'),
		test.kegg.level=c('KEGG_Metabolism'),
		test.cog.level=c('COG_Category2'),
		vis.taxa.level=c('Phylum', 'Family', 'Genus'),

		# Subtype discovery
		cluster.method='pam', 
		cluster.dist.name='UniFrac', 
		cluster.stat='gap'

) {
	cat("Load data ... \n")
	source(file.path(codedir, stat.file))
	load_package()
	
	# The pipeline hops between directories; restore the caller's working
	# directory on exit (also on error) instead of leaving it inside resdir.
	old.wd <- setwd(datdir)
	on.exit(setwd(old.wd), add=TRUE)
	
	# ---- Local helpers -------------------------------------------------------
	
	# Resolve a file name against datdir. file.path(datdir, NULL) returns
	# character(0) rather than NULL, so a NULL name is kept as NULL
	# (presumably load_data() checks its optional files with is.null()).
	in_datdir <- function(f) {
		if (is.null(f)) NULL else file.path(datdir, f)
	}
	
	# as.numeric() on a factor yields the internal level codes, not the values;
	# go through character first.
	to_numeric <- function(x) {
		if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
	}
	
	# Restrict one data/distance pair to the requested samples and normalise the
	# metadata column types. Returns list(data=, dist=).
	prepare_subset <- function(d.obj, d.dist) {
		# Trim the column names first so that grp.name is matched against clean names
		colnames(d.obj$meta.dat) <- gsub("^\\s+|\\s+$", "", colnames(d.obj$meta.dat))
		meta <- d.obj$meta.dat
		if (!(grp.name %in% colnames(meta))) {
			stop("Grouping variable '", grp.name, "' not found in the metadata", call.=FALSE)
		}
		
		ids <- rownames(meta)
		if (!is.null(selection)) {
			# NOTE: 'selection' is evaluated as R code against the metadata;
			# only pass trusted expressions.
			ids <- ids[eval(parse(text=selection), envir=meta)]
		}
		# Samples with a missing group are always dropped
		grp.ok <- !is.na(meta[, grp.name])
		if (!is.null(grp.level.use)) {
			grp.ok <- grp.ok & meta[, grp.name] %in% grp.level.use
		}
		samIDs <- intersect(ids, rownames(meta)[grp.ok])
		if (length(samIDs) == 0) {
			stop("No samples left after applying 'grp.level.use'/'selection'", call.=FALSE)
		}
		
		d.obj <- subset_data(d.obj, samIDs)
		d.dist <- subset_dist(d.dist, samIDs)
		meta <- d.obj$meta.dat
		
		if (!is.null(rm.var.ind)) {
			# drop=FALSE keeps a data frame even when a single column remains
			meta <- meta[, -rm.var.ind, drop=FALSE]
		}
		
		if (grp.name %in% num.var) {
			meta[, grp.name] <- to_numeric(meta[, grp.name])
		} else if (is.null(grp.level.use)) {
			# factor(x, levels=NULL) would turn every value into NA
			meta[, grp.name] <- factor(meta[, grp.name])
		} else {
			meta[, grp.name] <- factor(meta[, grp.name], levels=grp.level.use)
		}
		
		if (!is.null(num.var)) {
			# Everything not declared numeric is treated as categorical
			for (v in setdiff(colnames(meta), num.var)) {
				meta[, v] <- factor(meta[, v])
			}
			for (v in num.var) {
				meta[, v] <- to_numeric(meta[, v])
			}
		}
		
		d.obj$meta.dat <- meta
		list(data=d.obj, dist=d.dist)
	}
	
	# ---- Load or build the data objects --------------------------------------
	if (is.null(R.obj)) {
		
		cat("Load OTU file...\n")
		otu.path <- in_datdir(otu.file)
		map.path <- in_datdir(map.file)
		tree.path <- in_datdir(tree.file)
		ko.path <- in_datdir(ko.file)
		cog.path <- in_datdir(cog.file)
		ko.ann.path <- in_datdir(ko.ann.file)
		
		data.obj <- load_data(otu.path, map.path, tree.path, ko.file=ko.path, cog.file=cog.path, ko.ann.file=ko.ann.path, meta.sep='\t')
		deep.enough <- colSums(data.obj$otu.tab) >= filter.dep
		ind <- rownames(data.obj$meta.dat)[deep.enough]
		
		if (any(!deep.enough)) {
			cat(rownames(data.obj$meta.dat)[!deep.enough], 'excluded due to insufficient reads!\n')
		}
		data.obj <- subset_data(data.obj, ind)
		dist.obj <- construct_distance(data.obj)
		
		data.obj.rff <- load_data(otu.path, map.path, tree.path, rff=TRUE, dep=rarefy.dep, ko.file=ko.path, cog.file=cog.path, ko.ann.file=ko.ann.path, meta.sep='\t')
		dist.obj.rff <- construct_distance(data.obj.rff)
		
		# Cache the objects so that later runs can pass R.obj='Data.RData'
		save(data.obj, data.obj.rff, dist.obj, dist.obj.rff, file='Data.RData')
		
	} else {
		# load() must be called here (not in a helper) so the objects land in
		# this function's environment.
		loaded <- load(R.obj)
		absent <- setdiff(c('data.obj', 'data.obj.rff', 'dist.obj', 'dist.obj.rff'), loaded)
		if (length(absent) > 0) {
			stop("'", R.obj, "' does not contain: ", paste(absent, collapse=', '), call.=FALSE)
		}
	}
	
	# ---- Extract the relevant data set ---------------------------------------
	cat('Subset unrarefied version ...\n')
	wk <- prepare_subset(data.obj, dist.obj)
	data.obj <- wk$data
	dist.obj <- wk$dist
	
	cat('Subset rarefied version ...\n')
	wk.rff <- prepare_subset(data.obj.rff, dist.obj.rff)
	data.obj.rff <- wk.rff$data
	dist.obj.rff <- wk.rff$dist
	
	phylo.obj <- phyloseq(otu_table(data.obj$otu.tab, taxa_are_rows=TRUE), phy_tree(data.obj$tree), 
			tax_table(data.obj$otu.name), sample_data(data.obj$meta.dat))
	phylo.obj.rff <- phyloseq(otu_table(data.obj.rff$otu.tab, taxa_are_rows=TRUE), phy_tree(data.obj.rff$tree), 
			tax_table(data.obj.rff$otu.name), sample_data(data.obj.rff$meta.dat))
	
	dir.create(resdir, showWarnings=FALSE, recursive=TRUE)
	setwd(resdir)
	res.root <- getwd()
	save(data.obj, dist.obj, data.obj.rff, dist.obj.rff, phylo.obj, phylo.obj.rff, file="Data.wk.RData")
	
	# Create (if needed) and enter the output directory of one analysis section.
	# An absolute path is used so a section never depends on where the previous
	# one left the working directory.
	enter_section <- function(section) {
		out.dir <- file.path(res.root, section)
		dir.create(out.dir, showWarnings=FALSE)
		setwd(out.dir)
	}
	
	theme_set(theme_bw(base_size=16))
	
	# ---- 0. Summary statistics, sequence statistics, overall profile ---------
	enter_section('Summary')
	cat("---Calculating summary statistics  ...\n")
	if (is.null(adj.name)) {
		# Without covariates the formula "<grp> ~ " cannot be parsed
		cat("No adjustment variable given, summary table skipped\n")
	} else {
		tab <- tableby(as.formula(paste0(grp.name, ' ~ ',  paste(adj.name, collapse='+'))), data=data.obj$meta.dat)
		save(tab, file='SummaryStat.RData')
	}

	cat("---Calculating sequence statistics  ...\n")
	perform_sequence_stat_analysis(data.obj, grp.name)
	
	cat("---Generate taxnomical profiles  ...\n")
	generate_taxa_heatmap(data.obj, meta.info=cluster.meta.info, prev=prev, minp=minp)
	generate_stacked_barplot(data.obj, grp.name=grp.name.c)
	# Progress markers around the currently disabled box/bar plots
	cat("test\n")
	#generate_taxa_boxplot(data.obj, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp)
	cat("test2\n")
	#generate_taxa_boxplot(data.obj, grp.name=grp.name.c, scale='binary', strata=strata, prev=prev, minp=minp)
	cat("test3\n")
	generate_taxa_barplot(data.obj, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp, taxa.levels=c('Order'))
	cat("test4\n")
	#generate_taxa_barplot_aggregate(data.obj, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp)
	
	# ---- 1. Alpha diversity analysis -----------------------------------------
	enter_section('Alpha_diversity')
	cat("---Alpha diversity analysis ...\n")
	generate_rarefy_curve(data.obj, phylo.obj, grp.name.c, depth=rarefy.dep, iter.no=rarefy.iter)
	generate_alpha_boxplot(data.obj, phylo.obj, depth=rarefy.dep, grp.name=grp.name.c, strata=strata)
	if (is.null(subject)) {
		perform_alpha_test(data.obj, phylo.obj, depth=rarefy.dep, iter.no=rarefy.iter, grp.name=grp.name, adj.name=adj.name)
	} else {
		# Repeated measures: random intercept per subject
		perform_alpha_test(data.obj, phylo.obj,  depth=rarefy.dep, iter.no=rarefy.iter, grp.name=grp.name, adj.name=adj.name,
				 model='lme', random = as.formula(paste0(' ~ 1 | ', subject)))
	}
	
	# ---- 2. Beta diversity analysis ------------------------------------------
	enter_section('Beta_diversity')
	cat("---Beta diversity analysis ...\n")
	generate_ordination(data.obj.rff, dist.obj.rff, grp.name=grp.name.c, strata=strata)
	generate_clustering(data.obj.rff, dist.obj.rff, meta.info=cluster.meta.info, is.labRow=TRUE)
	
	generate_distance_barplot(data.obj.rff, dist.obj.rff, grp=grp.name.c, within=TRUE, strata=strata)
	generate_distance_boxplot(data.obj.rff, dist.obj.rff, grp=grp.name.c, within=TRUE, strata=strata)
	
	#perform_distance_comp_test(data.obj.rff, dist.obj.rff, grp.name=grp.name)
	perform_permanova_test(data.obj.rff, dist.obj.rff, grp.name=grp.name, adj.name=adj.name, strata=subject)
	perform_mirkat_test(data.obj.rff, dist.obj.rff, grp.name=grp.name, adj.name=adj.name)    # Could not handle correlation
	
	perform_betadisper_test(data.obj.rff, dist.obj.rff, grp.name=grp.name)
	
	# ---- 3. Taxa diversity analysis ------------------------------------------
	# Using rarefied or unrarefied depend on the test
	enter_section('Taxa_diversity')
	cat("---Taxa diversity analysis ...\n")
	cat('--Differential abundance analysis ...\n')
	set.seed(123)
	diff.obj.rff <- perform_differential_analysis(data.obj.rff, grp.name=grp.name, adj.name=adj.name, taxa.levels=test.taxa.level,
			method=diff.method, mt.method=mt.method, subject=subject,
			cutoff=cutoff, prev=prev, minp=minp, ann=diff.method)
	
	cat('\n--Visualize differential abundance analysis ...\n')
	visualize_differential_analysis(data.obj.rff, diff.obj.rff, grp.name=grp.name.c, taxa.levels=vis.taxa.level, 
			mt.method=mt.method, cutoff=cutoff, ann=diff.method)
	
	cat('--Create LefSE format ...\n')
	create_lefse_format(data.obj.rff, diff.obj.rff, grp.name=grp.name.c, cutoff=cutoff, prev=prev, minp=minp, mt.method=mt.method)
	perform_lefse_analysis(data.obj.rff, grp.name=grp.name.c)
	
	# ---- 4. Predictive modeling ----------------------------------------------
	enter_section('Predictive_model')
	cat('---Predictive modelling ...\n')
	# NOTE(review): 'boruta.leve' looks like a truncated 'boruta.level' and would
	# only work through partial argument matching -- confirm against the formals
	# of predictionRF() before renaming.
	predictionRF(data.obj,  resp.name=grp.name.c, taxa.level=RF.taxa.level, boruta.leve=boruta.level, B=nBoot, prev=prev, minp=minp)
	
	# ---- 5. Function diversity analysis --------------------------------------
	enter_section('Function_diversity')
	cat('---Functional level analysis ...\n')
	# NOTE(review): unlike the taxa analysis above, the functional differential
	# tests are called without subject= -- confirm whether that is intended.
	if (!is.null(test.kegg.level)) {
		
		generate_taxa_boxplot(data.obj.rff, grp.name=grp.name.c, strata=strata, rm.outlier=FALSE, prev=prev, minp=minp, 
				taxa.levels=c(test.kegg.level), ann='KEGG')
		generate_taxa_barplot(data.obj.rff, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp,
				taxa.levels=c(test.kegg.level), ann='KEGG')
#		generate_taxa_barplot_aggregate(data.obj.rff, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp,
#				taxa.levels=c(test.kegg.level), ann='KEGG')
		diff.obj.rff <- perform_differential_analysis(data.obj.rff, grp.name=grp.name, adj.name=adj.name,  method=diff.method, 
				mt.method=mt.method, prev=prev, minp=minp, cutoff=cutoff, taxa.levels=c(test.kegg.level), ann=paste0('KEGG_', diff.method))
		visualize_differential_analysis(data.obj.rff, diff.obj.rff, grp.name=grp.name.c, cutoff=cutoff, taxa.levels=c(test.kegg.level), 
				ann='KEGG', scale='none', mt.method=mt.method)
	}
	
	if (!is.null(test.cog.level)) {
		generate_taxa_boxplot(data.obj.rff, grp.name=grp.name.c, strata=strata, rm.outlier=FALSE, prev=prev, minp=minp,
				taxa.levels=c(test.cog.level), ann='COG')
		generate_taxa_barplot(data.obj.rff, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp,
				taxa.levels=c(test.cog.level), ann='COG')
#		generate_taxa_barplot_aggregate(data.obj.rff, grp.name=grp.name.c, strata=strata, prev=prev, minp=minp,
#				taxa.levels=c(test.cog.level), ann='COG')
		diff.obj.rff <- perform_differential_analysis(data.obj.rff, grp.name=grp.name, adj.name=adj.name,  method=diff.method, mt.method=mt.method, prev=prev, minp=minp,
				cutoff=cutoff, taxa.levels=c(test.cog.level), ann=paste0('COG_', diff.method))
		visualize_differential_analysis(data.obj.rff, diff.obj.rff, grp.name=grp.name.c, cutoff=cutoff, taxa.levels=c(test.cog.level), 
				ann='COG', scale='none', mt.method=mt.method)
	}
	
	# ---- 6. Subtype analysis -------------------------------------------------
	enter_section('Subtype_analysis')
	cat('---Subtype analysis ...\n')
	# Last expression: its value is the return value of the pipeline
	perform_cluster_analysis(data.obj, dist.obj, dist.name=cluster.dist.name, method=cluster.method, stat=cluster.stat, 
			grp.name=grp.name, adj.name=adj.name) 
}


# Script entry point: run the whole pipeline with the default arguments
# declared in the signature of grp_cmp_func (no overrides are passed here).
grp_cmp_func()
