# Correct GC-content bias in binned read counts and draw diagnostic plots.
#
# A smoothing spline of count ~ GC is fitted on the training bins, after
# trimming count outliers. Every bin with a non-missing GC value is then
# rescaled multiplicatively:
#     corrected = count / spline(gc) * median(counts)
# Bins whose GC is NA keep their original count.
#
# Arguments:
#   chr                 per-bin chromosome labels; the per-chromosome plots
#                       look for 'chr1'..'chr22', 'chrX' and 'chrY'.
#   loc                 per-bin positions; plotted as loc / 1e6 on an axis
#                       labelled "position (Mb)".
#   counts              per-bin read counts.
#   gc                  per-bin GC content (NA allowed).
#   train.autosome.idx  indices (into the vectors above) of the bins used to
#                       fit the spline. The name suggests autosomal bins, but
#                       this is not checked here.
#   sample.name         label used in plot titles and echoed in the result.
#   sspline.df          'df' passed to smooth.spline().
#   gc_plot_range       x-range of the GC scatter plots and of the drawn curve.
#   N_gc_interval       number of GC grid points used to draw the fitted curve.
#   robust.q            training bins with counts outside the
#                       [robust.q, 1 - robust.q] quantiles are left out of the
#                       fit.
#   ylim.q              per-chromosome plots use the [1 - ylim.q, ylim.q]
#                       quantiles of the plotted counts as y-limits.
#   plot_filename       file passed to pdf() when is.pdf is TRUE.
#   is.pdf              if TRUE, plots go to a PDF device that is opened here
#                       and closed on exit; otherwise they go to the current
#                       device.
#   plot.cex            point size of the per-chromosome plots.
#
# Returns a list with chr, loc, gc, o.count (original counts), c.count
# (corrected counts), median (median of the original counts), sample.name and
# sspline.res (the spline fitted on the uncorrected training bins).
correctGCbias <- function( chr, loc, counts, gc, train.autosome.idx, 
                         sample.name='sample_1', sspline.df=3, 
                        gc_plot_range=c(0.25,0.7), N_gc_interval=100,
                        robust.q=0.002,ylim.q=0.999,
                        plot_filename='',is.pdf=TRUE,plot.cex=0.6)
{
  # Kept for backward compatibility: attaches gplots when it is installed.
  # Nothing below visibly depends on it (smoothScatter lives in 'graphics').
  require(gplots)

  chr <- as.vector(chr)
  loc <- as.vector(loc)
  counts <- as.vector(counts)
  gc <- as.vector(gc)

  ori_median_counts <- median( counts, na.rm=TRUE )

  #=== select the training bins used for the spline fit
  train.counts <- counts[train.autosome.idx]
  train.gc.all <- gc[train.autosome.idx]
  robust_interval <- range(quantile( train.counts,
                                     probs=c(robust.q,1-robust.q),na.rm=TRUE))
  # smooth.spline() rejects missing/infinite inputs, so bins without a usable
  # GC value are dropped here together with the count outliers.
  sel_bin_idx <- which(  train.counts <= (robust_interval[2]) &
                         train.counts >= (robust_interval[1]) &
                         !is.na(train.counts) &
                         is.finite(train.gc.all) )
  train.sel.idx <- train.autosome.idx[sel_bin_idx]

  train.gc <- gc[train.sel.idx]
  train.count <- counts[train.sel.idx]

  sspline.res <- smooth.spline(x = train.gc, y = train.count, df = sspline.df)

  #=== multiplicative correction
  corrected.count <- counts
  GC.NAN.idx <-  which(!is.na(gc))
  GC.NAN.predict.count <- predict(sspline.res, gc[GC.NAN.idx])$y
  corrected.count[GC.NAN.idx] <- 
     ( counts[GC.NAN.idx] / GC.NAN.predict.count ) * ori_median_counts

  # A negative spline prediction would flip the sign of the ratio.
  corrected.count[which(corrected.count < 0)] <- 0

  #=== diagnostics: GC trend that is left after the correction
  gc.interval <- seq(gc_plot_range[1],gc_plot_range[2],length.out = N_gc_interval)
  max.y <- quantile(corrected.count,0.99,na.rm=TRUE)*3.5

  # Non-finite corrected counts (e.g. a predicted count of 0) would make
  # smooth.spline() fail, so they are left out of this diagnostic fit.
  cort.y <- corrected.count[train.sel.idx]
  cort.ok <- is.finite(cort.y)
  cort.sspline.res <- smooth.spline(x = train.gc[cort.ok], 
                                    y = cort.y[cort.ok], 
                                    df = sspline.df)

  if(is.pdf){
    pdf(plot_filename)
    pdf.dev <- dev.cur()
    # Close the device even when a plotting call below fails.
    on.exit(if (pdf.dev %in% dev.list()) dev.off(pdf.dev), add = TRUE)
  }

  # Density scatter of coverage vs GC over the training bins, with a fitted
  # spline drawn on top.
  draw.gc.panel <- function(y, fit, ylim, stage) {
    smoothScatter(gc[train.autosome.idx], y[train.autosome.idx],
                  xlim=gc_plot_range, ylim=ylim,
                  xlab='GC content',ylab='Coverage',
                  main=paste(paste0('GC-coverage (', stage, ' correction) \n'),
                             sample.name,''))
    lines(gc.interval, predict(fit, gc.interval)$y,
          col='red',lwd=2.5,type='l')
  }

  # Before/after coverage along one chromosome, stacked on a single page.
  # Chromosomes with no plottable bins are skipped: their quantile-based
  # y-limits are not finite and plot() would stop with an error.
  draw.chr.pair <- function(chr.name, pos.idx) {
    ylim.before <- quantile(counts[pos.idx],c(1-ylim.q,ylim.q),na.rm=TRUE)
    ylim.after <- quantile(corrected.count[pos.idx],c(1-ylim.q,ylim.q),na.rm=TRUE)
    if (!all(is.finite(c(ylim.before, ylim.after)))) {
      message('correctGCbias: no plottable bins for ', chr.name, ', plot skipped')
      return(invisible(FALSE))
    }
    par(mfrow=c(2,1))
    plot(   loc[pos.idx]/1e6,counts[pos.idx],cex=plot.cex,
            xlab='position (Mb)',ylab='read counts',
            main=paste(chr.name,'(before correction)'),
            ylim=ylim.before )
    abline(h=ori_median_counts,lwd=3,col='red',lty=3)
    plot(   loc[pos.idx]/1e6,corrected.count[pos.idx],cex=plot.cex,
            xlab='position (Mb)',ylab='read counts',
            main=paste(chr.name,'(after correction)'),
            ylim=ylim.after )
    abline(h=ori_median_counts,lwd=3,col='red',lty=3)
    invisible(TRUE)
  }

  #=== GC vs coverage, zoomed to the robust count interval
  draw.gc.panel(counts, sspline.res, robust_interval, 'before')
  draw.gc.panel(corrected.count, cort.sspline.res, robust_interval, 'after')

  #=== GC vs coverage, wide y-range
  draw.gc.panel(counts, sspline.res, c(0, max.y), 'before')
  draw.gc.panel(corrected.count, cort.sspline.res, c(0, max.y), 'after')

  #=== autosomes: only the training bins are shown
  for(chr.idx in 1:22)
  {
    tmp.chr <- paste0('chr',chr.idx)
    draw.chr.pair(tmp.chr, intersect( which(chr==tmp.chr), train.autosome.idx ))
  }

  #=== chr-X and chr-Y: all bins are shown
  for(tmp.chr in c('chrX', 'chrY'))
  {
    draw.chr.pair(tmp.chr, which(chr==tmp.chr))
  }

  par(mfrow=c(1,1))

  return( list(chr=chr,loc=loc, gc=gc,
               o.count=counts,
               c.count=corrected.count,
               median=ori_median_counts,
               sample.name=sample.name,
               sspline.res=sspline.res) )
  
}

