# these functions are work in porogress 
# they are provided only as a record of the procedures used
# the author makes no claim as to their accuracy or suitability for any other work
# the license allows you to make any use vyou like of them, but at your own risk
# you can access them bu pasting this into the R console
# the required library should be installed first...
 
library(NormqPCR)

selhknf <-function (adf,genes,facts,groupby,nrefs=4,comblim=length(genes),stablim=0,vdtoo=F) 
# License LGPL-3 ... RShowDoc("LGPL-3")
# Author: John Young, Pirbright Institute, UK
# return a list of ranked data frames with gene names and stability
# from 1 gene to a number limited by any of nref, comblim, or stablim
# uses stabMeasureRho from required package NormqPCR
# adf		= data frame from raw2log2
# genes		= column indices in adf for input genes; char or int.
# facts		= column indices for experimental factors (currently ignored)
# groupby	= column indices for defining experimental groups; char or int
# 			    multiple comlumns will be concatenated
# nrefs		= maximum selected pool size
# comblim	= maximum combination size (what's the diffrence?)
# stablim	= maximum individual stability score to be allowed in a combination
# vdtoo		= whether to return intra and inter group stability (for plot)
#
# returns a list ...
#    $set1 .. $setN are data frames for combinations of 1..N genes, with columns ...
#        g1 .. gN = names of genes in combo
#        stab     = stability score rho (lower is better)
#        intra    = rho component due to intra-group variance    (if vdtoo)
#        inter    = rho component due to inter-group differences (if vdtoo)
#    $old is a data frame with result of NormqPCR ranking, where only combinations
#        containing already identified most stable gene is considered.
#
# The mathematics is (supposed to be) that described by Andersen et al. (1) in the 
# original Normfinder paper for combinations of more than two genes, and depends
# on the implementation of the Normfinder algorithm in NormqPCR (2)   
# (1) Andersen C.L. et al. (2004) Cancer Research. 2004 (64): 5245-5250
# (2) Perkins J.R. et al.  (2012) BMC Genomics  13:296+
{
  if(is.character(genes)){ # turn into integer index
    genix<-NULL
    for(g in genes)genix<-c(genix,grep(genes[g],colnames(adf)))
  }
  else genix <- genes
  if(is.character(groupby)){ # turn into integer index
    grpix<-NULL
    for(g in groupby)grpix<-c(grpix,grep(groupby[g],colnames(adf)))
  }
  else grpix<-groupby
  amat<-as.matrix(adf[genix]) 
  ngen<-ncol(amat)
  nsamp<-nrow(amat)
  if(ngen<3)stop("needs at least 3 genes")
  if(nrefs>=ngen) nrefs<-ngen-1
  if(nrefs<2) nrefs<-2 # to avoid error with no explanation
  gnames<-colnames(adf)[genix]
  grps<-as.factor(apply(as.matrix(adf[,grpix]),1,paste0,collapse="."))
  ngrp<-length(unique(grps))
  if(length(gnames)!=ngen) stop(paste("Gene names has wrong length",length(gnames),"not",ngen))
  if(length(grps)!=nsamp) stop(paste("Groups has wrong length",length(grps),"not",nsamp))
  # get initial per gene rhos etc
  stab0<-stabMeasureRho(amat,group=grps,log=T,na.rm=T,returnAll=T)
  stablist<-list()
  stablist$set1<-data.frame(g1=names(stab0$rho),stab=stab0$rho)
  if(vdtoo){ # return intra and inter group components too
    stablist$set1<-cbind(stablist$set1,intra=apply(sqrt(stab0$v),1,mean))
    stablist$set1<-cbind(stablist$set1,inter=apply(abs(stab0$d),1,mean))
  }
  staborder<-order(stablist$set1$stab)   # get everything into stability order
  stablist$set1<-stablist$set1[staborder,]
  stab0$d<-stab0$d[staborder,]
  stab0$v<-stab0$v[staborder,]
  gnames<-as.character(stablist$set1$g1)
  ngg<-min(ngen,comblim,ifelse(stablim<=0,ngen,sum(stablist$set1$stab<=stablim)))
  if(ngg<2)ngg<-2 # to avoid error with no explanation!
  # now for old sellectHKs 
  old<-data.frame(rank=character(0),rho=double(0))
  old<-rbind(old,stablist$set1[1,1:2])
  # back to main stream
  for(nhk in 2:min(nrefs,ngg)){ # now do all combinations of up to nrefs
    combos<-t(combn(ngg,nhk))
    ncombs<-nrow(combos)
    stabs<-rep(0,ncombs)
    if(vdtoo){vars<-stabs;difs<-stabs}
    for(combo in 1:ncombs){
      xdif<-apply(stab0$d[combos[combo,],],2,mean)*sqrt(ngen/(ngen-nhk))
      xvar<-apply(stab0$v[combos[combo,],],2,mean)/nhk
      stabs[combo]<-mean(abs(xdif)+sqrt(xvar))
      if(vdtoo){vars[combo]<-mean(sqrt(xvar));difs[combo]<-mean(abs(xdif))}
    }
    combonames<-gnames[combos]
    dim(combonames)<-dim(combos)
    for(arow in 1:nrow(combonames))combonames[arow,]<-combonames[arow,][order(combonames[arow,])]
    # that is supposed to ensure order of gene names is always the same for any combo
    colnames(combonames)<-paste0(rep("g",nhk),1:nhk)
    stabs<-data.frame(combonames,stab=stabs)
    if(vdtoo)stabs<-cbind(stabs,intra=vars,inter=difs)
    aref<-paste0("set",nhk)
    stablist[[aref]]<-stabs[order(stabs[,nhk+1]),]
    # old selectHKs does only combos with already selected genes
    oldix<-rep(T,nrow(stablist[[aref]]))
    for(oix in 1:nrow(old)){
      for(six in 1:length(oldix)){
        if(!(as.matrix(old$g1[oix]) %in% as.matrix(stablist[[aref]])[six,1:nhk]))oldix[six]<-F
       }
    }
    if(any(oldix)){
      astab<-stablist[[aref]][oldix,][1,nhk+1]
      acomb<-as.matrix(stablist[[aref]][oldix,][1,1:nhk])
      old<-rbind(old,data.frame(g1=acomb[!(acomb%in%old$g1)],stab=astab))
    }
  }
  rownames(old)[1]<-"1"
  stablist$old<-old
  stablist
}

selhknfdrop <-function(adf,genes,facts,groupby,nrefs,nelim=length(genes)-nrefs-1,
         comblim=length(genes),stablim=0,vdtoo=F)
# License LGPL-3 ... RShowDoc("LGPL-3")
# Author: John Young, Pirbright Institute, UK
# outer loop of normfinder wrapper procedure
# stewise elimination of worst genes for successive selhknf calls
#
# adf 		=	data frame from raw2aslog2, columns are factors then genes
# genes		=	column indices of genes to use, integer or character
# facts		=	column indices of experimental factors, int or char
# groupby	=	column indices of factors defining experimental groups
# nrefs		=	maximum numbers of selected reference genes
# nelim		=	number of lowest stability genes to eliminate
# comblim	=	max size of combos to consider
# vdtoo		=	return intra and inter group stability as well as sum
# 
# returns a list of selhknf results for each successive gene dropped
# names are those of the last dropped gene starting with "none"
# See comments in selhknf for contents of each element
{
  if(is.character(genes)){  # convert char indices to int
    genix<-NULL
    for(g in genes)genix<-c(genix,grep(genes[g],colnames(adf)))
  }
  else genix <- genes
  ngen<-length(genix)
  gnames<-colnames(adf)[genix]
  res<-list()
  res$none<-selhknf(adf,genix,facts,groupby,nrefs,comblim,stablim,vdtoo)
  dropord<-rep(0,ngen)
  for(i in 1:ngen)dropord[i]<-grep(res$none$set1[ngen-i+1,1],gnames)
  for(i in 2:(nelim+1)){
    genix[dropord[i-1]]<-NA
    res[[gnames[dropord[i-1]]]]<-selhknf(adf,genix[!is.na(genix)],facts,groupby,nrefs,comblim,stablim,vdtoo)
  }
  res$dropped<-c("none",gnames[dropord[1:(nelim-1)]])
  res$call<-sys.call()
  res$date<-date()
  res
}

shplot <-
function(sh,ndrop=4,nstep=4,ncomb=10,edges=T)
# License: LGPL-3
# Author: John Young, Pirbright Institute, UK
# plot graphs (a bit) like genorm for selhknfxdrop output visualisation
#   sh is output of selhknfxdrop(adf,...) where adf is raw2aslog2 output.
#   ndrop   is number of drop cycles to display (including $none)
#   nstep   is number of steps per drop cycle to display
#   ncomb   is maximum number of combinations per graph
#   edges   is whether to include the right and bottom edge graphs
#
# Output is a set of overlayed bar and line graphs. There should be ndrop
# rows of nstep graphs per row, plus one extra row and one extra per row
# if edges is TRUE (the default). Each row is one cycle of the outer loop
# where one least stable gene is dropped from the input per cycle. Each
# graph in the shows selected gene combinations with increasing numbers of
# genes, ordered from best to worst stability. The bars indicate the overall
# stability measure of combinations. The intra-group variance component is
# orange and the inter-group difference component is yellow. The blue points
# show the standard deviation across samples of the difference in normalisation 
# factors between the combinations compared (bars in the genorm graphs). The 
# genes in the combinations are indicated by coloured squares under the bars,
# which can be decoded in a key included if edges is true. 
#
# If edges is true, the extra graphs at the right show comparison of the first
# combination for each increase in reference pool size. The extra graphs at the
# bottom compare, for each pool size, the first combination with each successive
# dropping of the least stable gene. Bars and points have the same meaning as
# in the non-marginal graphs. See the annotated illustration.
# 
# Note that this function is experimental and has not been properly validated.
# any use should cite these publications
#
# Andersen C.L., Ledet-Jensen J., Ørntoft T. (2004) 64, 5245-5250. (Normfinder)
# Perkins J.R., Dawes J.M., McMahon S.B., Bennett D.L.H., Orengo C. and Kohl M. 
# (2012) BMC Genomics 13:296. (NormqPCR)
#
#
{
# get the source data for sd(deltaNF) calculations
# need all the data to get graph scales
  adf<-eval(sh$call[[2]])
  genes<-eval(sh$call[[3]])
  gnames<-colnames(adf[,genes])
  nsamples<-nrow(adf)
  gcols<-c("red","orange2","green3","skyblue2","wheat3","pink3",
           "khaki2","magenta","grey85","seagreen","yellow2","steelblue")
  nfdata<-list()
  for(dr in 1:ndrop){
    nfdata[[dr]]<-list()
    for(st in 1:nstep){
      if(st %in% grep("^set",names(sh[[dr]]))){ # strange, needed cos $old at end 
        nfs<-NULL
        cbm<-min(ncomb,nrow(sh[[dr]][[st]]))   
        if(cbm>0)for(cb in 1:cbm){
          gix<-NULL
          for(g in 1:st)gix<-c(gix,grep(sh[[dr]][[st]][cb,g],colnames(adf)))
          anf<-apply(as.matrix(adf[,gix]),1,mean,na.rm=T) # norm factors
          nfs<-cbind(nfs,anf) 
        } #cb
        nfdata[[dr]][[st]]<-nfs
      } 
      else nfdata[[dr]][[st]]<-matrix(rep(NA,nsamples),nrow=nsamples,ncol=1)
    } #st
  } #dr
  sddnfc<-rep(NA,(ncomb-1)*nstep*ndrop)
  dim(sddnfc)<-c(ncomb-1,nstep,ndrop)
  for(dr in 1:ndrop){
    for(st in 1:nstep){
      mcb<-min(ncol(nfdata[[dr]][[st]])-1,ncomb-1)
      if(mcb>0)for(cb in 1:mcb){
        sddnfc[cb,st,dr]<-sd(nfdata[[dr]][[st]][,cb]-nfdata[[dr]][[st]][,cb+1]) # ,na.rm=T)
      }
    }
  }
  sdclim<-c(floor(10*min(sddnfc,na.rm=T))/10,ceiling(10*max(sddnfc,na.rm=T))/10)
  sddnfs<-rep(NA,(nstep-1)*ndrop)
  dim(sddnfs)<-c(nstep-1,ndrop)
  for(dr in 1:ndrop){
    for(st in 1:(nstep-1)){
      sddnfs[st,dr]<-sd(nfdata[[dr]][[st]][,1]-nfdata[[dr]][[st+1]][,1]) # ,na.rm=T)
    }
  }
  sdslim<-c(floor(10*min(sddnfs,na.rm=T))/10,ceiling(10*max(sddnfs,na.rm=T))/10)
  sddnfd<-rep(NA,nstep*(ndrop-1))
  dim(sddnfd)<-c(nstep,ndrop-1)
  for(dr in 1:(ndrop-1)){
    for(st in 1:nstep){
      sddnfd[st,dr]<-sd(nfdata[[dr]][[st]][,1]-nfdata[[dr+1]][[st]][,1]) # ,na.rm=T)
    }
  }
  sddlim<-c(floor(10*min(sddnfd,na.rm=T))/10,ceiling(10*max(sddnfd,na.rm=T))/10)
  plimstab<-c(10,0)
  for(dr in 1:ndrop){
    for(st in grep("^set",names(sh[[dr]]))){
      arng<-range(apply(sh[[dr]][[st]][1:ncomb,st+(2:3)],1,sum,na.rm=T))
      if(arng[1]<plimstab[1])plimstab[1]<-arng[1]
      if(arng[2]>plimstab[2])plimstab[2]<-arng[2]
    }
  }

# stop("debug stop")
# now setup par
  par(mfrow=c(ndrop+edges,nstep+edges),mar=c(4,2,1,2.5),mex=0.6,oma=c(2,4,2,4))
  laboff<-ifelse(edges,3,4)*nstep/4
  keysq<-matrix(c(rep(-0.9,6),(0.7-(2:7)*1.2)/20,rep(-0.1,6),(0.7-(1:6)*1.2)/20),nrow=6,ncol=4)
  for(dr in 1:ndrop){
    for(st in 1:nstep){
      if(st %in% grep("^set",names(sh[[dr]]))){
        plot.new()
        plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="")
        for(c in 1:min(nrow(sh[[dr]][[st]]),ncomb)){		  
          rect(1.2*c-1,0,1.2*c,sh[[dr]][[st]][c,st+2],col="orange",lwd=0.5)
          rect(1.2*c-1,sh[[dr]][[st]][c,st+2],1.2*c,sh[[dr]][[st]][c,st+1],col="khaki",lwd=0.5)	  
		  gclrs<-as.vector(t(sh[[dr]][[st]][c,1:st]))
          gclrs<-gcols[gnames%in%gclrs]
          rect(1.2*c+keysq[1:st,1],keysq[1:st,2]*plimstab[2],
               1.2*c+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5)
          }
        }
        else barplot(as.matrix(rep(NA,ncomb*2),ncol=ncomb),ylim=c(0,plimstab[2]),axisnames=F,
                xlab="",ylab="",col=c("orange","khaki"),axes=F,lwd=0.5)
        scl<-0.95*plimstab[2]/sdclim[2]
		cho<-10/ceiling(2*plimstab[2])
        qq<-sddnfc[,st,dr]*scl
        lines(1:(ncomb-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5)
        lines(1:(ncomb-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5)
        tiks<-(0:round(cho*plimstab[2]))/cho
        axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA)
        chi<-10/ceiling(2*sdclim[2])
        tikl<-(0:round(chi*sdclim[2]))/chi
        tiks<-tikl*scl
        if(st==1){
		  mtext("stability",side=2,line=3,col="orange4",cex=0.8)
		  text(0,plimstab[2],paste("-",names(sh)[dr]),pos=4,xpd=NA)
		}
        if(st==nstep){
          axis(4,at=tiks,col="blue3",col.axis="blue3",labels=tikl,lwd=0.5,xpd=NA)
		  if(!edges)text(ncomb*1.2+ncomb/laboff,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA)
		}
        else axis(4,at=tiks,col="blue3",col.axis="blue3",labels=F,lwd=0.5,xpd=NA)
    }
    if(edges)
    {
      z<-rep(NA,2*max(ncomb,nstep))
      dim(z)<-c(2,max(ncomb,nstep))
      for(i in grep("^set",names(sh[[dr]])))z[1:2,i]<-as.vector(t(sh[[dr]][[i]][1,i+(2:3)]))
      plot.new()
      plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="")
      for(st in 1:nstep){
        rect(1.2*st-1,0,1.2*st,sh[[dr]][[st]][1,st+2],col="orange",lwd=0.5)
        rect(1.2*st-1,sh[[dr]][[st]][1,st+2],1.2*st,sh[[dr]][[st]][1,st+1],col="khaki",lwd=0.5)	  
        gclrs<-as.vector(t(sh[[dr]][[st]][1,1:st]))
        gclrs<-gcols[gnames%in%gclrs]
        rect(1.2*st+keysq[1:st,1],keysq[1:st,2]*plimstab[2],
             1.2*st+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5)
      }
      scl<-0.95*plimstab[2]/sdslim[2]
      qq<-sddnfs[1:(nstep-1),dr]*scl
      lines(1:(nstep-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5)
      lines(1:(nstep-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5)
      tiks<-(0:round(cho*plimstab[2]))/cho
      axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA)
      chi<-10/ceiling(2*sdslim[2])
      tikl<-(0:round(chi*sdslim[2]))/chi
      tiks<-tikl*scl
      loff<-sum(!is.na(z[1,]))+1.5
      if(st==nstep) axis(4,at=tiks,labels=tikl,col="blue3",col.axis="blue3",pos=loff,lwd=0.5)
      else axis(4,at=tiks,labels=F,col="blue3",col.axis="blue3",pos=loff,lwd=0.5,xpd=NA)
      text(loff+4,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA) #ncomb*1.2+ncomb/laboff
    }
  }
  if(edges){
    for(st in 1:(nstep)){  
      z<-rep(NA,2*ncomb)
      dim(z)<-c(2,ncomb)
      for(i in 1:ndrop)if(st %in% grep("^set",names(sh[[i]])))z[1:2,i]<-as.vector(t(sh[[i]][[st]][1,st+(2:3)]))
      plot.new()
      plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="")
      for(dr in 1:ndrop){
        rect(1.2*dr-1,0,1.2*dr,sh[[dr]][[st]][1,st+2],col="orange",lwd=0.5)
        rect(1.2*dr-1,sh[[dr]][[st]][1,st+2],1.2*dr,sh[[dr]][[st]][1,st+1],col="khaki",lwd=0.5)	  
        gclrs<-as.vector(t(sh[[dr]][[st]][1,1:st]))
        gclrs<-gcols[gnames%in%gclrs]
        rect(1.2*dr+keysq[1:st,1],keysq[1:st,2]*plimstab[2],
             1.2*dr+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5)
      }
      scl<-0.95*plimstab[2]/sddlim[2]
      qq<-sddnfd[st,1:(ndrop-1)]*scl
      lines(1:(ndrop-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5)
      lines(1:(ndrop-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5)
      tiks<-(0:round(cho*plimstab[2]))/cho
      axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA)
      chi<-10/ceiling(2*sddlim[2])
      tikl<-(0:round(chi*sddlim[2]))/chi
      tiks<-tikl*scl
      if(st==nstep) axis(4,at=tiks,labels=tikl,col="blue3",col.axis="blue3",lwd=0.5)
      else axis(4,at=tiks,labels=F,col="blue3",col.axis="blue3",lwd=0.5,xpd=NA)
      if(st==1)mtext("stability",side=2,line=3,col="orange4",cex=0.8)
      if(st==nstep)text(ncomb*1.2+2+ncomb/laboff,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA)
    }
    plot(1,1,xlim=c(0,20),ylim=c(12,0),type="n",bty="n",ann=F,xaxt="n",yaxt="n")
    rect(rep(5,12),0:11,rep(7,12),1:12,col=gcols,border="white",lwd=0.5)
    text(rep(6.5,12),0.5+(0:11),gnames,pos=4,cex=0.8)
  }
}