# these functions are work in porogress # they are provided only as a record of the procedures used # the author makes no claim as to their accuracy or suitability for any other work # the license allows you to make any use vyou like of them, but at your own risk # you can access them bu pasting this into the R console # the required library should be installed first... library(NormqPCR) selhknf <-function (adf,genes,facts,groupby,nrefs=4,comblim=length(genes),stablim=0,vdtoo=F) # License LGPL-3 ... RShowDoc("LGPL-3") # Author: John Young, Pirbright Institute, UK # return a list of ranked data frames with gene names and stability # from 1 gene to a number limited by any of nref, comblim, or stablim # uses stabMeasureRho from required package NormqPCR # adf = data frame from raw2log2 # genes = column indices in adf for input genes; char or int. # facts = column indices for experimental factors (currently ignored) # groupby = column indices for defining experimental groups; char or int # multiple comlumns will be concatenated # nrefs = maximum selected pool size # comblim = maximum combination size (what's the diffrence?) # stablim = maximum individual stability score to be allowed in a combination # vdtoo = whether to return intra and inter group stability (for plot) # # returns a list ... # $set1 .. $setN are data frames for combinations of 1..N genes, with columns ... # g1 .. gN = names of genes in combo # stab = stability score rho (lower is better) # intra = rho component due to intra-group variance (if vdtoo) # inter = rho component due to inter-group differences (if vdtoo) # $old is a data frame with result of NormqPCR ranking, where only combinations # containing already identified most stable gene is considered. # # The mathematics is (supposed to be) that described by Andersen et al. (1) in the # original Normfinder paper for combinations of more than two genes, and depends # on the implementation of the Normfinder algorithm in NormqPCR (2) # (1) Andersen C.L. et al. (2004) Cancer Research. 2004 (64): 5245-5250 # (2) Perkins J.R. et al. (2012) BMC Genomics 13:296+ { if(is.character(genes)){ # turn into integer index genix<-NULL for(g in genes)genix<-c(genix,grep(genes[g],colnames(adf))) } else genix <- genes if(is.character(groupby)){ # turn into integer index grpix<-NULL for(g in groupby)grpix<-c(grpix,grep(groupby[g],colnames(adf))) } else grpix<-groupby amat<-as.matrix(adf[genix]) ngen<-ncol(amat) nsamp<-nrow(amat) if(ngen<3)stop("needs at least 3 genes") if(nrefs>=ngen) nrefs<-ngen-1 if(nrefs<2) nrefs<-2 # to avoid error with no explanation gnames<-colnames(adf)[genix] grps<-as.factor(apply(as.matrix(adf[,grpix]),1,paste0,collapse=".")) ngrp<-length(unique(grps)) if(length(gnames)!=ngen) stop(paste("Gene names has wrong length",length(gnames),"not",ngen)) if(length(grps)!=nsamp) stop(paste("Groups has wrong length",length(grps),"not",nsamp)) # get initial per gene rhos etc stab0<-stabMeasureRho(amat,group=grps,log=T,na.rm=T,returnAll=T) stablist<-list() stablist$set1<-data.frame(g1=names(stab0$rho),stab=stab0$rho) if(vdtoo){ # return intra and inter group components too stablist$set1<-cbind(stablist$set1,intra=apply(sqrt(stab0$v),1,mean)) stablist$set1<-cbind(stablist$set1,inter=apply(abs(stab0$d),1,mean)) } staborder<-order(stablist$set1$stab) # get everything into stability order stablist$set1<-stablist$set1[staborder,] stab0$d<-stab0$d[staborder,] stab0$v<-stab0$v[staborder,] gnames<-as.character(stablist$set1$g1) ngg<-min(ngen,comblim,ifelse(stablim<=0,ngen,sum(stablist$set1$stab<=stablim))) if(ngg<2)ngg<-2 # to avoid error with no explanation! # now for old sellectHKs old<-data.frame(rank=character(0),rho=double(0)) old<-rbind(old,stablist$set1[1,1:2]) # back to main stream for(nhk in 2:min(nrefs,ngg)){ # now do all combinations of up to nrefs combos<-t(combn(ngg,nhk)) ncombs<-nrow(combos) stabs<-rep(0,ncombs) if(vdtoo){vars<-stabs;difs<-stabs} for(combo in 1:ncombs){ xdif<-apply(stab0$d[combos[combo,],],2,mean)*sqrt(ngen/(ngen-nhk)) xvar<-apply(stab0$v[combos[combo,],],2,mean)/nhk stabs[combo]<-mean(abs(xdif)+sqrt(xvar)) if(vdtoo){vars[combo]<-mean(sqrt(xvar));difs[combo]<-mean(abs(xdif))} } combonames<-gnames[combos] dim(combonames)<-dim(combos) for(arow in 1:nrow(combonames))combonames[arow,]<-combonames[arow,][order(combonames[arow,])] # that is supposed to ensure order of gene names is always the same for any combo colnames(combonames)<-paste0(rep("g",nhk),1:nhk) stabs<-data.frame(combonames,stab=stabs) if(vdtoo)stabs<-cbind(stabs,intra=vars,inter=difs) aref<-paste0("set",nhk) stablist[[aref]]<-stabs[order(stabs[,nhk+1]),] # old selectHKs does only combos with already selected genes oldix<-rep(T,nrow(stablist[[aref]])) for(oix in 1:nrow(old)){ for(six in 1:length(oldix)){ if(!(as.matrix(old$g1[oix]) %in% as.matrix(stablist[[aref]])[six,1:nhk]))oldix[six]<-F } } if(any(oldix)){ astab<-stablist[[aref]][oldix,][1,nhk+1] acomb<-as.matrix(stablist[[aref]][oldix,][1,1:nhk]) old<-rbind(old,data.frame(g1=acomb[!(acomb%in%old$g1)],stab=astab)) } } rownames(old)[1]<-"1" stablist$old<-old stablist } selhknfdrop <-function(adf,genes,facts,groupby,nrefs,nelim=length(genes)-nrefs-1, comblim=length(genes),stablim=0,vdtoo=F) # License LGPL-3 ... RShowDoc("LGPL-3") # Author: John Young, Pirbright Institute, UK # outer loop of normfinder wrapper procedure # stewise elimination of worst genes for successive selhknf calls # # adf = data frame from raw2aslog2, columns are factors then genes # genes = column indices of genes to use, integer or character # facts = column indices of experimental factors, int or char # groupby = column indices of factors defining experimental groups # nrefs = maximum numbers of selected reference genes # nelim = number of lowest stability genes to eliminate # comblim = max size of combos to consider # vdtoo = return intra and inter group stability as well as sum # # returns a list of selhknf results for each successive gene dropped # names are those of the last dropped gene starting with "none" # See comments in selhknf for contents of each element { if(is.character(genes)){ # convert char indices to int genix<-NULL for(g in genes)genix<-c(genix,grep(genes[g],colnames(adf))) } else genix <- genes ngen<-length(genix) gnames<-colnames(adf)[genix] res<-list() res$none<-selhknf(adf,genix,facts,groupby,nrefs,comblim,stablim,vdtoo) dropord<-rep(0,ngen) for(i in 1:ngen)dropord[i]<-grep(res$none$set1[ngen-i+1,1],gnames) for(i in 2:(nelim+1)){ genix[dropord[i-1]]<-NA res[[gnames[dropord[i-1]]]]<-selhknf(adf,genix[!is.na(genix)],facts,groupby,nrefs,comblim,stablim,vdtoo) } res$dropped<-c("none",gnames[dropord[1:(nelim-1)]]) res$call<-sys.call() res$date<-date() res } shplot <- function(sh,ndrop=4,nstep=4,ncomb=10,edges=T) # License: LGPL-3 # Author: John Young, Pirbright Institute, UK # plot graphs (a bit) like genorm for selhknfxdrop output visualisation # sh is output of selhknfxdrop(adf,...) where adf is raw2aslog2 output. # ndrop is number of drop cycles to display (including $none) # nstep is number of steps per drop cycle to display # ncomb is maximum number of combinations per graph # edges is whether to include the right and bottom edge graphs # # Output is a set of overlayed bar and line graphs. There should be ndrop # rows of nstep graphs per row, plus one extra row and one extra per row # if edges is TRUE (the default). Each row is one cycle of the outer loop # where one least stable gene is dropped from the input per cycle. Each # graph in the shows selected gene combinations with increasing numbers of # genes, ordered from best to worst stability. The bars indicate the overall # stability measure of combinations. The intra-group variance component is # orange and the inter-group difference component is yellow. The blue points # show the standard deviation across samples of the difference in normalisation # factors between the combinations compared (bars in the genorm graphs). The # genes in the combinations are indicated by coloured squares under the bars, # which can be decoded in a key included if edges is true. # # If edges is true, the extra graphs at the right show comparison of the first # combination for each increase in reference pool size. The extra graphs at the # bottom compare, for each pool size, the first combination with each successive # dropping of the least stable gene. Bars and points have the same meaning as # in the non-marginal graphs. See the annotated illustration. # # Note that this function is experimental and has not been properly validated. # any use should cite these publications # # Andersen C.L., Ledet-Jensen J., Ørntoft T. (2004) 64, 5245-5250. (Normfinder) # Perkins J.R., Dawes J.M., McMahon S.B., Bennett D.L.H., Orengo C. and Kohl M. # (2012) BMC Genomics 13:296. (NormqPCR) # # { # get the source data for sd(deltaNF) calculations # need all the data to get graph scales adf<-eval(sh$call[[2]]) genes<-eval(sh$call[[3]]) gnames<-colnames(adf[,genes]) nsamples<-nrow(adf) gcols<-c("red","orange2","green3","skyblue2","wheat3","pink3", "khaki2","magenta","grey85","seagreen","yellow2","steelblue") nfdata<-list() for(dr in 1:ndrop){ nfdata[[dr]]<-list() for(st in 1:nstep){ if(st %in% grep("^set",names(sh[[dr]]))){ # strange, needed cos $old at end nfs<-NULL cbm<-min(ncomb,nrow(sh[[dr]][[st]])) if(cbm>0)for(cb in 1:cbm){ gix<-NULL for(g in 1:st)gix<-c(gix,grep(sh[[dr]][[st]][cb,g],colnames(adf))) anf<-apply(as.matrix(adf[,gix]),1,mean,na.rm=T) # norm factors nfs<-cbind(nfs,anf) } #cb nfdata[[dr]][[st]]<-nfs } else nfdata[[dr]][[st]]<-matrix(rep(NA,nsamples),nrow=nsamples,ncol=1) } #st } #dr sddnfc<-rep(NA,(ncomb-1)*nstep*ndrop) dim(sddnfc)<-c(ncomb-1,nstep,ndrop) for(dr in 1:ndrop){ for(st in 1:nstep){ mcb<-min(ncol(nfdata[[dr]][[st]])-1,ncomb-1) if(mcb>0)for(cb in 1:mcb){ sddnfc[cb,st,dr]<-sd(nfdata[[dr]][[st]][,cb]-nfdata[[dr]][[st]][,cb+1]) # ,na.rm=T) } } } sdclim<-c(floor(10*min(sddnfc,na.rm=T))/10,ceiling(10*max(sddnfc,na.rm=T))/10) sddnfs<-rep(NA,(nstep-1)*ndrop) dim(sddnfs)<-c(nstep-1,ndrop) for(dr in 1:ndrop){ for(st in 1:(nstep-1)){ sddnfs[st,dr]<-sd(nfdata[[dr]][[st]][,1]-nfdata[[dr]][[st+1]][,1]) # ,na.rm=T) } } sdslim<-c(floor(10*min(sddnfs,na.rm=T))/10,ceiling(10*max(sddnfs,na.rm=T))/10) sddnfd<-rep(NA,nstep*(ndrop-1)) dim(sddnfd)<-c(nstep,ndrop-1) for(dr in 1:(ndrop-1)){ for(st in 1:nstep){ sddnfd[st,dr]<-sd(nfdata[[dr]][[st]][,1]-nfdata[[dr+1]][[st]][,1]) # ,na.rm=T) } } sddlim<-c(floor(10*min(sddnfd,na.rm=T))/10,ceiling(10*max(sddnfd,na.rm=T))/10) plimstab<-c(10,0) for(dr in 1:ndrop){ for(st in grep("^set",names(sh[[dr]]))){ arng<-range(apply(sh[[dr]][[st]][1:ncomb,st+(2:3)],1,sum,na.rm=T)) if(arng[1]plimstab[2])plimstab[2]<-arng[2] } } # stop("debug stop") # now setup par par(mfrow=c(ndrop+edges,nstep+edges),mar=c(4,2,1,2.5),mex=0.6,oma=c(2,4,2,4)) laboff<-ifelse(edges,3,4)*nstep/4 keysq<-matrix(c(rep(-0.9,6),(0.7-(2:7)*1.2)/20,rep(-0.1,6),(0.7-(1:6)*1.2)/20),nrow=6,ncol=4) for(dr in 1:ndrop){ for(st in 1:nstep){ if(st %in% grep("^set",names(sh[[dr]]))){ plot.new() plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="") for(c in 1:min(nrow(sh[[dr]][[st]]),ncomb)){ rect(1.2*c-1,0,1.2*c,sh[[dr]][[st]][c,st+2],col="orange",lwd=0.5) rect(1.2*c-1,sh[[dr]][[st]][c,st+2],1.2*c,sh[[dr]][[st]][c,st+1],col="khaki",lwd=0.5) gclrs<-as.vector(t(sh[[dr]][[st]][c,1:st])) gclrs<-gcols[gnames%in%gclrs] rect(1.2*c+keysq[1:st,1],keysq[1:st,2]*plimstab[2], 1.2*c+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5) } } else barplot(as.matrix(rep(NA,ncomb*2),ncol=ncomb),ylim=c(0,plimstab[2]),axisnames=F, xlab="",ylab="",col=c("orange","khaki"),axes=F,lwd=0.5) scl<-0.95*plimstab[2]/sdclim[2] cho<-10/ceiling(2*plimstab[2]) qq<-sddnfc[,st,dr]*scl lines(1:(ncomb-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5) lines(1:(ncomb-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5) tiks<-(0:round(cho*plimstab[2]))/cho axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA) chi<-10/ceiling(2*sdclim[2]) tikl<-(0:round(chi*sdclim[2]))/chi tiks<-tikl*scl if(st==1){ mtext("stability",side=2,line=3,col="orange4",cex=0.8) text(0,plimstab[2],paste("-",names(sh)[dr]),pos=4,xpd=NA) } if(st==nstep){ axis(4,at=tiks,col="blue3",col.axis="blue3",labels=tikl,lwd=0.5,xpd=NA) if(!edges)text(ncomb*1.2+ncomb/laboff,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA) } else axis(4,at=tiks,col="blue3",col.axis="blue3",labels=F,lwd=0.5,xpd=NA) } if(edges) { z<-rep(NA,2*max(ncomb,nstep)) dim(z)<-c(2,max(ncomb,nstep)) for(i in grep("^set",names(sh[[dr]])))z[1:2,i]<-as.vector(t(sh[[dr]][[i]][1,i+(2:3)])) plot.new() plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="") for(st in 1:nstep){ rect(1.2*st-1,0,1.2*st,sh[[dr]][[st]][1,st+2],col="orange",lwd=0.5) rect(1.2*st-1,sh[[dr]][[st]][1,st+2],1.2*st,sh[[dr]][[st]][1,st+1],col="khaki",lwd=0.5) gclrs<-as.vector(t(sh[[dr]][[st]][1,1:st])) gclrs<-gcols[gnames%in%gclrs] rect(1.2*st+keysq[1:st,1],keysq[1:st,2]*plimstab[2], 1.2*st+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5) } scl<-0.95*plimstab[2]/sdslim[2] qq<-sddnfs[1:(nstep-1),dr]*scl lines(1:(nstep-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5) lines(1:(nstep-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5) tiks<-(0:round(cho*plimstab[2]))/cho axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA) chi<-10/ceiling(2*sdslim[2]) tikl<-(0:round(chi*sdslim[2]))/chi tiks<-tikl*scl loff<-sum(!is.na(z[1,]))+1.5 if(st==nstep) axis(4,at=tiks,labels=tikl,col="blue3",col.axis="blue3",pos=loff,lwd=0.5) else axis(4,at=tiks,labels=F,col="blue3",col.axis="blue3",pos=loff,lwd=0.5,xpd=NA) text(loff+4,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA) #ncomb*1.2+ncomb/laboff } } if(edges){ for(st in 1:(nstep)){ z<-rep(NA,2*ncomb) dim(z)<-c(2,ncomb) for(i in 1:ndrop)if(st %in% grep("^set",names(sh[[i]])))z[1:2,i]<-as.vector(t(sh[[i]][[st]][1,st+(2:3)])) plot.new() plot.window(xlim=c(0,ncomb*1.2),ylim=c(0,plimstab[2]),xlab="",ylab="") for(dr in 1:ndrop){ rect(1.2*dr-1,0,1.2*dr,sh[[dr]][[st]][1,st+2],col="orange",lwd=0.5) rect(1.2*dr-1,sh[[dr]][[st]][1,st+2],1.2*dr,sh[[dr]][[st]][1,st+1],col="khaki",lwd=0.5) gclrs<-as.vector(t(sh[[dr]][[st]][1,1:st])) gclrs<-gcols[gnames%in%gclrs] rect(1.2*dr+keysq[1:st,1],keysq[1:st,2]*plimstab[2], 1.2*dr+keysq[1:st,3],keysq[1:st,4]*plimstab[2],col=gclrs,xpd=NA,border="white",lwd=0.5) } scl<-0.95*plimstab[2]/sddlim[2] qq<-sddnfd[st,1:(ndrop-1)]*scl lines(1:(ndrop-1)*1.2+0.1,qq,type="p",pch=19,col="blue3",lwd=0.5) lines(1:(ndrop-1)*1.2+0.1,qq,type="l",pch=19,col="blue3",lwd=0.5) tiks<-(0:round(cho*plimstab[2]))/cho axis(2,at=tiks,col="orange4",col.axis="orange4",labels=(st==1),lwd=0.5,xpd=NA) chi<-10/ceiling(2*sddlim[2]) tikl<-(0:round(chi*sddlim[2]))/chi tiks<-tikl*scl if(st==nstep) axis(4,at=tiks,labels=tikl,col="blue3",col.axis="blue3",lwd=0.5) else axis(4,at=tiks,labels=F,col="blue3",col.axis="blue3",lwd=0.5,xpd=NA) if(st==1)mtext("stability",side=2,line=3,col="orange4",cex=0.8) if(st==nstep)text(ncomb*1.2+2+ncomb/laboff,plimstab[2]/2,"sd(deltaNF)",srt=90,col="blue3",xpd=NA) } plot(1,1,xlim=c(0,20),ylim=c(12,0),type="n",bty="n",ann=F,xaxt="n",yaxt="n") rect(rep(5,12),0:11,rep(7,12),1:12,col=gcols,border="white",lwd=0.5) text(rep(6.5,12),0.5+(0:11),gnames,pos=4,cex=0.8) } }