# mdclust - multidimensional clustering # (C) 2003 Martin Dugas # no warranty, free for academic use # # class discovery method for microarray analysis # calculates clusters of samples and associated genes # # Example: # library(multtest) # data(golub) # golub.mdc <- mdclust(golub) # search.mdclust(factor(golub.cl),golub.mdc) # # parameters: # data: matrix or dataframe, 1 row corresponds to 1 gene, 1 column to 1 sample # colnames and rownames can be used # s: number of selected genes # sort: sort output by "score", "cluster", "gene" or "no" sorting # fast=T/F: if true, analyse only top 1000 genes according to variance # plot=T/F: graphical output yes/no # # value: mdclust-object, consisting of: # $observations: vector of column labels # $genes: vector of differential genes # $scores: vector of scores # $scorelabel # $clusters: matrix of clusters corresponding to $genes (1: high expression, 0: low expression) # # additional functions: # plot.mdclust: graphical output of mdclust-objects # search.mdclust: search clusters within an mdclust-object, which fit best a certain data vector library(mva) library(exactRankTests) mdclust <- function(data, s=-1, sort="cluster", fast=F, plot=T) { # convert data frame to matrix if (is.data.frame(data)) data <- as.matrix(data) np <- max(dim(data)[2],0) if ( np <6 ) { cat("data matrix too small\n"); break } if ( s < 1 ) nmax <- np else nmax <- s ng <- max(dim(data)[1],0) if (length(rownames(data)) == 0 ) { rownames(data) <- 1:ng } cat (np,"samples",ng,"genes\n") # eliminate genes with variance 0 # eliminate NAs to avoid problems with kmeans var <- NULL for (i in 1:ng) { bool <- is.na(data[i,]) data[i,bool] <- mean(data[i,!bool],na.rm=T) var[i] <- var(data[i,],na.rm=T) if (is.na(var[i])) var[i] <- 0 } if (fast) { data<- data[order(var,decreasing=T)[1:1000],] ng <- max(dim(data)[1],0) cat(ng,"genes, ordered by variance\n") } else { data <- data[var>0,] ng <- max(dim(data)[1],0) cat(ng,"genes with variance > 0\n") } nmax <- min(nmax, ng) result <- NULL result$scorelabel <- "score" # initialize graphics output if (plot) initplot(np,nmax,rownames(data),colnames(data),result$scorelabel) if (length(colnames(data))==0) result$observations <- 1:np else result$observations <- colnames(data) for ( n in 1:nmax ) { # determine 2 clusters group <- NULL retry <- 0 try(group <- kmeans(t(data),2,100)$cluster - 1) # cluster to small while (retry < 100 && (sum(group==0) < 3 || sum(group==1) < 3 )) { retry <- retry + 1 select <- sample(1:dim(data)[1],sample(1:dim(data)[1],1) ) try(group <- kmeans(t(data[select,]),2,100)$cluster - 1) } if (retry>0) cat("cluster size too small - retry=",retry,"\n") if (sum(group==0) < 3 || sum(group==1) < 3) next # search most differential gene for this clustering tmax <- 0 best <- 1 for ( gene in 1:dim(data)[1] ) { x <- data[gene,][group==0] y <- data[gene,][group==1] try( t <- (mean(x)-mean(y))/sqrt((var(x)/length(x)) + (var(y)/length(y))) ) if (is.na(t)) next if (abs(t) >= tmax) {best <- gene ; tmax <- abs(t)} } # make sure that group 0 corresponds to low expression and group 1 to high expression g0 <- data[best,][group==0] g1 <- data[best,][group==1] if ( mean(g0) > mean(g1) ) group <- 1-group glabel <- rownames(data)[best] result$genes <- c(result$genes,glabel) result$scores <- c(result$scores,tmax) result$clusters <- c(result$clusters,group) if (plot) plotresult(n,nmax,np,glabel,tmax,group) # delete this gene from data matrix data <- data[-best,] } if (length(result) != 0) { result$clusters <- matrix(result$clusters,ncol=np,byrow=T) result <- sort.mdclust(result,sort,decreasing=T) if (plot) plot.mdclust(result) return(result) } } sort.mdclust <- function (mdclustobj, sort="cluster",decreasing=F) { genes <- mdclustobj$genes scores <- mdclustobj$scores scorelabel <- mdclustobj$scorelabel clusters <- mdclustobj$clusters obs <- mdclustobj$observations np <- length(obs) nmax <- length(genes) index <- 1:nmax if (sort == "score") index <- order(scores,decreasing=decreasing) if (sort == "gene") index <- order(genes,decreasing=decreasing) if (sort == "cluster") { ng <- length(genes) if (ng == 1) break list <- 1:ng index <- order(scores,decreasing=decreasing)[1] i <- index bool <- rep(T,ng) for (loop in 1:(ng-1)) { bool[i] <- F mindist <- ng for (k in list[bool]) { # search gene with minimum distance to gene i actdist <- dist(rbind(clusters[i,],clusters[k,]),method="binary") if (actdist < mindist) { mindist <- actdist; pos <- k } } index <- c(index,pos) i <- pos } } output <- NULL output$observations <- obs output$genes <- genes[index] output$scores <- scores[index] output$scorelabel <- scorelabel output$clusters <- clusters[index,] return(output) } plot.mdclust <- function (mdclustobj) { genes <- mdclustobj$genes scores <- mdclustobj$scores scorelabel <- mdclustobj$scorelabel clusters <- mdclustobj$clusters obs <- mdclustobj$observations np <- length(obs) nmax <- length(genes) initplot(np,nmax,genes,obs,scorelabel) for (i in 1:length(genes)) { plotresult(i,nmax,np,genes[i],scores[i],clusters[i,]) } } initplot <- function(np,nmax,rownames,colnames,scorelabel) { frame() par(mai=c(0.05,0.05,0.05,0.05)) if (length(rownames) == 0) x0 <- -0.4 else { gw <- max(sapply(rownames,strwidth,units="figure",cex=min(1,30/nmax))) + 0.01 gw <- min(0.4,gw) x0 <- min(-0.4, (-0.2 - gw) / (0.8 - gw)) } if (length(colnames) == 0) strw <- 0.01 else strw <- max(sapply(colnames,strwidth,units="figure",cex=min(1,30/np))) par(usr=c(x0*np,np,min(nmax-20,0),nmax+(0.05+strw)*(nmax-min(nmax-20,0)))) text(x0*np,nmax*1.02,"gene",pos=4) text(-0.2*(1-x0)*np,nmax*1.02,scorelabel,pos=4) for ( i in 1:np ) { label <- colnames[i] if (length(label) == 0) label <- i text(i-0.5, nmax*1.01, paste(label), pos=4, offset=0, srt=90, cex=min(1,30/np) ) } } plotresult <- function(n,nmax,np,gene,score,group) { left <- par("usr")[1] text(left,nmax-n+0.5,paste(gene),pos=4,cex=min(1,30/nmax)) text(-0.2*(np-left),nmax-n+0.5,paste(format(score,dig=5)),pos=4,cex=min(1,30/nmax)) for (poly in 1:np) { # high expression = white, low expression = black polygon(c(poly-1,poly-1,poly,poly),c(nmax-n+1,nmax-n,nmax-n,nmax-n+1),col=1-group[poly]) } } search.mdclust <- function (data,mdclustobj,nclust=10) { nclust <- min(nclust,length(mdclustobj$genes)) p <- NULL for (i in 1:length(mdclustobj$genes) ) { cluster <- mdclustobj$clusters[i,] if (is.factor(data)) { p[i] <- fisher.test(data,cluster)$p.value } else { p[i] <- wilcox.exact(data[cluster==1],data[cluster!=1])$p.value } } index <- order(p,-mdclustobj$scores)[1:nclust] result <- NULL result$observations <- data result$genes <- mdclustobj$genes[index] result$scores <- p[index] result$scorelabel <- "p-value" result$clusters <- matrix(mdclustobj$clusters[index,],ncol=length(data)) plot.mdclust(sort.mdclust(result,sort="cluster",decreasing=F)) return(result) }