## ## AVQ Cluster.R ## Scott Carlson, smcarlson@gmail.com ## Harvey Cohen Lab, Stanford University School of Medicine ## ## This file contains R code to perform sequential k-means clustering ## as described in the manuscript "Using Clustering to Unravel Correlations ## in Proteomic Data." The primary wrapper is AVQ.Cluster, which takes a ## data matrix of replicate values and returns the estimated noise for each ## feature, performs angle-vector quantization, and returns the clustered data. ## It also writes the clustered data as a text file, and writes a text file ## describing the contents of all the clusters. ## ## ## ## Copyright and liability: ## This code is property of the Harvey Cohen Lab at the Stanford University ## School of Medicine. It is available and free to use for all non-commercial ## purposes with correct citation. Commercial use requires express permission ## from Dr. Harvey Cohen, or a party authorized to act on his behalf. ## ## This code is provided as-is. Use of the code acknowledges that the authors ## and their affiliated institutions have no responsibility for any damages ## resulting from that use. ## # AVQ.Cluster # # This is the primary wrapper function. It takes input in one of two formats, either as a numeric matrix # or an input file. To use a matrix, pass values to the parameters "values", "identities", and "features". # The identities and features should be character vectors (they will be coerced regardless). Input files should # be formatted for read.table(), with one sample per column, feature names in the first column, sample names in the # 1st row, and sample classes in the 2nd row. A significant number of samples must be provided in replicate to # get a reasonable estimation of the measurement noise! # # Output is an R list with attributes "original", "noise.var", "centroids", "encoding", "likelihood", and "weights." These are # the original dataset with replicates averaged, the noise variance of measurements of each feature, the cluster averages, a # numeric vector assigning each feature to a cluster, the likelihood of the data at each clustering step (also shows the Bayes # Information Criterion), and a vector showing the fraction of total weight each feature contributes to its cluster. # # Two text files, specified by "output.data" and "output.clusters", will also be produced. "output.data" is a matrix of # clustered features, and "output.clusters" lists which features are assigned to each cluster. No output file will # be produced is the output path is left as NA. # AVQ.Cluster <- function(values=NA, identities=NA, features=NA, input.path=NA, output.data=NA, output.clusters=NA, verbose=FALSE) { ## Verify the input data ## # Load the data if none was provided if (is.na(values)[1]) { if (is.na(input.path)) { stop("Either 'values' or 'input.path' must be provided.") } data <- read.table(input.path, sep="\t", as.is=TRUE) features <- as.character(data[c(-1,-2),1]) identities <- as.character(data[1,-1]) values <- data[c(-1,-2),-1] values <- matrix(as.numeric(as.matrix(values)), ncol=(dim(values)[[2]])) } dimnames(values) <- list(features, identities) # Check that there are some technical replicates if ( all(duplicated(identities)==FALSE) ) stop( "There must be some duplicate measurements to estimate noise." ) # Check that numbers of identities and features match the dimensions of values if (length(identities) != dim(values)[[2]]) stop( paste("Columns of data do not match the number of sample identities (", dim(values)[[2]], "columns,", length(identities), "identities" , ")" ) ) if (length(features) != dim(values)[[1]]) stop( paste("Rows of data do not match the number of feature names (", dim(values)[[1]], "rows,", length(features), "features" , ")" ) ) ## Calculate the averaged data ## averaged <- computeAverages(values, identities) averaged$noise.var = averaged$noise.std^2 ## Run clustering ## # Variance has to be scaled down by the number of replicates replicate.factor <- mean( 1 / averaged$total.count[1,] ) clusters <- angleVectorQuantization(averaged$avg.peaks, averaged$noise.var * replicate.factor, verbose=verbose) ## Print a brief summary ## print( noquote( paste("Data contained", length(features) ,"features", "and", length(averaged$identities) ,"unique samples") ) ) print( noquote( paste("After clustering there are", max(clusters$encoding),"clusters") ) ) ## Write the output files ## if ( !is.na(output.data) ) { output.matrix <- clusters$centroids dimnames(output.matrix) <- list( averaged$identities, paste("Cluster",1:max(clusters$encoding)) ) write.table(output.matrix, output.data, sep="\t") } if ( !is.na(output.clusters) ) { # Cluster label column cluster.col <- unlist( sapply( 1:max(clusters$encoding), function(index, encoding){ c(index, rep("", sum(encoding==index)-1)) }, clusters$encoding ) ) # Cluster member column (sorted by weight) features.col <- unlist( sapply( 1:max(clusters$encoding), function(index, encoding, features, weights){ features[ index==encoding ][ order(-weights[encoding==index]) ] }, clusters$encoding, dimnames(averaged$avg.peaks)[[2]], clusters$weights ) ) # Feature weight column (sorted by weight) weight.col <- unlist( sapply( 1:max(clusters$encoding), function(index, encoding, weights){ weights[ index==encoding ][ order(-weights[encoding==index]) ] }, clusters$encoding, clusters$weights ) ) output.matrix <- cbind(cluster.col, features.col, weight.col) dimnames(output.matrix)[[1]] <- character(0) dimnames(output.matrix)[[2]] <- c("Cluster", "Features", "Weight") write.table( output.matrix, output.clusters, sep="\t" ) } return( c(list(original=averaged$avg.peaks, noise.var=averaged$noise.std^2), clusters) ) } ##################################################################### # Input parameters are: # Assume that there are N samples and P protein peaks # values An N x P matrix of P column vectors, each of size N # noise.var A vector of size P denoting the VARIANCE of the noise # interations.max The maximum number of times to iterate cluster optimization # cluster.min The smallest number of clusters to consider. Optimization is very slow for small # number of clusters, so it is much faster to not consider them. # verbose If TRUE the function will print a report of each clustering step ##################################################################### angleVectorQuantization <- function(values, noise.var, iterations.max = 50, cluster.min = max(floor(dim(values)[[2]]*0.03),1), verbose=TRUE ) { ## Check consistency of the parameters ## if (!is.numeric(cluster.min)) { stop("The minimum number of clusters must be numeric") } if (cluster.min < 1) { stop("The minimum number of clusters must be 1 or greater") } ## Check that parameters dimensions match ## num.features <- dim(values)[2] num.dimensions <- dim(values)[1] if (length(noise.var) != num.features) { stop( paste("Number of features", num.features,"does not match the number of noise values", length(noise.var) ,".") ) } ## Shift and scale each feature to have a mean of zero, standard deviation of 1 ## # First normalize all vector to have a mean of zero, unit magnitude values.mean <- apply(values, 2, mean, na.rm=TRUE) values.std <- sqrt( apply(values, 2, var, na.rm=TRUE) ) #* apply(!is.na(values), 2, mean) * ((num.dimensions-1)/num.dimensions) ) # Each column vector of "units" contains a feature with zero mean and magnitude equal to vector length units <- t( (t(values) - values.mean) / values.std ) # Set all missing values to zero after normalization # This is not ideal, but we are unsure what to do with NA otherwise units[is.na(units)] <- 0 # Calculate weights for each feature to be used for determining centroids, these are inversely proportional to the normalized noise noise <- noise.var / (values.std^2) ## Begin agglomerative clustering by assigning every feature to individual centroids ## ## The variables "features.dot" and "likelihood" are calculated here for every ## ## feature then updated as necessary. "features.dot" stores the dot-product ## ## between every feature and its centroid to compare to other dot-products when ## ## dot-products when assigning features to centroids. "likelihood" stores the ## ## contribute of each feature to the likelihood of the model. ## if (verbose) { print("Beginning clustering") } encoding <- 1:num.features best.encoding <- encoding if (verbose) { print("Creating initial centroids") } new.centroids <- findCentroids(units, noise, encoding) centroids <- new.centroids$centroids features.dot <- new.centroids$features.dot likelihood <- new.centroids$likelihood rm(new.centroids) if (verbose) { print("Initializing centroid correlations") } centroid.cors <- matrix( c( rep(0, num.features), rep(NA, num.features) ), nrow=2, byrow=TRUE ) for (centroid in 1:num.features) { centroid.cors <- centroidCorrelation(centroid, centroids, centroid.cors) } ## Initialize values to track the score for each stage of clustering ## best.centroids <- centroids agglom.likelihood <- numeric(0) maximum.likelihood <- Likelihood(likelihood, num.dimensions, num.features, max(encoding), encoding)[1] ## Agglomerative phase, join centroids and optimize each time. ## ## This chooses the "natural" number of clusters using the Bayes ## ## Information Criterion to balance parameters and goodness-of-fit ## while ( max(encoding) > cluster.min ) { ## Join the next pair of centroids ## # Determine the pair of centroids with highest correlation closest.centroids.idx <- closestCentroids(centroid.cors, centroids) # Report the state before joining if (verbose) { print( noquote( paste("Joining centroids:", closest.centroids.idx) ) ) print( noquote( paste("Before joining centroids have members = ", sum(encoding == closest.centroids.idx[1]), ", ", sum(encoding == closest.centroids.idx[2])) ) ) } # Join these two centroids joined.centroids <- joinCentroids(units, centroids, encoding, closest.centroids.idx, centroid.cors) encoding <- joined.centroids$encoding centroids <- joined.centroids$centroids centroids.affected <- joined.centroids$affected centroid.cors <- joined.centroids$centroid.cors cors.affected <- joined.centroids$cors.affected ## Update centroids and feature assignments with the new encodings. This assigns every feature to ## ## the closest centroid, then recalculates centroids based on feature assignments and noise weights. ## ## That iterates until no features change their centroid assignment. ## optimized.centroids <- optimizeCentroids(iterations.max, units, noise, encoding, centroids, centroid.cors, centroids.affected, features.dot, likelihood) encoding <- optimized.centroids$encoding centroids <- optimized.centroids$centroids features.dot <- optimized.centroids$features.dot likelihood <- optimized.centroids$likelihood iterations <- optimized.centroids$iterations cors.affected <- optimized.centroids$cors.affected centroids.affected <- optimized.centroids$centroids.affected centroid.cors <- optimized.centroids$centroid.cors ## Update correlations among any centroids that may have changed their nearest-neighbor ## for (centroid in unique(cors.affected)) { centroid.cors <- centroidCorrelation(centroid, centroids, centroid.cors) } # Calculate the new likelihood total.likelihood <- Likelihood(likelihood, num.dimensions, num.features, max(encoding), encoding) agglom.likelihood <- cbind(agglom.likelihood, c(total.likelihood, max(encoding))) # Record the current encoding if it gives the best likelihood if (total.likelihood[1] > maximum.likelihood) { maximum.likelihood <- total.likelihood[1] best.encoding <- encoding best.centroids <- centroids best.dots <- features.dot best.likelihood <- likelihood } ## Print a report on this clustering step ## if (verbose) { print( noquote( paste("Iterations to converge:", iterations, " Remaining centroids:", max(encoding)) ) ) print( noquote( paste("Likelihood:", round(total.likelihood[1],0) ) ) ) } } ## Report on the agglomerative clustering ## if (verbose) { print( paste("Agglomerative clustering ended with", max(best.encoding), "centroids, with a maximum log-likelihood of", round(maximum.likelihood, 4),".", sep=" ") ) } ## Calculate the contribution of each feature it its centroid ## # Each feature has a contribution proportional to the feature-centroid # dot-product and inversely proportional to the feature noise variance weights <- apply(units * best.centroids[, best.encoding], 2, sum) / noise weights <- sapply( 1:num.features, function(index, encoding, weights) { weights[index] / sum(weights[encoding == encoding[index]]) }, best.encoding, weights) return( list(centroids=best.centroids, encoding=best.encoding, likelihood=agglom.likelihood, weights=weights) ) } ###################################################################### ## ## ## ## ## Functions used by clustering ## ## ## ## ## ###################################################################### ##################################################################### # Optimizes centroids using k-means-like expectation maximization. # Returns a list containing new encoding, centroids, feature-centroid # dot-products, feature likelihoods, a vector of centroids hat have # changed, and a count of iterations to converge. ##################################################################### optimizeCentroids <- function(iterations.max, units, noise, encoding, centroids, centroid.cors=NA, centroids.affected, features.dot, likelihood) { # Initialize a list of centroids which need to have their centroid-centroid correlations updated cors.affected <- c(centroids.affected, which(centroid.cors[1,] %in% centroids.affected)) iterations <- 0 total.centroids.affected <- centroids.affected while (( length(centroids.affected) > 0 ) & (iterations < iterations.max)) { ## Calculate new centroids given current feature assignments ## new.centroids <- findCentroids(units, noise, encoding, unique(centroids.affected), features.dot, likelihood) features.dot <- new.centroids$features.dot likelihood <- new.centroids$likelihood centroids[,centroids.affected] <- new.centroids$centroids ## Assign each feature to the closest centroid ## cluster.result <- clusterByLeastAngle(units, noise, centroids, encoding, unique(centroids.affected), features.dot, likelihood) encoding <- cluster.result$encoding centroids.affected <- cluster.result$affected cors.affected <- c(cors.affected, centroids.affected, which(centroid.cors[1,] %in% centroids.affected)) total.centroids.affected <- unique( c(total.centroids.affected, centroids.affected) ) ## Remove any centroids that had no features clustered to them. ## ## Collapse the centroid and cross.scores matrices and the encoding ## ## vector to eliminate the lost centroid. ## remove.idx <- rev(which( !(1:max(encoding) %in% encoding) )) if ( length(remove.idx) > 0 ) { print("Centroids with no features:"); print(remove.idx) } for (centroid.idx in remove.idx) { ## Collapse the data structures ## centroids <- centroids[,-centroid.idx] encoding[ encoding > centroid.idx ] <- encoding[ encoding >= centroid.idx ] - 1 # centroids.affected lists the centroids modified by this round of reassignments centroids.affected <- centroids.affected[ centroids.affected != centroid.idx ] centroids.affected[ centroids.affected > centroid.idx ] <- centroids.affected[ centroids.affected > centroid.idx ] - 1 # Recalculate correlation for any centroid that considered this their closest match cors.affected <- c(cors.affected, which(centroid.cors[1,] == centroid.idx)) centroid.cors <- centroid.cors[,-centroid.idx] if ( length(centroid.cors)==2 ) { centroid.cors <- matrix(centroid.cors, nrow=2) } centroid.cors[1, centroid.cors[1,] >= centroid.idx] <- centroid.cors[1, centroid.cors[1,] >= centroid.idx] - 1 if ( any(cors.affected == centroid.idx) ) { cors.affected <- cors.affected[ -which(cors.affected == centroid.idx) ] } cors.affected[ cors.affected > centroid.idx ] <- cors.affected[ cors.affected > centroid.idx ] - 1 } iterations <- iterations + 1 } # Clean up the centroids if they didn't converge if ( length(centroids.affected) > 0 ) { new.centroids <- findCentroids(units, noise, encoding, unique(centroids.affected), features.dot, likelihood) features.dot <- new.centroids$features.dot likelihood <- new.centroids$likelihood centroids[,centroids.affected] <- new.centroids$centroids } return( list(encoding=encoding, centroids=centroids, features.dot=features.dot, likelihood=likelihood, centroids.affected=total.centroids.affected, centroid.cors=centroid.cors, cors.affected=cors.affected, iterations=iterations) ) } ##################################################################### # Find the weighted centroids of the set of column vectors which have # been grouped into clusters as specified by encoding. # Returns the result as a column vector. # Assumes a canonical labeling ##################################################################### findCentroids <- function(units, noise, encoding, affected=unique(encoding), features.dot=rep(0, length(encoding)), likelihood=rep(0, length(encoding))) { centroids <- NULL ## Compute new centroids for affected centroids ## for (j in affected) { mask <- (encoding == j) if (sum(mask) > 0) { computed.centroid <- findCentroid(units[, mask], noise[mask]) } else { print(paste("No encoding", j)) computed.centroid <- rep(NA, nrow(units)) } centroids <- cbind(centroids, computed.centroid) } ## Update the dot-products of features clustered to affected centroids ## centroid.idx <- match(encoding, affected) recompute.mask <- (!is.na(centroid.idx)) #These have to be recomputed # Compute new dot-products for features joined to affected centroids features.dot[recompute.mask] <- apply(units[,recompute.mask] * centroids[,centroid.idx[recompute.mask]], 2, sum) # Flip the direction of any centroid that points away from the majority of its members for (j in affected) { mask <- (encoding == j) if ( sum(sign(features.dot[mask])) < 0 ) { features.dot[mask] <- -features.dot[mask] centroids[, which(affected==j)] <- -centroids[, which(affected==j)] } } ## Update the likelihood of each feature clustered to an affected centroid ## likelihood[recompute.mask] <- features.dot[recompute.mask]^2 / noise[recompute.mask] * 0.5 return( list(centroids = centroids, features.dot = features.dot, likelihood = likelihood) ) } ##################################################################### # Find a single centroid, normalized to a unit vector. ##################################################################### findCentroid <- function(units, noise) { # Handle the degenerate case explicitly, because otherwise R makes a bad typing decision if (length(noise) == 1) return(units/sqrt(sum(units^2, na.rm=TRUE))) # Verify parameters if (length(units) == 0) stop("findCentroid got bad parameters: no vectors to find cluster") if (ncol(units) != length(noise)) stop("findCentroid got bad parameters: weights not same as number as vectors") # Involves finding the principle component of the matrix Sum(wj uj %*% t(uj)) num.dimensions <- nrow(units) matrix.M <- matrix(apply(units, 2, function(x){x %o% x}) %*% t(t(1 / noise)), nrow=num.dimensions) centroid <- matrix((eigen(matrix.M)$vectors)[,1], ncol=1) return(centroid/sqrt(sum(centroid^2))) } ##################################################################### # Return an index of classification for each column vector against the centroids # The centroid with the smallest absolute angle to the vector wins # # Parameters: # units matrix of column vectors where each vector is a feature # centroids matrix of column vectors where each vector represents the direction of a centroid # encoding assignments of features to clusters # affected a vector specifying which centroids have changed and need to be updated # features.dot the current best dot-product between each feature and its centroid # likelihood the likelihood of each feature given its current centroid # include.negative a boolean value where TRUE indicates that negative correlations should be treated the same as positive correlations # ##################################################################### clusterByLeastAngle <- function(units, noise, centroids, encoding, affected, features.dot, likelihood, include.negative = F) { old.encoding = encoding if (length(affected) > 0) { # v.mask are those vectors whose centroids have not been affected v.mask <- is.na(match(encoding, affected)) # Mark features that definitely have to have features.dot recomputed recompute.mask <- (!v.mask) # Compare the old dot products with those of affected centroids if (any(v.mask)) { if (include.negative) { v.affected.max.dot.products = apply(abs(t(centroids[,affected]) %*% as.matrix(units[,v.mask])), 2, max, na.rm=TRUE) } else { v.affected.max.dot.products = apply(t(centroids[,affected]) %*% as.matrix(units[,v.mask]), 2, max, na.rm=TRUE) } recompute.mask[v.mask] = (v.affected.max.dot.products > features.dot[v.mask]) } if (include.negative) { dot.products <- abs(t(centroids) %*% as.matrix(units[, recompute.mask])) } else { dot.products <- t(centroids) %*% as.matrix(units[, recompute.mask]) } # Return the index of the centroid with the highest dot product for each column of units reencoding.idx.value <- apply(dot.products, 2, maxIndexAndValue) encoding[recompute.mask] <- reencoding.idx.value[1,] } return( list(encoding = encoding, affected = findAffectedCentroids(old.encoding, encoding)) ) } ##################################################################### # Calculates the total likelihood of the clustering including the Bayes Information Criterion ##################################################################### Likelihood <- function(likelihood, dimensions, features, clusters, encoding) { model.likelihood <- sum(likelihood) bayes.info.criterion <- -clusters * (dimensions-1) / 2 * log(features) return( c(model.likelihood + bayes.info.criterion, model.likelihood, bayes.info.criterion) ) } ##################################################################### # Returns a list of centroid indexes which need to be recomputed based on changing assignments ##################################################################### findAffectedCentroids <- function(old.encoding, new.encoding) { change.mask <- (old.encoding != new.encoding) unique(c(old.encoding[change.mask], new.encoding[change.mask])) } ##################################################################### # Return both the index and value of the minimum in a vector ##################################################################### maxIndexAndValue <- function(x) { idx <- order(-x)[1] return(c(idx, x[idx])) } ##################################################################### # Joins together to centroids. Returns the a list containing the new # encoding, centroids, vector of affected centroid indices, and a vector of # centroid indices that need their centroid-centroid correlations updated. ##################################################################### joinCentroids <- function(units, centroids, encoding, join.idx, centroid.cors) { # Force the first index to be the smaller centroid index if (join.idx[1] > join.idx[2]) join.idx = rev(join.idx) # Determine new encoding encoding[ encoding == join.idx[2] ] <- join.idx[1] encoding[ encoding > join.idx[2] ] <- encoding[ encoding > join.idx[2] ] - 1 # Update centroid correlations centroid.cors <- centroid.cors[,-join.idx[2]] if ( length(centroid.cors) == 2 ) { centroid.cors <- matrix(centroid.cors, nrow=2) } centroid.cors[1, centroid.cors[1,]==join.idx[2] ] <- join.idx[1] centroid.cors[1, centroid.cors[1,]>=join.idx[2] ] <- centroid.cors[1,centroid.cors[1,]>=join.idx[2]] - 1 if ( length(centroid.cors) == 2 ) { centroid.cors <- matrix(centroid.cors, nrow=2) } cors.affected <- c(join.idx[1], which(centroid.cors[1,] == join.idx[1])) return( list(centroids = as.matrix(centroids[,-join.idx[2]]), encoding = encoding, affected = join.idx[1], centroid.cors = centroid.cors, cors.affected=cors.affected) ) } ##################################################################### # Calculate the correlation between a given centroid and all the others ##################################################################### centroidCorrelation <- function(index, centroids, centroid.cors){ cors <- cor(centroids[,index], centroids) cors[index] <- -Inf change.mask <- (cors > centroid.cors[1,]) if ( any(change.mask) ) { centroid.cors[1,change.mask] <- index centroid.cors[2,change.mask] <- cors[change.mask] } centroid.cors[,index] <- maxIndexAndValue(cors) return(centroid.cors) } ##################################################################### # Returns a vector of the indices of the two centroids with the highest # correlation. ##################################################################### closestCentroids <- function(centroid.cors, centroids, include.negative = F) { idx.1 <- which( centroid.cors[2,] == max(centroid.cors[2,]) )[1] return( c(idx.1, centroid.cors[1,idx.1]) ) } ##### Helper functions ########### printClustering <- function(peakNames, encoding) { sapply(unique(encoding), function(id, ids, peakNames){peakNames[id==ids]}, encoding, peakNames) } ####################### computeAverages <- function(peaks, identities) { if (ncol(peaks) != length(identities)) { stop("Number of columns does not match with length of identities") } results <- tapply(1:ncol(peaks), identities, function(columns, pp) { if (length(columns) < 2) return(list(vars=NA, counts=1)) return(list( means=apply(pp[,columns], 1, mean, na.rm=TRUE), vars=apply(pp[,columns], 1, var, na.rm=TRUE), counts=apply(!is.na(pp[,columns]), 1, sum) ) ) }, peaks) err.sq.sums <- 0 count.sums <- 0 avg.peaks <- NULL total.counts <- NULL identities <- NULL for (j in 1:length(results)) { z <- results[[j]] z$vars[is.na(z$vars)] <- 0 err.sq.sums <- err.sq.sums + (z$counts-1)*z$vars count.sums <- count.sums + (z$counts-1)*(z$vars > 0) if (!is.null(z$means)) { avg.peaks <- rbind(avg.peaks, z$means) total.counts <- rbind(total.counts, z$counts) identities <- c(identities, names(results)[j]) } } noise.std <- sqrt(err.sq.sums/count.sums) return(list(identities=identities, avg.peaks=avg.peaks, noise.std=noise.std, total.counts=total.counts)) }