## ## AVQ Example.R ## ## Scott Carlson, smcarlson@gmail.com ## Harvey Cohen Lab, Stanford University School of Medicine ## ## Example code showing how to use angle-vector quantization. The first ## example loads data from a provided file, the second shows how it can ## be called using R data objects. ## ## Load the source code ## # SET THIS TO THE DIRECTORY OF THE EXAMPLE CODE AND DATA path <- "C:\\Documents and Settings\\Scott Carlson\\My Documents\\Biostatistics\\Experiments\\Papers\\Clustering\\Public\\Code and Examples" setwd(path) source("AVQ Cluster.R") ## Clustering on KD data loaded from a file ## # SET THESE TO THE INPUT FILE AND DESIRED OUTPUT FILES input.path <- "KD data.txt" output.centroids <- "KD centroids.txt" output.clusters <- "KD clusters.txt" clusters <- AVQ.Cluster(input.path=input.path, output.data=output.centroids, output.clusters=output.clusters, verbose=FALSE) ## Clustering on R objects ## load("KD data.RData") output.centroids <- "KD centroids.txt" output.clusters <- "KD clusters.txt" clusters <- AVQ.Cluster(values=KD.data, features=features, identities=identities, output.data=output.centroids, output.clusters=output.clusters, verbose=FALSE) ## Print some summaries ## # Correlation matrices sapply( unique(clusters$encoding), function(index, values, encoding){ print(round(cor(matrix( values[,index==encoding], ncol=sum(index==encoding)) ),2)) }, clusters$original, clusters$encoding) print(noquote(c("These are correlation matrices for each features in each cluster. The correlation heatmap generated", "below may be easier to read than numerical output."))) # Histogram of cluster sizes cluster.sizes <- sapply(1:max(clusters$encoding), function(index, encoding){sum(index==encoding)}, clusters$encoding) hist( cluster.sizes, xlab="Cluster size (features)", ylab="Number of clusters", main="Distribution of Cluster Sizes", breaks=30, col="grey") print(noquote(c("The histogram shows how many features are in each cluster. Large clusters are usually highly", "abundant proteins like albumin, hemoglobin, immunoglobulin, etc."))) # Show a heatmap or correlation coefficients correlation <- cor( clusters$original[ , unlist( sapply( order(-cluster.sizes), function(cluster, encoding){which(encoding==cluster)}, clusters$encoding ) ) ] ) library(sma) x11() plot.cor( correlation, zlim=c(-1,1), title="Correlation matrix" ) print(noquote(c("The correlation heatmap has features grouped into clusters and sorted by decreasing cluster size.","Red blocks along the diagonal show correlated features from the largest clusters, and off-diagonal", "blocks show clusters that have some positive or negative correlation with each other.")))