R code for A High-Dimensional Cell-Phenotype Visualization Tool
21 May 2020: Bug has been fixed so that clusters’ integer labels greater than 9 now print properly.
# Supply SupStarPlots function with five items.
# 1) Dataframe DF: First column of dataframe must contain clusters' sequential integer
# labels, 1, 2, 3,..., starting at 1 and without gap in sequence.
# 2) Sequence of penalty values for lambda1. Example, seq(0.001, 20.001, 1).
# 3) Sequence of penalty values for lambda2. Example, seq(0.001, 20.001, 1).
# 4) Positive, real, scalar value M for tuning all markers' vector lengths in plot. M
# typically 1.0 or near this value.
# 5) Character vector of color names for ColOpt, the length of which should equal the
# number of clusters in DF. Example, c("navy blue", "cyan", "brown")
# EXAMPLE USAGE:
# ColorOptions <- c("navy blue", "cyan", "brown")
# SupStarPlots(DF = FakeData, lambda1 = seq(0.001, 20.001, 1), lambda2 = seq(0.001, 20.001, 1),
# ColOpt = ColorOptions, M = 1.0)
# COMMENTS:
# 1) Penalized supervised star plot method is applicable to 3 or more clusters.
# 2) User can set lambda1 to single small value (e.g., lambda1 = 0.001) to produce better
# separation of clusters in current sample's plot at cost of reducing reproducibility
# in new samples.
# 3) Please refer to VIRAL IMMUNOLOGY 32:1-8 for
# all details on the method.
# Load the four necessary packages.
library(matrixcalc)
library(plotrix)
library(JPEN)
library(VCA)
# The penalized supervised star plot function.
SupStarPlots <- function(DF, lambda1, lambda2, ColOpt, M) {
# Create a new data frame 1) ordered by clusters' integer labels and 2) with marker data
# centered and scaled.
ScRMat <- data.frame(Cluster = DF[order(DF[, 1]), 1], scale(DF[order(DF[, 1]), -1]))
# Quantity of observations.
lil.n <- nrow(ScRMat)
# Quantity of markers.
lil.p <- ncol(ScRMat[, -1])
# Quantity of clusters.
kappa <- length(unique(DF[, 1]))
# Function for estimating importance values. Runs separately by marker.
VCfunc <- function(colm) {
# Create data frame for analysis of variance.
To.AOV <- data.frame(y = ScRMat[, colm], Cluster = ScRMat$Cluster)
# Perform analysis of variance. [NOTE: Following line was updated in Feb 2020 to reflect
# change in VCA package.]
AOV.obj <- anovaVCA(y ~ Cluster, Data = To.AOV)[[16]]
# Estimate importance value from two estimated variance components.
lil.r <- AOV.obj[1] / AOV.obj[2]
# Corrects any negative estimate of importance value. [NOTE: May not be needed but included
# as a safeguard.]
return(max(0, lil.r))
}
# Vectorized (by marker) implementation of function for estimating importance value.
lil.rs <- apply(matrix(1 + (1:lil.p), 1, lil.p), 2, VCfunc)
# Create data frame of estimated importance values.
Importance <- data.frame(Marker = colnames(ScRMat[, -1]), r = lil.rs)
# Print reverse-ordered estimated importance values. [NOTE: Some users may wish to add export
# feature here.]
print(Importance[rev(order(Importance$r)),])
# Calculate matrix R from paper.
RMat <- as.matrix(ScRMat[, 1 + rev(order(lil.rs))])
# Generator of components of matrix Y from paper.
YGen <- function(a) {c(cos(a * 2 * pi / kappa), sin(a * 2 * pi / kappa))}
# Populate that Y matrix.
Seq <- as.integer(DF[order(DF[, 1]), 1] - 1)
Y <- matrix(YGen(Seq), lil.n, 2)
# Center and scale Y matrix.
YMat <- scale(Y)
# Generate matrix to perform double summation differencing in second (i.e., fused) penalty term
# of expression (1) in paper.
ZMat <- matrix(0, lil.p, lil.p)
diag(ZMat) <- 1
ZMat[1, 1] <- 0
# Subdiagonal of all -1's to implement differencing in double summation of second (i.e., fused)
# penalty term of expression (1) in paper.
for (i in 2:lil.p) {ZMat[i-1, i] <- -1}
# Objective function = expression (1) in paper.
Obj <- function(pvec) {
# Generate matrix Q in paper.
QMat <- matrix(as.vector(pvec), lil.p, 2, byrow = F)
# Generate matrix E in paper.
EMat <- YMat[TVS == 1,] - RMat[TVS == 1,] %*% QMat
# Evaluate objective function.
Term1 <- frobenius.norm(EMat) / sum(TVS == 1)
Term2 <- Pn[1] * (tr(t(QMat) %*% QMat) / lil.p)
Term3 <- Pn[2] * (sum((t(QMat) %*% ZMat) ^ 2) / (lil.p - 1))
ObjVal <- Term1 + Term2 + Term3
# Return objective function value to optimizer (base R function optim).
return(ObjVal)
}
# Initiate index for storage of cross-validation results from all user-supplied combinations of
# values of two penalty parameters.
Cntr <- 0
# Initialize penalties at zero.
Strt <- c(rep(0, 2 * lil.p))
# Initialize matrix for storage of cross-validation results.
Results <- matrix(NA, nrow = length(lambda1) * length(lambda2), ncol = 1 + 2 * lil.p)
# Loop over values of first penalty parameter.
for (j in lambda1) {
# Loop over values of second penalty parameter.
for (k in lambda2) {
# Random training-validation partitioning. TVS = 1 for training. TVS = 0 for validation.
# [NOTE: Consider fixing seed of random number generator.]
TVS <- rbinom(n = nrow(YMat), size = 1, prob = 0.6)
# Increment index for storage of cross-validation results.
Cntr <- Cntr + 1
# Current penalty vector (pair).
Pn <- c(j, k)
# Run optimizer at current penalty vector.
MinObj <- optim(par = as.vector(Strt), fn = Obj, method = "BFGS")
# Calculate matrix Q, from paper, based on current optimum at current penalty vector.
QMat <- matrix(as.numeric(MinObj$par), lil.p, 2, byrow = F)
# Calculate matrix E, from paper, based on current optimum at current penalty vector.
EMat <- YMat[TVS == 0,] - RMat[TVS == 0,] %*% QMat
# Calculate cross-validation error as the Frobenius (norm) mean square error.
FMSE <- frobenius.norm(EMat) / sum(TVS == 0)
# Populate current row in results matrix.
Results[Cntr, ] <- c(FMSE, MinObj$par)
# Print penalty vector and cross-validation Frobenius (norm) mean square error.
cat("Penalties = ", Pn, "Validation FMSE = ", FMSE, "\n")
}
}
# Extract Q matrix of minimum cross-validation Frobenius (norm) mean square error.
QMatOpt <- matrix(as.numeric(Results[which.min(Results[, 1]), 1 + 1:(2 * lil.p)]), lil.p, 2, byrow = F)
# Calculate corresponding optimal projection of data in star plot space.
OptCoords <- RMat %*% QMatOpt
# Print cross-validated optimal Q matrix and optimal data coordinates.
print("Estimated projection coefficients:")
print(QMatOpt)
print("Estimated PSS coordinates:")
print(OptCoords)
# Plot cross-validated optimal data coordinates of star plot.
plot(OptCoords,
# Star plot axes.
xlab = "1st Starplot Coordinate", ylab = "2nd Starplot Coordinate", col.axis = "black",
# [NOTE: do.first determines background color of star plot.]
do.first = plot_bg(col = "white"), type = "n")
# Plot data labeled by cluster. Color code labels by cluster.
text(OptCoords, labels = ScRMat[,1], col = ColOpt[as.vector(DF[order(DF[, 1]), 1])], cex = 0.45)
# Normalize optimal Q matrix's row vectors (one vector per marker) to scale of data.
PlotQMatOpt <- M * 0.8 * max(abs(OptCoords)) * (QMatOpt / max(sqrt(diag(QMatOpt %*% t(QMatOpt)))))
# Plot those normalized vectors.
arrows(rep(0, lil.p), rep(0, lil.p), PlotQMatOpt[, 1], PlotQMatOpt[, 2], length = 0.05, col = "black")
# Label those normalized vectors by marker.
text(1.05 * PlotQMatOpt, labels = colnames(ScRMat[, 1 + rev(order(lil.rs))]), cex = 0.5, col = "black")
# Return cross-validated optimal Q matrix and optimal data coordinates.
return(list(QMatOpt, OptCoords))
}