R code for A High-Dimensional Cell-Phenotype Visualization Tool

21 May 2020: A bug has been fixed so that clusters' integer labels greater than 9 now print properly.


# Supply the SupStarPlots function with five items.
# 1) Dataframe DF: First column of dataframe must contain clusters' sequential integer
#      labels, 1, 2, 3, ..., starting at 1 and without gaps in the sequence.
# 2) Sequence of penalty values for lambda1. Example: seq(0.001, 20.001, 1).
# 3) Sequence of penalty values for lambda2. Example: seq(0.001, 20.001, 1).
# 4) Positive, real, scalar value M for tuning all markers' vector lengths in the plot.
#      M is typically 1.0 or near this value.
# 5) Character vector of color names for ColOpt, the length of which should equal the
#      number of clusters in DF. Example: c("navyblue", "cyan", "brown").


# EXAMPLE USAGE:
# ColorOptions <- c("navyblue", "cyan", "brown")
# SupStarPlots(DF = FakeData, lambda1 = seq(0.001, 20.001, 1), lambda2 = seq(0.001, 20.001, 1),
#     ColOpt = ColorOptions, M = 1.0)
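
# The expected shape of DF (hypothetical marker names and values, for illustration only;
# a runnable synthetic-data demo appears at the end of this script):
#     Cluster  MarkerA  MarkerB  MarkerC
#           1     0.12     1.30     0.44
#           1    -0.53     0.98     0.71
#           2     1.41    -0.20     1.05
#           ...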


# COMMENTS:
# 1) The penalized supervised star plot method is applicable to 3 or more clusters.
# 2) The user can set lambda1 to a single small value (e.g., lambda1 = 0.001) to produce
#      better separation of clusters in the current sample's plot, at the cost of reduced
#      reproducibility in new samples (see the sketch after this list).
# 3) Please refer to VIRAL IMMUNOLOGY 32:1-8 for all details on the method.
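
# For instance, to fix lambda1 at a single small value while still tuning lambda2
# (hypothetical call, reusing the FakeData and ColorOptions objects from the example above):
# SupStarPlots(DF = FakeData, lambda1 = 0.001, lambda2 = seq(0.001, 20.001, 1),
#     ColOpt = ColorOptions, M = 1.0)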

# Load the four necessary packages.
library(matrixcalc)
library(plotrix)
library(JPEN)
library(VCA)
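
# [NOTE: If any of these packages are missing, they can be installed from CRAN, e.g.,
# install.packages(c("matrixcalc", "plotrix", "JPEN", "VCA")).]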


# The penalized supervised star plot function.


SupStarPlots <- function(DF, lambda1, lambda2, ColOpt, M) {
    # Create a new data frame 1) ordered by clusters' integer labels and 2) with marker data
    # centered and scaled.
    ScRMat <- data.frame(Cluster = DF[order(DF[, 1]), 1], scale(DF[order(DF[, 1]), -1]))
    # Quantity of observations.
    lil.n <- nrow(ScRMat)
    # Quantity of markers.
    lil.p <- ncol(ScRMat[, -1])
    # Quantity of clusters.
    kappa <- length(unique(DF[, 1]))
    # Function for estimating importance values. Runs separately by marker.


    VCfunc <- function(colm) {
        # Create data frame for analysis of variance.
        To.AOV <- data.frame(y = ScRMat[, colm], Cluster = ScRMat$Cluster)
        # Perform analysis of variance. [NOTE: Following line was updated in Feb 2020 to reflect
        # change in VCA package.]
        AOV.obj <- anovaVCA(y ~ Cluster, Data = To.AOV)[[16]]
        # Estimate importance value from two estimated variance components.
        lil.r <- AOV.obj[1] / AOV.obj[2]
        # Corrects any negative estimate of importance value. [NOTE: May not be needed but included
        # as a safeguard.]
        return(max(0, lil.r))
    }
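    # For example (hypothetical numbers): if the estimated between-cluster variance component
    # were 2.4 and the residual variance component 0.6, the importance value would be
    # 2.4 / 0.6 = 4; markers that separate the clusters strongly receive large values.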
    # Apply the importance-value function to each marker column (columns 2 through lil.p + 1
    # of ScRMat).
    lil.rs <- sapply(1 + seq_len(lil.p), VCfunc)
    # Create data frame of estimated importance values.
    Importance <- data.frame(Marker = colnames(ScRMat[, -1]), r = lil.rs)
    # Print estimated importance values, largest first. [NOTE: Some users may wish to add an
    # export feature here.]
    print(Importance[rev(order(Importance$r)),])
    # Calculate matrix R from the paper: the scaled marker columns, ordered by decreasing
    # importance.
    RMat <- as.matrix(ScRMat[, 1 + rev(order(lil.rs))])
    # Generator of components of matrix Y from paper.
    YGen <- function(a) {c(cos(a * 2 * pi / kappa), sin(a * 2 * pi / kappa))}
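    # For example, with kappa = 3 clusters the target vertices are YGen(0) = (1, 0),
    # YGen(1) = (cos(2*pi/3), sin(2*pi/3)) ~ (-0.5, 0.87), and YGen(2) ~ (-0.5, -0.87):
    # equally spaced points on the unit circle, one per cluster.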
    # Populate that Y matrix.
    Seq <- as.integer(DF[order(DF[, 1]), 1] - 1)
    Y <- matrix(YGen(Seq), lil.n, 2)
    # Center and scale Y matrix.
    YMat <- scale(Y)
    # Generate matrix to perform double summation differencing in second (i.e., fused) penalty term
    # of expression (1) in paper.
    ZMat <- matrix(0, lil.p, lil.p)
    diag(ZMat) <- 1
    ZMat[1, 1] <- 0
    # Superdiagonal of all -1's to implement differencing in the double summation of the second
    # (i.e., fused) penalty term of expression (1) in the paper.
    for (i in 2:lil.p) {ZMat[i-1, i] <- -1}
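    # For example, with lil.p = 3 markers:
    #     ZMat = | 0 -1  0 |
    #            | 0  1 -1 |
    #            | 0  0  1 |
    # so that column j of t(QMat) %*% ZMat equals q_j - q_(j-1) for j >= 2 (and zero for
    # j = 1), where q_j is the j-th row of QMat; this penalizes differences between
    # consecutive markers' coefficient vectors.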
    # Objective function = expression (1) in the paper. [NOTE: The penalty pair Pn and the
    # training indicator TVS are read from the enclosing function's environment.]
    Obj <- function(pvec) {
        # Generate matrix Q in paper.
        QMat <- matrix(as.vector(pvec), lil.p, 2, byrow = F)
        # Generate matrix E in paper.
        EMat <- YMat[TVS == 1,] - RMat[TVS == 1,] %*% QMat
        # Evaluate objective function.
        Term1 <- frobenius.norm(EMat) / sum(TVS == 1)
        Term2 <- Pn[1] * (tr(t(QMat) %*% QMat) / lil.p)
        Term3 <- Pn[2] * (sum((t(QMat) %*% ZMat) ^ 2) / (lil.p - 1))
        ObjVal <- Term1 + Term2 + Term3
        # Return objective function value to optimizer (base R function optim).
        return(ObjVal)
    }
    # Initiate index for storage of cross-validation results from all user-supplied combinations of
    # values of two penalty parameters.
    Cntr <- 0
    # Initialize the optimizer's starting values (all 2 * lil.p entries of Q) at zero.
    Strt <- rep(0, 2 * lil.p)
    # Initialize matrix for storage of cross-validation results.
    Results <- matrix(NA, nrow = length(lambda1) * length(lambda2), ncol = 1 + 2 * lil.p)
    # Loop over values of first penalty parameter.
    for (j in lambda1) {
        # Loop over values of second penalty parameter.
        for (k in lambda2) {
            # Random training-validation partitioning. TVS = 1 for training; TVS = 0 for validation.
            # [NOTE: Consider fixing the seed of the random number generator, e.g., with set.seed(),
            # for reproducible partitions.]
            TVS <- rbinom(n = nrow(YMat), size = 1, prob = 0.6)
            # Increment index for storage of cross-validation results.
            Cntr <- Cntr + 1
            # Current penalty vector (pair).
            Pn <- c(j, k)
            # Run optimizer at current penalty vector.
            MinObj <- optim(par = as.vector(Strt), fn = Obj, method = "BFGS")
            # Calculate matrix Q, from paper, based on current optimum at current penalty vector.
            QMat <- matrix(as.numeric(MinObj$par), lil.p, 2, byrow = F)
            # Calculate matrix E, from paper, based on current optimum at current penalty vector.
            EMat <- YMat[TVS == 0,] - RMat[TVS == 0,] %*% QMat
            # Calculate cross-validation error as the Frobenius (norm) mean square error.
            FMSE <- frobenius.norm(EMat) / sum(TVS == 0)
            # Populate current row in results matrix.
            Results[Cntr, ] <- c(FMSE, MinObj$par)
            # Print penalty vector and cross-validation Frobenius (norm) mean square error.
            cat("Penalties = ", Pn, "Validation FMSE = ", FMSE, "\n")
        }
    }
    # Extract Q matrix of minimum cross-validation Frobenius (norm) mean square error.
    QMatOpt <- matrix(as.numeric(Results[which.min(Results[, 1]), 1 + 1:(2 * lil.p)]), lil.p, 2, byrow = F)
    # Calculate corresponding optimal projection of data in star plot space.
    OptCoords <- RMat %*% QMatOpt
    # Print cross-validated optimal Q matrix and optimal data coordinates.
    print("Estimated projection coefficients:")
    print(QMatOpt)
    print("Estimated PSS coordinates:")
    print(OptCoords)
    # Plot cross-validated optimal data coordinates of star plot.
    plot(OptCoords,
            # Star plot axes.
            xlab = "1st Starplot Coordinate", ylab = "2nd Starplot Coordinate", col.axis = "black",
            # [NOTE: do.first determines background color of star plot.]
            do.first = plot_bg(col = "white"), type = "n")
    # Plot data labeled by cluster. Color code labels by cluster.
    text(OptCoords, labels = ScRMat[,1], col = ColOpt[as.vector(DF[order(DF[, 1]), 1])], cex = 0.45)


    # Normalize the optimal Q matrix's row vectors (one vector per marker) to the scale of the
    # data: with M = 1, the longest marker arrow spans 80% of the largest absolute coordinate.
    PlotQMatOpt <- M * 0.8 * max(abs(OptCoords)) * (QMatOpt / max(sqrt(diag(QMatOpt %*% t(QMatOpt)))))
    # Plot those normalized vectors.
    arrows(rep(0, lil.p), rep(0, lil.p), PlotQMatOpt[, 1], PlotQMatOpt[, 2], length = 0.05, col = "black")
    # Label those normalized vectors by marker.
    text(1.05 * PlotQMatOpt, labels = colnames(ScRMat[, 1 + rev(order(lil.rs))]), cex = 0.5, col = "black")
    # Return the cross-validated optimal Q matrix and optimal data coordinates.
    return(list(QMatOpt = QMatOpt, OptCoords = OptCoords))
}
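
# DEMO (hypothetical): a minimal end-to-end run on synthetic data. Everything below is an
# illustrative sketch, not data or settings from the paper; the marker names, cluster means,
# and shortened penalty sequences are made up to keep the example small and fast.
set.seed(1)
FakeData <- data.frame(Cluster = rep(1:3, each = 50),
    MarkerA = rnorm(150, mean = rep(c(0, 2, 4), each = 50)),
    MarkerB = rnorm(150, mean = rep(c(3, 0, 1), each = 50)),
    MarkerC = rnorm(150))
ColorOptions <- c("navyblue", "cyan", "brown")
DemoFit <- SupStarPlots(DF = FakeData, lambda1 = seq(0.001, 5.001, 1),
    lambda2 = seq(0.001, 5.001, 1), ColOpt = ColorOptions, M = 1.0)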