####
### ## functions.R
## Ryan Tibshirani
## ryantibs@stanford.edu
##
## Here are some useful functions for programming in R.
## My intention was to optimize the functions for speed, but I always feel a bit
## goofy programming in R, so please email me if you see anything that could be
## improved.
## If for some reason you can't get these functions to work: take a deep breath,
## and send me an email clearly explaining the problem.
###
####

#### I. Statistical tests
#### --------------------

## Function: varr
##
## Notes:
## Helper function for tTest. Computes, for each row of x, the sum of squared
## deviations from the supplied row center m, divided by ncol(x) - 1 (i.e. the
## sample variance when m = rowMeans(x)).
##
## Arguments: x - matrix of data, features on the rows and samples on the columns
##            m - vector of row centers, one per row of x
##
varr = function(x, m) {
  n = ncol(x)
  # Subtract each row's center from every entry of that row.
  d = x - m %*% t(matrix(1, nrow=n, ncol=1))
  # Row-wise sum of squared deviations, with the n - 1 divisor.
  drop(rowSums(d^2) / (n-1))
}

## Function: tTest
##
## Notes: returns an (unpaired, pooled-variance) t statistic for each feature
## in the matrix x.
##
## Arguments: x - matrix of data, features on the rows and samples on the columns
##            y - vector of class labels (entries must be 1s and 2s)
##            s0 - optional parameter indicating a baseline standard deviation
##                 added to the denominator (as in SAM's "fudge factor")
##
tTest = function(x, y, s0=0) {
  n1 = sum(y==1)
  n2 = sum(y==2)
  # drop=FALSE keeps single-column selections as matrices, so a class with
  # only one sample does not break varr (which calls ncol on its argument).
  m1 = rowMeans(x[, y==1, drop=FALSE])
  m2 = rowMeans(x[, y==2, drop=FALSE])
  # Pooled standard error of the difference in means.
  s = sqrt( ((n2-1) * varr(x[, y==2, drop=FALSE], m2) +
             (n1-1) * varr(x[, y==1, drop=FALSE], m1)) *
            (1/n1 + 1/n2) / (n1+n2-2) )
  (m2-m1) / (s+s0)
}

## Function: wilcoxTest
##
## Notes: returns a (centered, scaled) Wilcoxon rank sum statistic for each
## feature in the matrix x.
##
## Arguments: x - matrix of data, features on the rows and samples on the columns
##            y - vector of class labels (entries must be 1s and 2s)
##            s0 - optional parameter indicating a baseline standard deviation
##
wilcoxTest = function(x, y, s0=0) {
  n1 = sum(y==1)
  n2 = sum(y==2)
  # Rank each feature across all samples, then sum the class-2 ranks per row.
  r2 = rowSums(t(apply(x, 1, rank))[, y==2, drop=FALSE])
  # Center by the minimum possible rank sum and the null expectation n1*n2/2.
  W = r2 - (n2/2)*(n2+1) - (n1*n2)/2
  # Standard deviation of W under the null (no ties).
  s = sqrt(n1*n2*(n1+n2+1)/12)
  W / (s+s0)
}

#### II.
#### Classification
#### --------------

## Function: getClassifiers1d
##
## Notes: given the matrix of data x and class labels y, returns a matrix
## describing the best single-dimensional (threshold) classifiers, one
## candidate per feature.
##
## Arguments: x - matrix of data, features on the rows and samples on the columns
##            y - vector of class labels (entries must be 1s and 2s)
##            w1,w2 - optional weighting factors for the errors in class 1 and 2
##                    respectively (warning: using these will change the
##                    reported "training error"!)
##            numClassifiers - optional parameter, indicating the number of
##                    classifiers to return (capped at nrow(x))
##
getClassifiers1d = function(x, y, w1=NULL, w2=NULL, numClassifiers=10) {
  # If the rows of x are not named, assign them dummy names.
  if (is.null(rownames(x))) {
    rownames(x) = as.character(seq_len(nrow(x)))
  }
  # If w1 and w2 are not specified, default them to 1.
  if (is.null(w1)) {
    w1 = 1
  }
  if (is.null(w2)) {
    w2 = 1
  }

  # Sort each feature (row), and permute the class labels accordingly.
  Y = matrix(nrow=nrow(x), ncol=ncol(x))
  for (i in seq_len(nrow(x))) {
    ind = order(x[i,])
    x[i,] = x[i,ind]
    Y[i,] = y[ind]
  }
  n1 = sum(y==1)
  n2 = sum(y==2)

  # For each feature, determine the rule and compute the weighted error at
  # each split point.
  # (Note: for the matrix R, a value of 1 means that class 1 is above the
  # split point, and similarly for 2. H holds the split heights, i.e. the
  # midpoints between consecutive sorted values.)
  R = matrix(nrow=nrow(x), ncol=length(y)-1)
  E = matrix(nrow=nrow(x), ncol=length(y)-1)
  H = matrix(nrow=nrow(x), ncol=length(y)-1)
  sum1 = numeric(length=nrow(x))
  sum2 = numeric(length=nrow(x))
  # Returns c(index, value) of the (first) minimum of v.
  mins = function(v) {
    k = which.min(v)
    c(k, v[k])
  }
  for (i in seq_len(ncol(E))) {
    # Update the running per-feature counts of class-1 / class-2 samples
    # at or below split point i.
    d = Y[,i]==1
    sum1 = sum1 + d
    sum2 = sum2 + !d
    # Weighted error of each of the two possible rules at this split point:
    # e1 = "class 1 above the split", e2 = "class 2 above the split".
    e1 = sum1*w1 + (n2-sum2)*w2
    e2 = (n1-sum1)*w1 + sum2*w2
    temp = t(apply(cbind(e1, e2), 1, mins))
    R[,i] = temp[,1]
    E[,i] = temp[,2]
    H[,i] = (x[,i] + x[,i+1])/2
  }

  # For each feature, choose the split point with the lowest training error,
  # and record the corresponding rule and height.
  e = t(apply(E, 1, mins))
  r = numeric(length=nrow(e))
  h = numeric(length=nrow(e))
  for (i in seq_len(nrow(e))) {
    r[i] = R[i, e[i,1]]
    h[i] = H[i, e[i,1]]
  }

  # Among all features, take the best numClassifiers classifiers. Cap the
  # count so that asking for more classifiers than there are features does
  # not produce NA-filled rows.
  numClassifiers = min(numClassifiers, nrow(e))
  ind = order(e[,2])[seq_len(numClassifiers)]

  # Return a matrix with these results.
  M = cbind(ind, r[ind], h[ind], e[ind, 2])
  rownames(M) = rownames(x)[ind]
  colnames(M) = c("index", "rule", "split", "training_error")
  return(M)
}

## Function: plotClassifier1d
##
## Notes: plots a classifier from the matrix returned by getClassifiers1d.
## I.e. if c1d denotes the matrix returned by getClassifiers1d, to plot the
## best classifier, call plotClassifier1d(x, y, c1d[1,]). To plot the kth best
## classifier, call plotClassifier1d(x, y, c1d[k,]).
##
## Arguments: x - matrix of data, features on rows and samples on columns
##            y - vector of class labels (1s and 2s)
##            r - row of the matrix returned by getClassifiers1d
##            col1, col2 - optional parameters indicating the colors for the
##                         two classes
##            ... - any additional plotting parameters
##
plotClassifier1d = function(x, y, r, col1="blue", col2="red", ...) {
  # Class 1 at abscissa 1, class 2 at abscissa 2, split as a horizontal line.
  plot(rep(1, sum(y==1)), x[r[1], y==1], xlim=c(0,3), col=col1, ...)
  points(rep(2, sum(y==2)), x[r[1], y==2], col=col2, ...)
  abline(h=r[3])
}

## Function: plotClassifiers1d
##
## Notes: writes one jpeg per row of M (a matrix returned by getClassifiers1d)
## to files named paste0(root, i, ".jpg"), each annotated with the feature
## name and training error.
##
## Arguments: x - matrix of data, features on rows and samples on columns
##            y - vector of class labels (1s and 2s)
##            M - matrix returned by getClassifiers1d
##            root - path prefix for the output jpeg files
##            col1, col2 - optional colors for the two classes
##            ... - any additional plotting parameters
##
plotClassifiers1d = function(x, y, M, root, col1="blue", col2="red", ...) {
  for (i in seq_len(nrow(M))) {
    jpeg(file=paste0(root, i, ".jpg"))
    plot(rep(1, sum(y==1)), x[M[i,1], y==1], xlim=c(0,3), col=col1,
         main=paste0(rownames(x)[M[i,1]], "\ntraining error=", M[i,4],
                     " (", round(M[i,4]/length(y)*100, digits=2), "%)"),
         ...)
    points(rep(2, sum(y==2)), x[M[i,1], y==2], col=col2, ...)
    abline(h=M[i,3])
    dev.off()
  }
}

## Function: getClassifiers2d
##
## Notes: given the matrix x with classes indicated by y, returns a matrix
## describing the best two-dimensional (logistic regression) classifiers.
##
## Arguments: x - matrix of data, features on rows and samples on columns
##            y - vector of class labels (1s and 2s)
##            w1,w2 - optional weighting factors, as in getClassifiers1d
##            numClassifiers - optional parameter indicating the number of
##                    classifiers to return
##            depth - the total number of features to look at when computing
##                    the 2d classifiers (i.e. from these features, all
##                    depth*(depth-1)/2 pairs of features will be tried)
##            ind - if depth is specified and you know exactly which pool of
##                    features to use when trying combinations, you can
##                    specify their indices (as rows of x) here. Otherwise,
##                    the function just takes the best 1d classifiers.
##
getClassifiers2d = function(x, y, w1=NULL, w2=NULL, numClassifiers=10,
                            depth=50, ind=NULL) {
  # If the rows of x are not named, assign them dummy names.
  if (is.null(rownames(x))) {
    rownames(x) = as.character(seq_len(nrow(x)))
  }
  # If w1 and w2 are not specified, default them to 1.
  if (is.null(w1)) {
    w1 = 1
  }
  if (is.null(w2)) {
    w2 = 1
  }
  # If the pool of feature indices has not been specified, take the best
  # `depth` 1d classifiers.
  if (is.null(ind)) {
    ind = getClassifiers1d(x, y, numClassifiers=depth)[,"index"]
  }

  # Fits a weighted logistic regression of (y-1) on the pair of features
  # (x1, x2); returns the 3 coefficients, the weighted training error of the
  # 0.5-threshold rule, and the model deviance.
  fitLinearModel = function(x1, x2, y, w1, w2) {
    model = glm((y-1) ~ cbind(x1, x2), family="binomial",
                weights=(w1*(y==1) + w2*(y==2)))
    result = predict(model, type="response")
    result = 1 + (result > 0.5)   # back to 1/2 labels
    error = w1*sum(y==1 & result==2) + w2*sum(y==2 & result==1)
    c(model$coefficients, error, model$deviance)
  }

  # One row per (unordered) pair of features from the pool.
  E = matrix(nrow=depth*(depth-1)/2, ncol=7)
  colnames(E) = c("index1", "index2", "b0", "b1", "b2",
                  "training_error", "deviance")
  rownames(E) = character(length=nrow(E))
  count = 1
  for (i in ind) {
    for (j in ind[ind > i]) {
      E[count,] = c(i, j, fitLinearModel(x[i,], x[j,], y, w1, w2))
      rownames(E)[count] = paste(rownames(x)[i], rownames(x)[j], sep=", ")
      count = count + 1
    }
  }

  # Among all pairs, take the best numClassifiers classifiers. Cap at the
  # number of pairs actually fitted so no NA-filled rows are returned; NA
  # rows (unfilled slots) sort last under order() and are excluded.
  k = order(E[,6])[seq_len(min(numClassifiers, count-1))]
  return(E[k, , drop=FALSE])
}

## Function: plotClassifier2d
##
## Notes: plots a classifier from the matrix returned by getClassifiers2d.
## I.e. if the returned object from getClassifiers2d is c2d, to plot the kth
## best 2d classifier you call plotClassifier2d(x, y, c2d[k,]).
##
## Arguments: x - matrix of data, features on the rows, samples on the columns
##            y - vector of class labels (1s and 2s)
##            r - row of the matrix returned by getClassifiers2d
##            col1, col2 - optional parameters indicating the colors of the
##                         classes
##            ... - any additional plotting parameters
##
plotClassifier2d = function(x, y, r, col1="blue", col2="red", ...) {
  plot(x[r[1], y==1], x[r[2], y==1],
       xlim=c(min(x[r[1],]), max(x[r[1],])),
       ylim=c(min(x[r[2],]), max(x[r[2],])),
       col=col1, ...)
  points(x[r[1], y==2], x[r[2], y==2], col=col2, ...)
  # Decision boundary b0 + b1*u + b2*v = 0, i.e. v = -b0/b2 - (b1/b2)*u.
  abline(-r[3]/r[5], -r[4]/r[5])
}

## Function: plotClassifiers2d
##
## Notes: writes one jpeg per row of M (a matrix returned by getClassifiers2d)
## to files named paste0(root, i, ".jpg"), each annotated with the training
## error and labeled by the two feature names.
##
## Arguments: x - matrix of data, features on the rows, samples on the columns
##            y - vector of class labels (1s and 2s)
##            M - matrix returned by getClassifiers2d
##            root - path prefix for the output jpeg files
##            col1, col2 - optional colors for the two classes
##            ... - any additional plotting parameters
##
plotClassifiers2d = function(x, y, M, root, col1="blue", col2="red", ...) {
  for (i in seq_len(nrow(M))) {
    jpeg(file=paste0(root, i, ".jpg"))
    # The row name is "name1, name2"; split it back into axis labels.
    labs = strsplit(rownames(M)[i], ", ")[[1]]
    plot(x[M[i,1], y==1], x[M[i,2], y==1],
         xlim=c(min(x[M[i,1],]), max(x[M[i,1],])),
         ylim=c(min(x[M[i,2],]), max(x[M[i,2],])),
         col=col1,
         main=paste0("training error=", M[i,6],
                     " (", round(M[i,6]/length(y)*100, digits=2), "%)"),
         xlab=labs[1], ylab=labs[2], ...)
    points(x[M[i,1], y==2], x[M[i,2], y==2], col=col2, ...)
    abline(-M[i,3]/M[i,5], -M[i,4]/M[i,5])
    dev.off()
  }
}