#################################################################################################### path = "C:\\Documents and Settings\\Ryan Tibshirani.PROTEINSERVER\\Desktop\\kd" source(paste(path, "functions.R", sep="\\")) ## read in the data M = read.table(file=paste(path, "data_training.txt", sep="\\")) y = as.numeric(M[1,]) x = as.matrix(M[-1,]) rownames(x) = rownames(M)[-1] ## some useful constants f = nrow(x) n = ncol(x) ## set the seed of the random number generator, so that ## the folds are always the same set.seed(3033) ## try 6 different methods for classifier construction: ## 1. PAM ## 2. 1d classifiers ## 3. 2d classifiers ## 4. support vector machines ## 5. CART ################################################################################################### ## 1. PAM ## (do PAM first so that we can get the cv folds) # install PAM (only need to do this once) utils:::menuInstallLocal() # load PAM local({pkg <- select.list(sort(.packages(all.available = TRUE))) if(nchar(pkg)) library(pkg, character.only=TRUE)}) d = list(x=x, y=y, genenames=rownames(x), geneid=as.character(1:nrow(x))) # train pamtr = pamr.train(d) # cross validate, pamcv = pamr.cv(pamtr, d) folds = pamcv$folds ## save the folds - IMPORTANT! pamr.plotcv(pamcv) # estimate false discovery rates, and plot them pamfdr = pamr.fdr(pamtr, d) pamr.plotfdr(pamfdr) # choose a threshold, and list the corresponding features pamth = 2.162 cpam = pamr.listgenes(pamtr, d, threshold=pamth, genenames=TRUE) ################################################################################################### ## 2. 1d classifiers c1d = getClassifiers1d(x, y, numClassifiers=20) ## plot them root = paste(path, "1d\\classifier", sep="\\") plotClassifiers1d(x, y, c1d, root, pch=19, xlab="class", ylab="peak magnitude") # compute cross validated error e1d = numeric(length=10) for (i in 1:10) { ind = (1:length(y))[-folds[[i]]] cl = getClassifiers1d(x[,ind], y[ind]) bestguys = which(cl[,4]==cl[1,4]) errs = 0 for (k in bestguys) { index = cl[k,1] spl = cl[k,3] pred = numeric(length=length(folds[[i]])) if (cl[k,2]==2) { pred = 1 + (x[index,-ind] > spl) } else { pred = 2 - (x[index,-ind] > spl) } errs = errs + sum(y[-ind] != pred) } e1d[i] = errs/length(bestguys) } cverror1d = sum(e1d)/length(y) # 0.4460993 ################################################################################################### ## 3. 2d classifiers c2d = getClassifiers2d(x, y, numClassifiers=20) ## plot them root = paste(path, "2d\\classifier", sep="\\") plotClassifiers2d(x, y, c2d, root, pch=19) # compute cross validated error e2d = numeric(length=10) for (i in 1:10) { ind = (1:length(y))[-folds[[i]]] cl = getClassifiers2d(x[,ind], y[ind]) bestguys = which(cl[,4]==cl[1,4]) errs = 0 for (k in bestguys) { index1 = cl[k,1] index2 = cl[k,2] b0 = cl[k,3] b1 = cl[k,4] b2 = cl[k,5] pred = 1 + (exp(b0 + b1*x[index1,-ind] + b2*x[index2,-ind]) > 0.5) errs = errs + sum(y[-ind] != pred) } e2d[i] = errs/length(bestguys) } cverror2d = sum(e2d)/length(y) # 0.3617021 ################################################################################################### ## 4. support vector machines # install marginTree (only need to do this once) utils:::menuInstallLocal() # load marginTree local({pkg <- select.list(sort(.packages(all.available = TRUE))) if(nchar(pkg)) library(pkg, character.only=TRUE)}) # install e1071 (only need to do this once) utils:::menuInstallLocal() # load e1071 local({pkg <- select.list(sort(.packages(all.available = TRUE))) if(nchar(pkg)) library(pkg, character.only=TRUE)}) # train X = t(x) svmtr = marginTree(X, y) # cross validate, and plot svmcv = marginTree.cv(X, y, svmtr, folds=folds) marginTree.plotcv(svmcv) # choose a threshold, and list the corresponding features svmth = 0.301 csvm = marginTree.getnonzero(svmtr, threshold=svmth) ################################################################################################### ## 5. CART # install rpart (only need to do this once) utils:::menuInstallLocal() # load rpart local({pkg <- select.list(sort(.packages(all.available = TRUE))) if(nchar(pkg)) library(pkg, character.only=TRUE)}) # train, and plot cont = rpart.control(cp=0, folds=folds) X = t(x) Y = factor(y) tree = rpart(Y ~ X, cont) plot(tree) text(tree, use.n=TRUE) # training error result = predict(tree, data.frame(X)) result = 1 + (result[,2] > 0.5) err = sum(y!=result) / length(y) # 0.08510638 # cross validation error (get from summary(tree)) 25/47 * 0.9090909 = 0.483559 ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ################################################################################################### ## apply these methods to the test set ## read in the data Mtest = read.table(file=paste(path, "data_testing.txt", sep="\\")) ytest = as.numeric(Mtest[1,]) xtest = as.matrix(Mtest[-1,]) rownames(xtest) = rownames(Mtest)[-1] ## 1d classifier ## num = 1 ind = c1d[num,1] s = c1d[num,3] nam = rownames(c1d)[num] pred1d = 1 + (xtest[ind,] <= s) err1d = sum(pred1d!=ytest)/length(ytest) # 0.4090909 plot(rep(1,sum(y==1)), x[ind,y==1], col="blue", pch=20, main=nam, xlab="", ylab="peak magnitude", xlim=c(0,5), ylim=c(min(min(x[ind,]), min(xtest[ind,])), max(max(x[ind,]), max(xtest[ind,])))) points(rep(2, sum(y==2)), x[ind,y==2], col="red", pch=20) points(rep(3, sum(ytest==1)), xtest[ind,ytest==1], col="blue", pch=20) points(rep(4, sum(ytest==2)), xtest[ind,ytest==2], col="red", pch=20) abline(h=s) abline(v=2.5, lty="dotted") #--- num = 2 ind = c1d[num,1] s = c1d[num,3] nam = rownames(c1d)[num] pred1d = 1 + (xtest[ind,] <= s ) err1d = sum(pred1d!=ytest)/length(ytest) # 0.5909091 plot(rep(1,sum(y==1)), x[ind,y==1], col="blue", pch=20, main=nam, xlab="", ylab="peak magnitude", xlim=c(0,5), ylim=c(min(min(x[ind,]), min(xtest[ind,])), max(max(x[ind,]), max(xtest[ind,])))) points(rep(2, sum(y==2)), x[ind,y==2], col="red", pch=20) points(rep(3, sum(ytest==1)), xtest[ind,ytest==1], col="blue", pch=20) points(rep(4, sum(ytest==2)), xtest[ind,ytest==2], col="red", pch=20) abline(h=s ) abline(v=2.5, lty="dotted") ## 2d classifier ## num = 1 index1 = c2d[num,1] index2 = c2d[num,2] b0 = c2d[num,3] b1 = c2d[num,4] b2 = c2d[num,5] xlab=strsplit(rownames(c2d)[num], ", ")[[1]][1] ylab=strsplit(rownames(c2d)[num], ", ")[[1]][2] pred2d = 1 + (exp(b0 + b1*xtest[index1,] + b2*xtest[index2,]) > 0.5) err2d = sum(pred2d!=ytest)/length(ytest) # 0.3181818 pch=19 plot(x[index1,y==1], x[index2,y==1], main="", xlab=xlab, ylab=ylab, xlim=c(min(c(x[index1,],xtest[index1,])), max(c(x[index1,],xtest[index1,]))), ylim=c(min(c(x[index2,],xtest[index2,])), max(c(x[index2,],xtest[index2,]))), pch=pch, col="lightblue") points(x[index1,y==2], x[index2,y==2], col="pink", pch=pch) abline(-b0/b2, -b1/b2) points(xtest[index1,ytest==1], xtest[index2,ytest==1], col="blue", pch=pch) points(xtest[index1,ytest==2], xtest[index2,ytest==2], col="red", pch=pch) #--- num = 2 index1 = c2d[num,1] index2 = c2d[num,2] b0 = c2d[num,3] b1 = c2d[num,4] b2 = c2d[num,5] xlab=strsplit(rownames(c2d)[num], ", ")[[1]][1] ylab=strsplit(rownames(c2d)[num], ", ")[[1]][2] pred2d = 1 + (exp(b0 + b1*xtest[index1,] + b2*xtest[index2,]) > 0.5) err2d = sum(pred2d!=ytest)/length(ytest) # 0.3636364 pch=19 plot(x[index1,y==1], x[index2,y==1], main="", xlab=xlab, ylab=ylab, xlim=c(min(c(x[index1,],xtest[index1,])), max(c(x[index1,],xtest[index1,]))), ylim=c(min(c(x[index2,],xtest[index2,])), max(c(x[index2,],xtest[index2,]))), pch=pch, col="lightblue") points(x[index1,y==2], x[index2,y==2], col="pink", pch=pch) abline(-b0/b2, -b1/b2) points(xtest[index1,ytest==1], xtest[index2,ytest==1], col="blue", pch=pch) points(xtest[index1,ytest==2], xtest[index2,ytest==2], col="red", pch=pch) ## PAM ## predpam = pamr.predict(pamtr, xtest, threshold=pamth, type="class") errpam = sum(predpam!=ytest)/length(ytest) # 0.3636364 ## margin tree ## predsvm = marginTree.predict(svmtr, t(xtest), threshold=svmth) predsvm = 1 + (predsvm=="2") errsvm = sum(predsvm!=ytest)/length(ytest) # 0.3181818 ## CART ## predtree = predict(tree, list(X=t(xtest))) predtree = 1 + (predtree[,2]>0.5) errtree = sum(predtree!=ytest)/length(ytest) # 0.5