Category:Confidence Intervals, p-values, t-tests

# This is the complete Project 4 written in R. This code generates confidence
# intervals and p-values, and performs t-tests comparing various algorithms.
# In the last part, the same cross-validation folds are used to compare the
# accuracy of C5.0 vs. neuralnet.

library(C50)
library(neuralnet)
library(nnet)
library(splitstackshape) # provides stratified(); was missing in the original

# Proportion of correct predictions, computed from the truth/prediction
# contingency table (from Michael Hahsler's Intro to Data Mining book).
accuracy <- function(truth, prediction) {
  tbl <- table(truth, prediction)
  sum(diag(tbl)) / sum(tbl)
}

diabetes <- read.csv("~/Google Drive/PhD/CS539/Project4/diabetes.csv",
                     na.strings = "?")
# C5.0 and class.ind() need a factor outcome; read.csv() no longer converts
# strings to factors by default (R >= 4.0).
diabetes$class <- as.factor(diabetes$class)

# Two independent stratified samples (~65.1% of rows, preserving the class
# proportions), one per model, each split 75% train / 25% test.
sample1 <- stratified(diabetes, "class", 0.6510417)
sample2 <- stratified(diabetes, "class", 0.6510417)

trainSize <- floor(nrow(sample1) * 0.75)
index <- sample(nrow(sample1), trainSize, replace = FALSE)
trainData1 <- sample1[index, ]
testData1 <- sample1[-index, ]

trainSize <- floor(nrow(sample2) * 0.75)
index <- sample(nrow(sample2), trainSize, replace = FALSE)
trainData2 <- sample2[index, ]
testData2 <- sample2[-index, ]

ZN <- 1.96    # z quantile for a two-sided 95% normal interval
TNK <- 1.8331 # t quantile for 95% with k - 1 = 9 degrees of freedom

# Quinlan's C5.0 configuration (see the C50 package documentation).
c50ctrl <- C5.0Control(winnow = FALSE, noGlobalPruning = FALSE, CF = 0.25,
                       minCases = 10, fuzzyThreshold = FALSE, sample = 0,
                       seed = sample.int(4096, size = 1) - 1L,
                       earlyStopping = TRUE, label = "outcome")

# --- C5.0: single train/test split, error and 95% CI ----
model1 <- C5.0(class ~ ., data = trainData1, control = c50ctrl)
prediction1 <- predict(model1, testData1, type = "class")
confusionMatrix1 <- table(testData1$class, prediction1)
cat("Confusion Matrix: \n")
print(confusionMatrix1) # bare name does not auto-print in a sourced script

c50acc <- accuracy(testData1$class, prediction1)
cat("Accuracy for C5.0", c50acc, "\n")
c50error <- 1 - c50acc
cat("Error for C5.0: 1 - accuracy: ", c50error, "\n")
# Binomial standard error of the test-set error estimate.
c50SD <- sqrt((c50error * (1 - c50error)) / nrow(testData1))
cat("SD for C5.0: sqrt((c50error*(1-c50error))/nrow(testData1): ", c50SD, "\n")
lbound <- c50error - ZN * c50SD
ubound <- c50error + ZN * c50SD
cat("Confidence Interval [c50error - ZN*c50SD, c50error + ZN*c50SD] -> [",
    lbound, ",", ubound, "]\n")

# --- Neural net: one-hot encode the class, scale the 8 predictors ----
trainData2forNetwork <- cbind(trainData2[, ], class.ind(trainData2$class))
trainData2forNetwork <- trainData2forNetwork[, -9] # drop original class column
trainData2forNetwork[, c(1:8)] <- scale(trainData2forNetwork[, c(1:8)])
testData2forNetwork <- scale(testData2[, c(1:8)])

networkHnodes <- c(1) # hidden node configuration
lrate <- 0.01         # learning rate
formula1 <- "tested_negative + tested_positive ~ preg + plas + pres + skin + insu + mass + pedi + age"
model2 <- neuralnet(formula1, trainData2forNetwork, hidden = networkHnodes,
                    algorithm = "rprop+", linear.output = FALSE,
                    learningrate = lrate)
prediction2 <- compute(model2, testData2forNetwork) # run test set through net

# Map the two output activations back to class labels (argmax over outputs).
finalResults <- vector(mode = "character")
for (j in seq_len(nrow(prediction2$net.result))) {
  if (which.max(prediction2$net.result[j, ]) == 1) {
    finalResults[j] <- 'tested_negative'
  }
  if (which.max(prediction2$net.result[j, ]) == 2) {
    finalResults[j] <- 'tested_positive'
  }
}

confusionMatrix2 <- table(testData2$class, finalResults)
cat("Confusion Matrix For Neural Net: \n")
print(confusionMatrix2)

nnetacc <- accuracy(testData2$class, finalResults)
cat("Accuracy for Neural Net", nnetacc, "\n") # fixed "Nueral" typo
nneterror <- 1 - nnetacc
cat("Error for Neural Net: 1 - accuracy: ", nneterror, "\n")
nnetSD <- sqrt((nneterror * (1 - nneterror)) / nrow(testData2))
cat("SD for Neural Net: sqrt((nneterror*(1-nneterror))/nrow(testData2)): ",
    nnetSD, "\n")
lbound <- nneterror - ZN * nnetSD
ubound <- nneterror + ZN * nnetSD
cat("Confidence Interval [nneterror - ZN*nnetSD, nneterror + ZN*nnetSD] -> [",
    lbound, ",", ubound, "]\n")

# --- CI for the difference in error between the two models ----
d <- nneterror - c50error
cat("d = nneterror - c50error: ", d)
combinedSD <- sqrt((c50error * (1 - c50error)) / nrow(testData1) +
                     (nneterror * (1 - nneterror)) / nrow(testData2))
cat("combinedSD sqrt((c50error*(1-c50error))/nrow(testData1) + (nneterror*(1-nneterror))/nrow(testData2))",
    combinedSD, "\n")
lbound <- d - ZN * combinedSD
ubound <- d + ZN * combinedSD
cat("Confidence Interval [d - ZN*combinedSD, d + ZN*combinedSD] -> [",
    lbound, ",", ubound, "]\n")

# --- k-fold cross-validation: both models share the SAME folds ----
kfolds <- 10
index <- sample(1:nrow(diabetes)) # shuffled row indices
fold <- rep(1:kfolds, each = nrow(diabetes) / kfolds)[1:nrow(diabetes)]
folds <- split(index, fold)       # list: test-row indices for each fold

# Scaled, one-hot-encoded copy of the full data set for the neural net.
trainNetwork <- cbind(diabetes[, ], class.ind(diabetes$class))
trainNetwork <- trainNetwork[, -9]
trainNetwork[, c(1:8)] <- scale(trainNetwork[, c(1:8)])

accs_c50 <- vector(mode = "numeric")
accs_nnet <- vector(mode = "numeric")
sigmaSum <- 0
for (i in seq_along(folds)) {
  # cat("Calculating Fold: ", i, "\n")
  # NOTE: the original had `foldsi` here (wiki markup stripped the `[[ ]]`);
  # `folds[[i]]` is the test-row index vector for fold i.
  tree <- C5.0(class ~ ., data = diabetes[-folds[[i]], ], control = c50ctrl)
  prediction3 <- predict(tree, diabetes[folds[[i]], ], type = "class")
  actualdata <- diabetes[folds[[i]], ]$class
  c50acc <- accuracy(actualdata, prediction3)
  accs_c50[i] <- c50acc

  nnetwork <- neuralnet(formula1, trainNetwork[-folds[[i]], ],
                        hidden = networkHnodes, algorithm = "rprop+",
                        linear.output = FALSE, learningrate = lrate)
  prediction4 <- compute(nnetwork, trainNetwork[folds[[i]], c(1:8)])

  finalResults <- vector(mode = "character")
  for (j in seq_len(nrow(prediction4$net.result))) {
    if (which.max(prediction4$net.result[j, ]) == 1) {
      finalResults[j] <- 'tested_negative'
    }
    if (which.max(prediction4$net.result[j, ]) == 2) {
      finalResults[j] <- 'tested_positive'
    }
  }
  nnetacc <- accuracy(actualdata, finalResults)
  accs_nnet[i] <- nnetacc

  # Accumulate per-fold confusion matrices across all folds.
  if (i == 1) {
    confusionMatrixC50 <- table(actualdata, prediction3)
    confusionMatrixNNet <- table(actualdata, finalResults)
  } else {
    confusionMatrixC50 <- confusionMatrixC50 + table(actualdata, prediction3)
    confusionMatrixNNet <- confusionMatrixNNet + table(actualdata, finalResults)
  }

  hdiff <- nnetacc - c50acc
  sigmaSum <- sigmaSum + hdiff
  cat("Fold: ", i, " errorT", i, "(NNET) - errorT", i, "(C50) = sigma", i,
      " => ", nnetacc, " - ", c50acc, " = ", hdiff, "\n")
}

sigma <- sigmaSum / kfolds # mean per-fold accuracy difference
cat("sigma bar = sigmaSum / kfolds: ", sigma)

# Sample variance of the per-fold differences around sigma-bar.
sigmaForSd <- 0
for (i in seq_along(accs_nnet)) {
  cat("DIF", i, " -> ", accs_nnet[i] - accs_c50[i], "\n")
  # BUG FIX: the original used `nnetacc - c50acc` (last fold's values) for
  # every iteration; use this fold's stored accuracies instead.
  hdiff <- accs_nnet[i] - accs_c50[i]
  sigmaForSd <- sigmaForSd + (hdiff - sigma)^2
}
sigmaForSd <- sigmaForSd / (kfolds * (kfolds - 1))
sigmaSD <- sqrt(sigmaForSd)
lbound <- sigma - TNK * sigmaSD
ubound <- sigma + TNK * sigmaSD
cat("CI [sigma - TNK*sigmaSD, sigma + TNK*sigmaSD] -> [",
    lbound, ",", ubound, "]\n")

# Paired t-test over the shared folds (print so it shows when sourced).
print(t.test(accs_nnet, accs_c50, paired = TRUE))
 * 1) Use a function for accuracy; got this from Michael Hahsler's Intro to Data Mining book
 * 1) load diabetes data
 * 1) Get 2 stratified Samples
 * 1) Split each sample in a train and test set
 * 1) randomly select 75% of data for training and 25% for testing
 * 1) Configure Quinlan's C5.0 algorithm; got this from the C50 package information and Michael Hahsler's Intro to Data Mining book
 * 1) Prepare Data for Neural Network
 * 1) Accuracy of Neuralnet model
 * 1) Folds Need To Be Shared For C5.0 and neuralnet
 * 1) Prepare Data for Neural Network X-Validation
 * 1) Generate Both Models
 * 1) cat("Generating Model\n")
 * 1) test each fold