#####################################
# Labwork on decision trees - PRISM #
#####################################

# Variables and screen cleaning.
# NOTE(review): rm(list=ls()) and options(warn=-1) are kept for course
# convention, but globally silencing warnings can hide real problems.
graphics.off(); cat("\014"); rm(list=ls()); options(warn=-1)

# EXERCISE 1 ------------------------------------------------------------------

### 1. Download the drug dataset and load it into a data frame.
temp <- tempfile(fileext = ".csv")
download.file("https://drive.google.com/uc?authuser=0&id=1U0TsgUGVpmEi7axFjDzStyPGrFlyiyUr", temp)
df <- read.csv(temp)

### 2. Compare a decision tree against a majority-class baseline over
###    repeated random train/test splits (180 train / 20 test).
library(rpart); library(rpart.plot)
nrun <- 100
n_train <- 180  # first n_train shuffled rows train, the rest test
# Preallocate the result matrix instead of growing it with rbind() in the loop.
res <- matrix(NA_real_, nrow = nrun, ncol = 2,
              dimnames = list(NULL, c("dummy", "tree")))
for (irun in seq_len(nrun)) {
  # Shuffle, then split.
  df <- df[sample(nrow(df)), ]
  train <- df[seq_len(n_train), ]
  test <- df[(n_train + 1):nrow(df), ]
  # Learning:
  tree <- rpart(Drug ~ ., train)
  # Dummy baseline: always predict the training set's majority class.
  preds_dummy <- rep(names(which.max(table(train$Drug))), nrow(test))
  preds_tree <- predict(tree, test, type = "class")
  # Evaluation: accuracy of each classifier on the held-out rows.
  res[irun, ] <- c(mean(preds_dummy == test$Drug),
                   mean(preds_tree == test$Drug))
}
boxplot(res)
colMeans(res)

### 3. Tree trained on the full dataset, for visualisation.
tree_global <- rpart(Drug ~ ., df)
prp(tree_global, extra = 2)

### 4. Recode the ordinal categorical variables as numbers so the tree can
###    use ordered splits (LOW < NORMAL < HIGH).
df$BP[df$BP == "LOW"] <- 1
df$BP[df$BP == "NORMAL"] <- 2
df$BP[df$BP == "HIGH"] <- 3
df$BP <- as.numeric(df$BP)
df$Cholesterol[df$Cholesterol == "NORMAL"] <- 1
df$Cholesterol[df$Cholesterol == "HIGH"] <- 2
df$Cholesterol <- as.numeric(df$Cholesterol)

### 5.
# Repeat the comparison of 2., this time with a larger 195/5 train/test split.
nrun <- 100
res <- c()
for (irun in 1:nrun){
  # Shuffle, then split: 195 rows for training, 5 for testing.
  df <- df[sample(nrow(df)), ]
  train <- df[1:195, ]
  test <- df[196:200, ]
  tree <- rpart(Drug ~ ., train)
  # Dummy baseline: always predict the training set's majority class.
  preds_dummy <- rep(names(table(train$Drug))[which.max(table(train$Drug))], nrow(test))
  preds_tree <- predict(tree, test, type="class")
  res <- rbind(res, c( mean(preds_dummy == test$Drug), mean(preds_tree == test$Drug) ))
}
colnames(res) <- c("dummy", "tree")
boxplot(res); colMeans(res)
# Tree trained on the whole (recoded) dataset, for visualisation.
tree_global <- rpart(Drug ~ ., df)
prp(tree_global, extra=2)

# EXERCISE 2
# (a) Load the "gobelins" monster dataset.
dataset <- read.csv("https://raw.githubusercontent.com/lgi2p/decision_tree_labwork/master/gobelins.csv")
# (b) Drop the id column: it is not a variable, it is an identifier -> no generalisable information.
dataset <- subset(dataset, select = -id)
# (c) Fit a classification tree predicting the monster type from everything else.
library(rpart)
tree <- rpart(type ~ ., dataset)
# (d) Two ways of plotting the same tree.
library(rpart.plot)
prp(tree, extra=1, varlen=0, faclen=0)
rpart.plot(tree)
# (e) Classify a new, hand-made observation.
# plot -> a goblin.
new_x <- data.frame(bone_length=0.3, rotting_flesh=0.5, hair_length=0.6, has_soul=0.75, color="green")
predict(tree, new_x, type="class")
# plot -> confidence = 18/(1+3+18)=82%
predict(tree, new_x, type="prob")
# (f) Variable importances, normalised to sum to 1.
importances <- tree$variable.importance/sum(tree$variable.importance)
round(importances, 2)
barplot(importances)
# Rebuild as a data frame so ggplot can control the bar order via factor levels.
importances_df <- data.frame(importance=importances, feature=names(importances))
row.names(importances_df) <- NULL
importances_df$feature <- factor(importances_df$feature, levels=c("hair_length", "has_soul", "rotting_flesh", "bone_length", "color"))
library(ggplot2)
# Minimal bar chart, then a customised version of the same plot.
ggplot(data=importances_df, aes(x=feature, y=importance)) + geom_bar(stat="identity")
ggplot(data=importances_df, aes(x=feature, y=importance)) +
  geom_bar(stat="identity", fill="steelblue") +
  labs(title="Variable importance \nfor monster type prediction") +
  theme_minimal(15) +
  theme(axis.text.x=element_text(size=15, angle=45, hjust=1),
        axis.text.y=element_text(size=20),
        axis.title.x=element_text(size=20, color="red"),
        axis.title.y=element_text(size=30, color="red"),
        plot.title=element_text(color="black", size=20, face="bold", hjust=0.5, vjust=0))
# (g) Tree vs majority-class baseline over repeated random splits.
dataset$color <- factor(dataset$color)
nRun <- 20
baseline_accs <- c()
tree_accs <- c()
for (iRun in 1 : nRun){
  cat(iRun, " ")
  # Division dataset -> trainSet + validationSet:
  # NOTE(review): only 21 rows (251:271) are used for training and 250 for
  # testing — confirm this tiny training set is intentional.
  dataset <- dataset[sample(nrow(dataset)), ]
  test_set <- dataset[1:250, ]
  train_set <- dataset[251:271, ]
  # Learning:
  tree <- rpart(type ~ ., train_set)
  # Prediction:
  baseline_preds <- rep(names(which.max(table(train_set$type))), nrow(test_set))
  tree_preds <- predict(tree, test_set, type="class")
  # Evaluation:
  baseline_acc <- mean(baseline_preds == test_set$type)
  tree_acc <- mean(tree_preds == test_set$type)
  baseline_accs <- c(baseline_accs, baseline_acc)
  tree_accs <- c(tree_accs, tree_acc)
}
results <- data.frame(baseline=baseline_accs, tree=tree_accs)
colMeans(results)
ggplot(stack(results), aes(x=ind, y=values)) + geom_boxplot() + labs(y="accuracy")+ labs(x="")
# (h) Train/test accuracy as a function of maxdepth, averaged over nRun
# random 260/11 splits per depth.
library("reshape2")
maxdepths <- 1:20
nRun <- 500
train_accs <- c()
test_accs <- c()
for (maxdepth in maxdepths){
  cat(maxdepth, ' ')
  train_accs_1maxdepth <- c()
  test_accs_1maxdepth <- c()
  for (iRun in 1 : nRun){
    # Division dataset -> trainSet + validationSet:
    dataset <- dataset[sample(nrow(dataset)), ]
    train_set <- dataset[1:260, ]
    test_set <- dataset[261:271, ]
    # Learning (cp=0 and minsplit=2 so that only maxdepth limits tree growth):
    tree <- rpart(type ~ ., train_set, maxdepth=maxdepth, cp=0, minsplit=2)
    # Prediction:
    train_preds <- predict(tree, train_set, type="class")
    test_preds <- predict(tree, test_set, type="class")
    # Evaluation:
    train_acc <- mean(train_preds == train_set$type)
    test_acc <- mean(test_preds == test_set$type)
    train_accs_1maxdepth <- c(train_accs_1maxdepth, train_acc)
    test_accs_1maxdepth <- c(test_accs_1maxdepth, test_acc)
  }
  train_accs <- c(train_accs, mean(train_accs_1maxdepth))
  test_accs <- c(test_accs, mean(test_accs_1maxdepth))
}
results <- data.frame(maxdepth=maxdepths, train=train_accs, test=test_accs)
results_long <- melt(results, id="maxdepth") # convert to long format
# Plot train vs test accuracy as a function of maxdepth (random splits).
ggplot(results_long, aes(x=maxdepth, y=value, colour=variable)) + geom_line() + theme_bw(15)
# -> optimal depth = 5 (large trees have higher computation times)

# Same maxdepth study, but with leave-one-out cross-validation (LOO-CV):
# each of the nrow(dataset) observations is the test set exactly once.
maxdepths <- 1:20
train_accs <- c()
test_accs <- c()
for (maxdepth in maxdepths){
  cat(maxdepth, ' ')
  train_accs_1maxdepth <- c()
  test_accs_1maxdepth <- c()
  for (i in 1 : nrow(dataset)){
    train_set <- dataset[-i, ]
    test_set <- dataset[i, ]
    # Learning (cp=0 and minsplit=2 so that only maxdepth limits tree growth):
    tree <- rpart(type ~ ., train_set, maxdepth=maxdepth, cp=0, minsplit=2)
    # Prediction:
    train_preds <- predict(tree, train_set, type="class")
    test_preds <- predict(tree, test_set, type="class")
    # Evaluation:
    train_acc <- mean(train_preds == train_set$type)
    test_acc <- mean(test_preds == test_set$type)
    train_accs_1maxdepth <- c(train_accs_1maxdepth, train_acc)
    test_accs_1maxdepth <- c(test_accs_1maxdepth, test_acc)
  }
  train_accs <- c(train_accs, mean(train_accs_1maxdepth))
  test_accs <- c(test_accs, mean(test_accs_1maxdepth))
}
results <- data.frame(maxdepth=maxdepths, train=train_accs, test=test_accs)
results_long <- melt(results, id="maxdepth") # convert to long format
ggplot(results_long, aes(x=maxdepth, y=value, colour=variable)) + geom_line() + theme_bw(15)
# about 10% overfitting (accuracy gap) for cp=0 and minsplit=2

# minsplit study (maxdepth fixed at 5), leave-one-out cross-validation.
minsplits <- 2:10
train_accs <- c()
test_accs <- c()
for (minsplit in minsplits){
  cat(minsplit, ' ')
  train_accs_1minsplit <- c()
  test_accs_1minsplit <- c()
  for (i in 1 : nrow(dataset)){
    train_set <- dataset[-i, ]
    test_set <- dataset[i, ]
    # Learning:
    tree <- rpart(type ~ ., train_set, maxdepth=5, cp=0, minsplit=minsplit)
    # Prediction:
    train_preds <- predict(tree, train_set, type="class")
    test_preds <- predict(tree, test_set, type="class")
    # Evaluation:
    train_acc <- mean(train_preds == train_set$type)
    test_acc <- mean(test_preds == test_set$type)
    train_accs_1minsplit <- c(train_accs_1minsplit, train_acc)
    test_accs_1minsplit <- c(test_accs_1minsplit, test_acc)
  }
  train_accs <- c(train_accs, mean(train_accs_1minsplit))
  test_accs <- c(test_accs, mean(test_accs_1minsplit))
}
results <- data.frame(minsplit=minsplits, train=train_accs, test=test_accs)
results_long <- melt(results, id="minsplit")
ggplot(results_long, aes(x=minsplit, y=value, colour=variable)) + geom_line() + theme_bw(15)
# optimal minsplit=4 (for maxdepth=5); about 7% overfitting (accuracy gap)
# for cp=0 and maxdepth=5

# cp study (maxdepth=5, minsplit=4 fixed), leave-one-out cross-validation.
maxdepth <- 5
minsplit <- 4
cps <- seq(from=0.05, to=0, by=-0.001)
train_accs <- c()
test_accs <- c()
for (cp in cps){
  cat(cp, ' ')
  train_accs_1cp <- c()
  test_accs_1cp <- c()
  for (i in 1 : nrow(dataset)){
    train_set <- dataset[-i, ]
    test_set <- dataset[i, ]
    # Learning:
    tree <- rpart(type ~ ., train_set, maxdepth=maxdepth, cp=cp, minsplit=minsplit)
    # Prediction:
    train_preds <- predict(tree, train_set, type="class")
    test_preds <- predict(tree, test_set, type="class")
    # Evaluation:
    train_acc <- mean(train_preds == train_set$type)
    test_acc <- mean(test_preds == test_set$type)
    train_accs_1cp <- c(train_accs_1cp, train_acc)
    test_accs_1cp <- c(test_accs_1cp, test_acc)
  }
  train_accs <- c(train_accs, mean(train_accs_1cp))
  test_accs <- c(test_accs, mean(test_accs_1cp))
}
results <- data.frame(cp=cps, train=train_accs, test=test_accs)
results_long <- melt(results, id="cp")
ggplot(results_long, aes(x=cp, y=value, colour=variable)) + geom_line() + theme_bw(15)
# optimal cp=0 (for maxdepth=5 and minsplit=4)

# (i) Single pruned tree vs random forests of increasing size (LOO-CV).
library(randomForest)
dataset$type <- as.factor(dataset$type)
accsTree <- c()
accsForest10 <- c()
accsForest50 <- c()
accsForest100 <- c()
accsForest200 <- c()
accsForest2000 <- c()
accsForest5000 <- c()
for (i in 1 : nrow(dataset)){
  cat(i, " ")
  testSet <- dataset[i, ]
  trainSet <- dataset[-i, ]
  # Learning:
  tree <- rpart(type ~ ., trainSet, maxdepth = 5)
  forest10 <- randomForest(type ~ ., trainSet, ntree = 10)
  forest50 <- randomForest(type ~ ., trainSet, ntree = 50)
  forest100 <- randomForest(type ~ ., trainSet, ntree = 100)
  forest200 <- randomForest(type ~ ., trainSet, ntree = 200)
  forest2000 <- randomForest(type ~ ., trainSet, ntree = 2000)
  forest5000 <- randomForest(type ~ ., trainSet, ntree = 5000)
  # Prediction:
  predTree <- predict(tree, testSet, type = "class")
  predForest10 <- predict(forest10, testSet)
  predForest50 <- predict(forest50, testSet)
  predForest100 <- predict(forest100, testSet)
  predForest200 <- predict(forest200, testSet)
  predForest2000 <- predict(forest2000, testSet)
  predForest5000 <- predict(forest5000, testSet)
  # Evaluation (one TRUE/FALSE per left-out observation):
  accsTree <- c(accsTree, predTree == testSet$type)
  accsForest10 <- c(accsForest10, predForest10 == testSet$type)
  accsForest50 <- c(accsForest50, predForest50 == testSet$type)
  accsForest100 <- c(accsForest100, predForest100 == testSet$type)
  accsForest200 <- c(accsForest200, predForest200 == testSet$type)
  accsForest2000 <- c(accsForest2000, predForest2000 == testSet$type)
  accsForest5000 <- c(accsForest5000, predForest5000 == testSet$type)
}
results <- data.frame(tree=accsTree, forest10=accsForest10, forest50=accsForest50,
                      forest100=accsForest100, forest200=accsForest200,
                      forest2000=accsForest2000, forest5000=accsForest5000)
colMeans(results)
boxplot(results)
ggplot(utils::stack(results), aes(x = ind, y = values)) +
  geom_boxplot() +
  labs(y = "accuracy") +
  labs(x = "") +
  ggtitle("Leave one-out cross validation")
# -> 200 trees seems the optimal forest size

# EXERCISE 3
# (a) The weather dataset ships with the rattle package.
library(rattle)
dataset <- weather
# (b) Drop Date (identifier, no generalisable information) and RISK_MM
# (the amount of rain tomorrow, i.e. it would leak the target).
# FIX: index by names(dataset) rather than names(weather) for consistency
# (identical here since dataset is still an untouched copy of weather).
dataset <- dataset[, !(names(dataset) %in% c("Date", "RISK_MM"))]
#dataset <- subset(dataset, select=-Date)
# Recode the target as a 1/0 factor with 1 ("rain") as the first level.
dataset$RainTomorrow <- as.character(dataset$RainTomorrow)
dataset$RainTomorrow[dataset$RainTomorrow == "Yes"] <- 1
dataset$RainTomorrow[dataset$RainTomorrow == "No"] <- 0
dataset$RainTomorrow <- factor(dataset$RainTomorrow, levels = c(1, 0))
tree <- rpart(RainTomorrow ~ ., dataset)
rpart.plot(tree)
# Training accuracy (optimistic: evaluated on the data the tree was fit on).
mean(predict(tree, dataset, type = "class") == dataset$RainTomorrow)
# (c) Repeated stratified k-fold cross-validation: accuracy, recall, precision.
# FIX: createFolds/confusionMatrix/recall/precision come from the caret
# package, which was never loaded.
library(caret)
nRun <- 20
nFold <- 10
accs <- c()
recs <- c()
pres <- c()
for (iRun in 1 : nRun){
  cat("\nrun", iRun, ": ")
  folds <- createFolds(dataset$RainTomorrow, nFold, list = TRUE, returnTrain = FALSE)
  acc1run <- c()
  rec1run <- c()
  pre1run <- c()
  for (fold in names(folds)){
    cat(fold, "")
    # Division dataset -> trainSet + validationSet:
    testSet <- dataset[folds[[fold]], ]
    trainSet <- dataset[setdiff(1 : nrow(dataset), folds[[fold]]), ]
    # Learning:
    tree <- rpart(RainTomorrow ~ ., trainSet)
    # Prediction:
    preds <- predict(tree, testSet, type = "class")
    # Evaluation:
    confMat <- confusionMatrix(data = preds, reference = testSet$RainTomorrow)
    acc <- confMat$overall[1]
    rec <- recall(confMat$table)
    pre <- precision(confMat$table)
    acc1run <- c(acc1run, acc)
    rec1run <- c(rec1run, rec)
    pre1run <- c(pre1run, pre)
  }
  accs <- c(accs, mean(acc1run))
  recs <- c(recs, mean(rec1run))
  pres <- c(pres, mean(pre1run))
}
results <- data.frame(accuracy = accs, recall = recs, precision = pres)
ggplot(stack(results), aes(x = ind, y = values)) + geom_boxplot() + labs(y = "accuracy")+ labs(x = "")
# Class distribution: high accuracy partly reflects the class imbalance.
round(table(dataset$RainTomorrow)/sum(table(dataset$RainTomorrow)), 2)
# (d) Regression tree: predict the 3pm cloud cover from 9am measurements.
tree2 <- rpart(Cloud3pm ~ Temp9am + Humidity9am + Cloud9am + WindSpeed9am + Pressure9am, dataset)
prp(tree2)
# (e) Repeated k-fold CV of the regression tree vs a mean-value baseline.
# NOTE: the metric computed below is the root-mean-squared error (RMSE),
# despite the "mse" variable names.
nRun <- 20
nFold <- 10
mses <- c()
msesRef <- c()
for (iRun in 1 : nRun){
  cat("\nrun", iRun, ": ")
  folds <- createFolds(dataset$Cloud3pm, nFold, list = TRUE, returnTrain = FALSE)
  mse1run <- c()
  mse1runRef <- c()
  for (fold in names(folds)){
    cat(fold, "")
    # Division dataset -> trainSet + validationSet:
    testSet <- dataset[folds[[fold]], ]
    trainSet <- dataset[setdiff(1 : nrow(dataset), folds[[fold]]), ]
    # Learning:
    tree <- rpart(Cloud3pm ~ Temp9am + Humidity9am + Cloud9am + WindSpeed9am + Pressure9am, trainSet)
    # Prediction (baseline: always predict the training-set mean):
    preds <- predict(tree, testSet)
    predsRef <- rep(mean(trainSet$Cloud3pm), nrow(testSet))
    # Evaluation:
    mse <- sqrt(sum((preds - testSet$Cloud3pm)^2)/nrow(testSet))
    mseRef <- sqrt(sum((predsRef - testSet$Cloud3pm)^2)/nrow(testSet))
    mse1run <- c(mse1run, mse)
    # FIX: was c(mse1runRef, mse1run), which appended the tree's whole error
    # vector to the baseline accumulator and corrupted the reference results.
    mse1runRef <- c(mse1runRef, mseRef)
  }
  mses <- c(mses, mean(mse1run))
  msesRef <- c(msesRef, mean(mse1runRef))
}
results <- data.frame(tree = mses, ref = msesRef)
# FIX: y label said "MSE" but the plotted metric is the RMSE.
ggplot(stack(results), aes(x = ind, y = values)) + geom_boxplot() + labs(y = "RMSE")+ labs(x = "")
# (f) Grid-search the rpart hyperparameters with 5-fold CV (e1071::tune.rpart).
library(e1071)
dataset2 <- dataset[, c("Cloud3pm", "Temp9am", "Humidity9am", "Cloud9am", "WindSpeed9am", "Pressure9am")]
obj <- tune.rpart(Cloud3pm ~ ., data = dataset2,
                  minsplit = seq(from = 2, to = 50, by = 2),
                  minbucket = seq(from = 1, to = 50, by = 2),
                  cp = seq(from = 0, to = 0.05, by = 0.001),
                  maxdepth = 1 : 10,
                  tunecontrol = tune.control(sampling = "cross", cross = 5))
obj$best.parameters
# -> minsplit = 2, minbucket = 49, cp = 0.017, maxdepth = 2
# Re-evaluate the tuned tree with repeated k-fold CV.
nRun <- 200
nFold <- 10
mses <- c()
msesRef <- c()
for (iRun in 1 : nRun){
  cat("\nrun", iRun, ": ")
  folds <- createFolds(dataset$Cloud3pm, nFold, list = TRUE, returnTrain = FALSE)
  mse1run <- c()
  mse1runRef <- c()
  for (fold in names(folds)){
    cat(fold, "")
    # Division dataset -> trainSet + validationSet:
    testSet <- dataset[folds[[fold]], ]
    trainSet <- dataset[setdiff(1 : nrow(dataset), folds[[fold]]), ]
    # Learning, with the tuned hyperparameters:
    tree <- rpart(Cloud3pm ~ Temp9am + Humidity9am + Cloud9am + WindSpeed9am + Pressure9am, trainSet,
                  minsplit = obj$best.parameters$minsplit,
                  minbucket = obj$best.parameters$minbucket,
                  cp = obj$best.parameters$cp,
                  maxdepth = obj$best.parameters$maxdepth)
    # Prediction:
    preds <- predict(tree, testSet)
    predsRef <- rep(mean(trainSet$Cloud3pm), nrow(testSet))
    # Evaluation (RMSE):
    mse <- sqrt(sum((preds - testSet$Cloud3pm)^2)/nrow(testSet))
    mseRef <- sqrt(sum((predsRef - testSet$Cloud3pm)^2)/nrow(testSet))
    mse1run <- c(mse1run, mse)
    # FIX: was c(mse1runRef, mse1run) — same accumulator bug as in (e).
    mse1runRef <- c(mse1runRef, mseRef)
  }
  mses <- c(mses, mean(mse1run))
  msesRef <- c(msesRef, mean(mse1runRef))
}
results <- data.frame(tree = mses, ref = msesRef)
# FIX: y label said "MSE" but the plotted metric is the RMSE.
ggplot(stack(results), aes(x = ind, y = values)) + geom_boxplot() + labs(y = "RMSE")+ labs(x = "")

# EXERCISE 4
library(rpart); library(rpart.plot)
# (a) Premier League results. The commented lines show the original local-file
# loading; the dataset is now downloaded instead.
# path <- "/Users/nsc/Google Drive/Enseignement/Enseignement classique/2020 - 2021/PRISM/Artificial intelligence/3. Decision trees/labwork"
# setwd(path)
# dataset <- read.csv2("premier_league.csv")
temp <- tempfile(fileext = ".csv")
download.file("https://drive.google.com/uc?authuser=0&id=1Of8H78ar1cYKrDIofYbC-SOhYokyLb_-", temp)
dataset <- read.csv(temp, sep=";")
dim(dataset)
head(dataset)
# (b) Predict the full-time result from the half-time result only.
tree <- rpart(Full_Time_Result ~ Half_Time_Result, dataset, cp=0.08)
prp(tree, extra=1)
# -> P(Full_Time_Result = A | Half_Time_Result = A) = 64/(64+15+8) = 0.74
# -> P(Full_Time_Result = H | Half_Time_Result = H) = 95/(9+21+95) = 0.76 so roughly the same as for A
# (c) Predict the full-time result from all the variables.
tree <- rpart(Full_Time_Result ~ ., dataset, cp=0.02)
prp(tree, extra=1, varlen=0, faclen=0)
# P(Wtf wins | H = Wtf and A = McC, draw game at half time) = 3/(17+6+3) = 0.115
# P(Ars wins | H = Ars and A = McC, draw game at half time) = 7/(3+24+7) = 0.206