From Problem Set 2.
Let’s set our random seed:
set.seed(42)Let’s write a function that takes in a number of iterations and returns a data frame with all of the relevant information for our Monte Carlo simulation.
library(tidyverse)
# Monte Carlo estimate of pi from `n` uniform points in the square [-1, 1]^2.
#
# A point lands inside the unit circle with probability pi / 4 (circle area
# over square area), so 4 * (fraction of points inside) estimates pi.
#
# Args:
#   n: number of points to draw; a single non-negative number.
#
# Returns a tibble with one row per iteration and the columns
#   x, y        - coordinates of the sampled point
#   r           - squared distance of the point from the origin (x^2 + y^2)
#   incirc      - 1 if the point lies inside the unit circle, 0 otherwise
#   perc_inside - running fraction of points inside the circle so far
#   pi_est      - running estimate of pi (4 * perc_inside)
#   err         - signed error of the running estimate (pi - pi_est)
#   abs_err     - absolute value of err
mc_pi = function(n) {
  stopifnot(is.numeric(n), length(n) == 1, !is.na(n), n >= 0)
  # All x values are drawn before any y value, so results for a given seed
  # are the same as before.
  tibble(x = runif(n) * 2 - 1, y = runif(n) * 2 - 1) %>%
    mutate(
      r = x^2 + y^2,
      incirc = as.numeric(r <= 1),      # reuse r instead of recomputing it
      perc_inside = cummean(incirc),
      pi_est = perc_inside * 4,
      err = pi - pi_est,
      abs_err = abs(err)
    )
}
Test this out:
test = mc_pi(10^6)
tail(test$pi_est)## [1] 3.140900 3.140901 3.140901 3.140902 3.140903 3.140904Graph our error:
# Plot the absolute error of every 1000th iteration on a log-scaled y-axis.
# The iteration number is recorded before thinning so that the x-axis shows
# the true iteration (1, 1001, 2001, ...) rather than the position of the row
# within the thinned data (1, 2, 3, ...).
test %>%
  mutate(iteration = row_number()) %>%
  slice(seq(1, nrow(test), by = 1000)) %>%
  ggplot() +
  geom_point(aes(x = iteration, y = abs_err), size = 0.1) +
  scale_y_log10() +
  xlab("Iteration") +
  ylab("Log error")
Load our data:
hr = read_csv("HR_comma_sep.csv")Label factors:
# Convert the count-like columns to ordered factors and the categorical
# columns to plain factors. `work_accident` is built from `Work_accident`;
# being a new column, it is appended after the existing ones.
hr = mutate(
  hr,
  number_project = ordered(number_project),
  time_spend_company = ordered(time_spend_company),
  work_accident = factor(Work_accident),
  left = factor(left),
  sales = factor(sales),
  salary = factor(salary)
)
Drop the extra column with inconsistent naming:
hr = hr %>% select(-Work_accident)Let’s shuffle the dataframe:
sh_hr = slice(hr, sample(nrow(hr), replace = FALSE))
head(hr[1:3])## # A tibble: 6 x 3
##   satisfaction_level last_evaluation number_project
##                <dbl>           <dbl>          <ord>
## 1               0.38            0.53              2
## 2               0.80            0.86              5
## 3               0.11            0.88              7
## 4               0.72            0.87              5
## 5               0.37            0.52              2
## 6               0.41            0.50              2head(sh_hr[1:3])## # A tibble: 6 x 3
##   satisfaction_level last_evaluation number_project
##                <dbl>           <dbl>          <ord>
## 1               0.36            0.57              2
## 2               0.09            0.79              6
## 3               0.65            0.96              2
## 4               0.56            0.79              4
## 5               0.99            0.73              3
## 6               0.78            0.89              4Split the dataset:
hr_train = slice(sh_hr,1:10000)
hr_test = slice(sh_hr, seq(10001, nrow(sh_hr)))Examine some summary statistics:
summary(hr_train)##  satisfaction_level last_evaluation  number_project average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   2:1547         Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   3:2708         1st Qu.:156.0       
##  Median :0.6500     Median :0.7200   4:2911         Median :201.0       
##  Mean   :0.6147     Mean   :0.7174   5:1859         Mean   :201.7       
##  3rd Qu.:0.8200     3rd Qu.:0.8700   6: 806         3rd Qu.:246.0       
##  Max.   :1.0000     Max.   :1.0000   7: 169         Max.   :310.0       
##                                                                         
##  time_spend_company left     promotion_last_5years         sales     
##  3      :4304       0:7627   Min.   :0.0000        sales      :2740  
##  2      :2116       1:2373   1st Qu.:0.0000        technical  :1856  
##  4      :1725                Median :0.0000        support    :1489  
##  5      : 983                Mean   :0.0222        IT         : 829  
##  6      : 478                3rd Qu.:0.0000        marketing  : 588  
##  10     : 147                Max.   :1.0000        product_mng: 577  
##  (Other): 247                                      (Other)    :1921  
##     salary     work_accident
##  high  : 824   0:8574       
##  low   :4845   1:1426       
##  medium:4331                
##                             
##                             
##                             
##
library(GGally)
# Pairwise plot of every column in the training set: display it, then save it.
prs = ggpairs(hr_train)
prs
ggsave("pairs.pdf", prs)
# Satisfaction level of those who stayed (left = 0) versus those who left
# (left = 1).
ggplot(hr_train, aes(x = left, y = satisfaction_level)) + geom_boxplot()
Okay, no big surprise here, most of the people who left had low satisfaction levels.
Were they over or underworked?
# Make number_project numeric so it can go on the y-axis of the boxplot below.
# NOTE(review): number_project was turned into an ordered factor earlier
# (levels 2-7 in the summary above), so as.integer() returns the level codes
# 1-6, not the project counts 2-7. That is why the tree output further down
# splits on values such as `number_project < 1.5`. hr_test is not converted
# until just before predict(), so the `number_project > 3` feature built below
# appears to mean "more than 4 projects" on hr_train but "more than 3" on
# hr_test - confirm. If the real counts are wanted, convert BOTH data sets
# with as.integer(as.character(...)).
hr_train$number_project = as.integer(hr_train$number_project)
ggplot(hr_train, aes(x=left, y=number_project)) + geom_boxplot() How were the evaluations?
ggplot(hr_train, aes(x=left, y=last_evaluation)) +geom_boxplot() Okay let’s see if we can find those who are leaving. What percentage have left?
mean(hr_train$left==1)## [1] 0.2373Let’s construct some new features.
hr_train = hr_train %>% mutate(unhappy = satisfaction_level < 0.5, overworked = number_project > 3, underappreciated = last_evaluation < 0.6)
hr_test = hr_test %>% mutate(unhappy = satisfaction_level < 0.5, overworked = number_project > 3, underappreciated = last_evaluation < 0.6)Make a hypothesis:
hr_train = hr_train %>% mutate(prob_quit = unhappy | (overworked & underappreciated))
hr_test = hr_test %>% mutate(prob_quit = unhappy | (overworked & underappreciated))How did we do on the training set?
sum(as.integer(hr_train$prob_quit) == hr_train$left)
## [1] 7663
So that is a 76.63% correct prediction rate. Note that this is not good, because simply predicting that nobody leaves does almost as well:
sum(0 == hr_train$left)## [1] 7627Let’s try again:
hr_train = hr_train %>% mutate(prob_quit = unhappy & (overworked))
hr_test = hr_test %>% mutate(prob_quit = unhappy & (overworked))
sum(as.integer(hr_train$prob_quit) == hr_train$left)## [1] 7784Slightly better, but that is nothing to write home about.
Let’s try something better.
library(rpart)
# Drop the hand-built indicator columns so that the tree only sees the
# original variables.
hr_train = hr_train %>%
  select(-unhappy, -prob_quit, -overworked, -underappreciated)
# Fit a classification tree predicting `left` from all remaining columns.
tree.fit = rpart(
  left ~ .,
  data = hr_train,
  control = rpart.control(maxdepth = 30)
)
summary(tree.fit)## Call:
## rpart(formula = left ~ ., data = hr_train, control = rpart.control(maxdepth = 30))
##   n= 10000 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.24652339      0 1.0000000 1.0000000 0.017927842
## 2 0.18289086      1 0.7534766 0.7534766 0.016147693
## 3 0.07796039      3 0.3876949 0.3876949 0.012179770
## 4 0.04846186      5 0.2317741 0.2330383 0.009631896
## 5 0.03034134      6 0.1833123 0.1879477 0.008698859
## 6 0.01938475      7 0.1529709 0.1592920 0.008036757
## 7 0.01222082      8 0.1335862 0.1369574 0.007472560
## 8 0.01000000      9 0.1213654 0.1272651 0.007211852
## 
## Variable importance
##   satisfaction_level      last_evaluation       number_project 
##                   35                   17                   17 
## average_montly_hours   time_spend_company        work_accident 
##                   16                   14                    1 
## 
## Node number 1: 10000 observations,    complexity param=0.2465234
##   predicted class=0  expected loss=0.2373  P(node) =1
##     class counts:  7627  2373
##    probabilities: 0.763 0.237 
##   left son=2 (7233 obs) right son=3 (2767 obs)
##   Primary splits:
##       satisfaction_level   < 0.465 to the right, improve=1038.4460, (0 missing)
##       number_project       < 1.5   to the right, improve= 626.2516, (0 missing)
##       average_montly_hours < 274.5 to the left,  improve= 273.1569, (0 missing)
##       time_spend_company   splits as  LRRRRRRR,  improve= 262.7210, (0 missing)
##       last_evaluation      < 0.575 to the right, improve= 132.8969, (0 missing)
##   Surrogate splits:
##       number_project       < 1.5   to the right, agree=0.794, adj=0.256, (0 split)
##       average_montly_hours < 275.5 to the left,  agree=0.757, adj=0.121, (0 split)
##       last_evaluation      < 0.485 to the right, agree=0.741, adj=0.065, (0 split)
## 
## Node number 2: 7233 observations,    complexity param=0.07796039
##   predicted class=0  expected loss=0.09636389  P(node) =0.7233
##     class counts:  6536   697
##    probabilities: 0.904 0.096 
##   left son=4 (5895 obs) right son=5 (1338 obs)
##   Primary splits:
##       time_spend_company   splits as  LLLRRRRR,  improve=429.75020, (0 missing)
##       last_evaluation      < 0.825 to the left,  improve=156.97700, (0 missing)
##       average_montly_hours < 216.5 to the left,  improve=113.10290, (0 missing)
##       number_project       < 3.5   to the left,  improve= 72.52983, (0 missing)
##       satisfaction_level   < 0.715 to the left,  improve= 58.86811, (0 missing)
##   Surrogate splits:
##       last_evaluation      < 0.995 to the left,  agree=0.821, adj=0.033, (0 split)
##       average_montly_hours < 298   to the left,  agree=0.815, adj=0.001, (0 split)
## 
## Node number 3: 2767 observations,    complexity param=0.1828909
##   predicted class=1  expected loss=0.3942898  P(node) =0.2767
##     class counts:  1091  1676
##    probabilities: 0.394 0.606 
##   left son=6 (1639 obs) right son=7 (1128 obs)
##   Primary splits:
##       number_project       < 1.5   to the right, improve=276.19080, (0 missing)
##       satisfaction_level   < 0.115 to the right, improve=243.81630, (0 missing)
##       time_spend_company   splits as  RRRLLLLL,  improve=241.16720, (0 missing)
##       average_montly_hours < 161.5 to the right, improve= 97.60469, (0 missing)
##       last_evaluation      < 0.445 to the left,  improve= 96.86592, (0 missing)
##   Surrogate splits:
##       satisfaction_level   < 0.355 to the left,  agree=0.881, adj=0.707, (0 split)
##       average_montly_hours < 161.5 to the right, agree=0.859, adj=0.653, (0 split)
##       last_evaluation      < 0.575 to the right, agree=0.853, adj=0.640, (0 split)
##       time_spend_company   splits as  RRLLLLLL,  agree=0.841, adj=0.609, (0 split)
## 
## Node number 4: 5895 observations
##   predicted class=0  expected loss=0.01424936  P(node) =0.5895
##     class counts:  5811    84
##    probabilities: 0.986 0.014 
## 
## Node number 5: 1338 observations,    complexity param=0.07796039
##   predicted class=0  expected loss=0.4581465  P(node) =0.1338
##     class counts:   725   613
##    probabilities: 0.542 0.458 
##   left son=10 (520 obs) right son=11 (818 obs)
##   Primary splits:
##       last_evaluation      < 0.815 to the left,  improve=302.3806, (0 missing)
##       average_montly_hours < 215.5 to the left,  improve=247.9913, (0 missing)
##       time_spend_company   splits as  RRRRRLLL,  improve=183.1469, (0 missing)
##       satisfaction_level   < 0.715 to the left,  improve=162.6257, (0 missing)
##       number_project       < 2.5   to the left,  improve=138.3302, (0 missing)
##   Surrogate splits:
##       average_montly_hours < 215.5 to the left,  agree=0.749, adj=0.354, (0 split)
##       number_project       < 2.5   to the left,  agree=0.714, adj=0.263, (0 split)
##       satisfaction_level   < 0.705 to the left,  agree=0.705, adj=0.240, (0 split)
##       time_spend_company   splits as  RRRRRLLL,  agree=0.685, adj=0.190, (0 split)
##       work_accident        splits as  RL,        agree=0.653, adj=0.108, (0 split)
## 
## Node number 6: 1639 observations,    complexity param=0.1828909
##   predicted class=0  expected loss=0.4203783  P(node) =0.1639
##     class counts:   950   689
##    probabilities: 0.580 0.420 
##   left son=12 (1032 obs) right son=13 (607 obs)
##   Primary splits:
##       satisfaction_level   < 0.115 to the right, improve=647.7497, (0 missing)
##       average_montly_hours < 242.5 to the left,  improve=373.9581, (0 missing)
##       number_project       < 4.5   to the left,  improve=363.2091, (0 missing)
##       last_evaluation      < 0.765 to the left,  improve=270.7619, (0 missing)
##       time_spend_company   splits as  LLRRRRRR,  improve=108.7181, (0 missing)
##   Surrogate splits:
##       average_montly_hours < 242.5 to the left,  agree=0.855, adj=0.610, (0 split)
##       number_project       < 4.5   to the left,  agree=0.845, adj=0.582, (0 split)
##       last_evaluation      < 0.765 to the left,  agree=0.785, adj=0.420, (0 split)
## 
## Node number 7: 1128 observations,    complexity param=0.03034134
##   predicted class=1  expected loss=0.125  P(node) =0.1128
##     class counts:   141   987
##    probabilities: 0.125 0.875 
##   left son=14 (84 obs) right son=15 (1044 obs)
##   Primary splits:
##       last_evaluation      < 0.575 to the right, improve=117.210600, (0 missing)
##       average_montly_hours < 162   to the right, improve=113.763400, (0 missing)
##       satisfaction_level   < 0.355 to the left,  improve=103.339700, (0 missing)
##       time_spend_company   splits as  RRLLLLLL,  improve= 55.849020, (0 missing)
##       work_accident        splits as  RL,        improve=  7.816372, (0 missing)
##   Surrogate splits:
##       average_montly_hours < 162   to the right, agree=0.947, adj=0.286, (0 split)
##       satisfaction_level   < 0.355 to the left,  agree=0.938, adj=0.167, (0 split)
##       time_spend_company   splits as  RRLLLLLL,  agree=0.937, adj=0.155, (0 split)
## 
## Node number 10: 520 observations
##   predicted class=0  expected loss=0.03653846  P(node) =0.052
##     class counts:   501    19
##    probabilities: 0.963 0.037 
## 
## Node number 11: 818 observations,    complexity param=0.04846186
##   predicted class=1  expected loss=0.2738386  P(node) =0.0818
##     class counts:   224   594
##    probabilities: 0.274 0.726 
##   left son=22 (115 obs) right son=23 (703 obs)
##   Primary splits:
##       time_spend_company   splits as  RRRRRLLL,  improve=141.12110, (0 missing)
##       average_montly_hours < 216.5 to the left,  improve=138.14360, (0 missing)
##       satisfaction_level   < 0.715 to the left,  improve=111.17040, (0 missing)
##       number_project       < 2.5   to the left,  improve= 76.89708, (0 missing)
##       salary               splits as  LRL,       improve= 23.39814, (0 missing)
##   Surrogate splits:
##       satisfaction_level    < 0.595 to the left,  agree=0.879, adj=0.139, (0 split)
##       promotion_last_5years < 0.5   to the right, agree=0.867, adj=0.052, (0 split)
##       average_montly_hours  < 208.5 to the left,  agree=0.862, adj=0.017, (0 split)
## 
## Node number 12: 1032 observations
##   predicted class=0  expected loss=0.07945736  P(node) =0.1032
##     class counts:   950    82
##    probabilities: 0.921 0.079 
## 
## Node number 13: 607 observations
##   predicted class=1  expected loss=0  P(node) =0.0607
##     class counts:     0   607
##    probabilities: 0.000 1.000 
## 
## Node number 14: 84 observations
##   predicted class=0  expected loss=0.07142857  P(node) =0.0084
##     class counts:    78     6
##    probabilities: 0.929 0.071 
## 
## Node number 15: 1044 observations,    complexity param=0.01222082
##   predicted class=1  expected loss=0.06034483  P(node) =0.1044
##     class counts:    63   981
##    probabilities: 0.060 0.940 
##   left son=30 (29 obs) right son=31 (1015 obs)
##   Primary splits:
##       last_evaluation      < 0.445 to the left,  improve=52.674380, (0 missing)
##       average_montly_hours < 162   to the right, improve=40.163420, (0 missing)
##       satisfaction_level   < 0.355 to the left,  improve=37.352090, (0 missing)
##       time_spend_company   splits as  LRRRRRRR,  improve=23.246210, (0 missing)
##       work_accident        splits as  RL,        improve= 3.201507, (0 missing)
##   Surrogate splits:
##       average_montly_hours < 115.5 to the left,  agree=0.975, adj=0.103, (0 split)
##       time_spend_company   splits as  RRRRRLLL,  agree=0.973, adj=0.034, (0 split)
## 
## Node number 22: 115 observations
##   predicted class=0  expected loss=0  P(node) =0.0115
##     class counts:   115     0
##    probabilities: 1.000 0.000 
## 
## Node number 23: 703 observations,    complexity param=0.01938475
##   predicted class=1  expected loss=0.1550498  P(node) =0.0703
##     class counts:   109   594
##    probabilities: 0.155 0.845 
##   left son=46 (70 obs) right son=47 (633 obs)
##   Primary splits:
##       average_montly_hours < 215.5 to the left,  improve=70.531440, (0 missing)
##       satisfaction_level   < 0.715 to the left,  improve=50.970130, (0 missing)
##       number_project       < 2.5   to the left,  improve=34.779260, (0 missing)
##       salary               splits as  LRR,       improve= 9.670679, (0 missing)
##       time_spend_company   splits as  RRRRLLLL,  improve= 9.265164, (0 missing)
##   Surrogate splits:
##       satisfaction_level < 0.925 to the right, agree=0.912, adj=0.114, (0 split)
##       number_project     < 1.5   to the left,  agree=0.905, adj=0.043, (0 split)
## 
## Node number 30: 29 observations
##   predicted class=0  expected loss=0  P(node) =0.0029
##     class counts:    29     0
##    probabilities: 1.000 0.000 
## 
## Node number 31: 1015 observations
##   predicted class=1  expected loss=0.03349754  P(node) =0.1015
##     class counts:    34   981
##    probabilities: 0.033 0.967 
## 
## Node number 46: 70 observations
##   predicted class=0  expected loss=0.1714286  P(node) =0.007
##     class counts:    58    12
##    probabilities: 0.829 0.171 
## 
## Node number 47: 633 observations
##   predicted class=1  expected loss=0.08056872  P(node) =0.0633
##     class counts:    51   582
##    probabilities: 0.081 0.919This is hard to read.
library(rpart.plot)
rpart.plot(tree.fit)
Okay, so what is our accuracy on the test set?
hr_test$number_project = as.integer(hr_test$number_project)
preds = predict(tree.fit, hr_test, type = "class")
mean(hr_test$left == preds)## [1] 0.9711942That’s more like it!
Can we do even better?
library(xgboost)
# Build the xgboost inputs. Both matrices use exactly the predictor columns of
# hr_train: hr_test still carries the hand-built columns (unhappy, overworked,
# underappreciated, prob_quit) that were dropped from hr_train, and including
# them would give the test matrix features the model was never trained on.
feature_cols = setdiff(names(hr_train), "left")
# data.matrix() turns factors into their integer codes and leaves numeric
# columns as they are, giving the all-numeric matrix xgboost requires.
# Labels: `left` is a factor with levels "0"/"1", so its codes 1/2 become 0/1.
matdata = xgb.DMatrix(
  data = data.matrix(hr_train[feature_cols]),
  label = as.numeric(hr_train$left) - 1
)
mattest = xgb.DMatrix(
  data = data.matrix(hr_test[feature_cols]),
  label = as.numeric(hr_test$left) - 1
)
# Evaluate on both sets after every boosting round.
watchlist = list(train = matdata, test = mattest)
# Train boosted trees for 200 rounds, reporting the "error" metric on the
# train and test matrices after every round (the log that follows).
# NOTE(review): no `objective` is given, so xgboost presumably falls back to
# its default regression objective - consistent with the predictions printed
# further down lying outside [0, 1]; `objective = "binary:logistic"` is
# probably what was intended. The `label` argument also looks redundant, since
# matdata already carries the labels - confirm against the xgb.train docs.
btree.fit = xgb.train(data = matdata, label = hr_train$left, max.depth = 30, eval.metric="error", nrounds = 200, watchlist = watchlist)## [1]  train-error:0.007400    test-error:0.022004 
## [2]  train-error:0.006900    test-error:0.021404 
## [3]  train-error:0.004600    test-error:0.018604 
## [4]  train-error:0.004100    test-error:0.018604 
## [5]  train-error:0.002700    test-error:0.017604 
## [6]  train-error:0.002100    test-error:0.017203 
## [7]  train-error:0.000900    test-error:0.016803 
## [8]  train-error:0.000400    test-error:0.017203 
## [9]  train-error:0.000000    test-error:0.016803 
## [10] train-error:0.000000    test-error:0.017003 
## [11] train-error:0.000000    test-error:0.016803 
## [12] train-error:0.000000    test-error:0.016803 
## [13] train-error:0.000000    test-error:0.017003 
## [14] train-error:0.000000    test-error:0.017003 
## [15] train-error:0.000000    test-error:0.017003 
## [16] train-error:0.000000    test-error:0.017003 
## [17] train-error:0.000000    test-error:0.017003 
## [18] train-error:0.000000    test-error:0.017003 
## [19] train-error:0.000000    test-error:0.017003 
## [20] train-error:0.000000    test-error:0.017003 
## [21] train-error:0.000000    test-error:0.017003 
## [22] train-error:0.000000    test-error:0.017003 
## [23] train-error:0.000000    test-error:0.017003 
## [24] train-error:0.000000    test-error:0.017003 
## [25] train-error:0.000000    test-error:0.017003 
## [26] train-error:0.000000    test-error:0.017003 
## [27] train-error:0.000000    test-error:0.017003 
## [28] train-error:0.000000    test-error:0.017003 
## [29] train-error:0.000000    test-error:0.017003 
## [30] train-error:0.000000    test-error:0.017003 
## [31] train-error:0.000000    test-error:0.017003 
## [32] train-error:0.000000    test-error:0.017003 
## [33] train-error:0.000000    test-error:0.017003 
## [34] train-error:0.000000    test-error:0.017003 
## [35] train-error:0.000000    test-error:0.017003 
## [36] train-error:0.000000    test-error:0.017003 
## [37] train-error:0.000000    test-error:0.017003 
## [38] train-error:0.000000    test-error:0.017003 
## [39] train-error:0.000000    test-error:0.017003 
## [40] train-error:0.000000    test-error:0.017003 
## [41] train-error:0.000000    test-error:0.017003 
## [42] train-error:0.000000    test-error:0.017003 
## [43] train-error:0.000000    test-error:0.017003 
## [44] train-error:0.000000    test-error:0.017003 
## [45] train-error:0.000000    test-error:0.017003 
## [46] train-error:0.000000    test-error:0.017003 
## [47] train-error:0.000000    test-error:0.017003 
## [48] train-error:0.000000    test-error:0.017003 
## [49] train-error:0.000000    test-error:0.017003 
## [50] train-error:0.000000    test-error:0.017003 
## [51] train-error:0.000000    test-error:0.017003 
## [52] train-error:0.000000    test-error:0.017003 
## [53] train-error:0.000000    test-error:0.017003 
## [54] train-error:0.000000    test-error:0.017003 
## [55] train-error:0.000000    test-error:0.017003 
## [56] train-error:0.000000    test-error:0.017003 
## [57] train-error:0.000000    test-error:0.017003 
## [58] train-error:0.000000    test-error:0.017003 
## [59] train-error:0.000000    test-error:0.017003 
## [60] train-error:0.000000    test-error:0.017003 
## [61] train-error:0.000000    test-error:0.017003 
## [62] train-error:0.000000    test-error:0.017003 
## [63] train-error:0.000000    test-error:0.017003 
## [64] train-error:0.000000    test-error:0.017003 
## [65] train-error:0.000000    test-error:0.017003 
## [66] train-error:0.000000    test-error:0.017003 
## [67] train-error:0.000000    test-error:0.017003 
## [68] train-error:0.000000    test-error:0.017003 
## [69] train-error:0.000000    test-error:0.017003 
## [70] train-error:0.000000    test-error:0.017003 
## [71] train-error:0.000000    test-error:0.017003 
## [72] train-error:0.000000    test-error:0.017003 
## [73] train-error:0.000000    test-error:0.017003 
## [74] train-error:0.000000    test-error:0.017003 
## [75] train-error:0.000000    test-error:0.017003 
## [76] train-error:0.000000    test-error:0.017003 
## [77] train-error:0.000000    test-error:0.017003 
## [78] train-error:0.000000    test-error:0.017003 
## [79] train-error:0.000000    test-error:0.017003 
## [80] train-error:0.000000    test-error:0.017003 
## [81] train-error:0.000000    test-error:0.017003 
## [82] train-error:0.000000    test-error:0.017003 
## [83] train-error:0.000000    test-error:0.017003 
## [84] train-error:0.000000    test-error:0.017003 
## [85] train-error:0.000000    test-error:0.017003 
## [86] train-error:0.000000    test-error:0.017003 
## [87] train-error:0.000000    test-error:0.017003 
## [88] train-error:0.000000    test-error:0.017003 
## [89] train-error:0.000000    test-error:0.017003 
## [90] train-error:0.000000    test-error:0.017003 
## [91] train-error:0.000000    test-error:0.017003 
## [92] train-error:0.000000    test-error:0.017003 
## [93] train-error:0.000000    test-error:0.017003 
## [94] train-error:0.000000    test-error:0.017003 
## [95] train-error:0.000000    test-error:0.017003 
## [96] train-error:0.000000    test-error:0.017003 
## [97] train-error:0.000000    test-error:0.017003 
## [98] train-error:0.000000    test-error:0.017003 
## [99] train-error:0.000000    test-error:0.017003 
## [100]    train-error:0.000000    test-error:0.017003 
## [101]    train-error:0.000000    test-error:0.017003 
## [102]    train-error:0.000000    test-error:0.017003 
## [103]    train-error:0.000000    test-error:0.017003 
## [104]    train-error:0.000000    test-error:0.017003 
## [105]    train-error:0.000000    test-error:0.017003 
## [106]    train-error:0.000000    test-error:0.017003 
## [107]    train-error:0.000000    test-error:0.017003 
## [108]    train-error:0.000000    test-error:0.017003 
## [109]    train-error:0.000000    test-error:0.017003 
## [110]    train-error:0.000000    test-error:0.017003 
## [111]    train-error:0.000000    test-error:0.017003 
## [112]    train-error:0.000000    test-error:0.017003 
## [113]    train-error:0.000000    test-error:0.017003 
## [114]    train-error:0.000000    test-error:0.017003 
## [115]    train-error:0.000000    test-error:0.017003 
## [116]    train-error:0.000000    test-error:0.017003 
## [117]    train-error:0.000000    test-error:0.017003 
## [118]    train-error:0.000000    test-error:0.017003 
## [119]    train-error:0.000000    test-error:0.017003 
## [120]    train-error:0.000000    test-error:0.017003 
## [121]    train-error:0.000000    test-error:0.017003 
## [122]    train-error:0.000000    test-error:0.017003 
## [123]    train-error:0.000000    test-error:0.017003 
## [124]    train-error:0.000000    test-error:0.017003 
## [125]    train-error:0.000000    test-error:0.017003 
## [126]    train-error:0.000000    test-error:0.017003 
## [127]    train-error:0.000000    test-error:0.017003 
## [128]    train-error:0.000000    test-error:0.017003 
## [129]    train-error:0.000000    test-error:0.017003 
## [130]    train-error:0.000000    test-error:0.017003 
## [131]    train-error:0.000000    test-error:0.017003 
## [132]    train-error:0.000000    test-error:0.017003 
## [133]    train-error:0.000000    test-error:0.017003 
## [134]    train-error:0.000000    test-error:0.017003 
## [135]    train-error:0.000000    test-error:0.017003 
## [136]    train-error:0.000000    test-error:0.017003 
## [137]    train-error:0.000000    test-error:0.017003 
## [138]    train-error:0.000000    test-error:0.017003 
## [139]    train-error:0.000000    test-error:0.017003 
## [140]    train-error:0.000000    test-error:0.017003 
## [141]    train-error:0.000000    test-error:0.017003 
## [142]    train-error:0.000000    test-error:0.017003 
## [143]    train-error:0.000000    test-error:0.017003 
## [144]    train-error:0.000000    test-error:0.017003 
## [145]    train-error:0.000000    test-error:0.017003 
## [146]    train-error:0.000000    test-error:0.017003 
## [147]    train-error:0.000000    test-error:0.017003 
## [148]    train-error:0.000000    test-error:0.017003 
## [149]    train-error:0.000000    test-error:0.017003 
## [150]    train-error:0.000000    test-error:0.017003 
## [151]    train-error:0.000000    test-error:0.017003 
## [152]    train-error:0.000000    test-error:0.017003 
## [153]    train-error:0.000000    test-error:0.017003 
## [154]    train-error:0.000000    test-error:0.017003 
## [155]    train-error:0.000000    test-error:0.017003 
## [156]    train-error:0.000000    test-error:0.017003 
## [157]    train-error:0.000000    test-error:0.017003 
## [158]    train-error:0.000000    test-error:0.017003 
## [159]    train-error:0.000000    test-error:0.017003 
## [160]    train-error:0.000000    test-error:0.017003 
## [161]    train-error:0.000000    test-error:0.017003 
## [162]    train-error:0.000000    test-error:0.017003 
## [163]    train-error:0.000000    test-error:0.017003 
## [164]    train-error:0.000000    test-error:0.017003 
## [165]    train-error:0.000000    test-error:0.017003 
## [166]    train-error:0.000000    test-error:0.017003 
## [167]    train-error:0.000000    test-error:0.017003 
## [168]    train-error:0.000000    test-error:0.017003 
## [169]    train-error:0.000000    test-error:0.017003 
## [170]    train-error:0.000000    test-error:0.017003 
## [171]    train-error:0.000000    test-error:0.017003 
## [172]    train-error:0.000000    test-error:0.017003 
## [173]    train-error:0.000000    test-error:0.017003 
## [174]    train-error:0.000000    test-error:0.017003 
## [175]    train-error:0.000000    test-error:0.017003 
## [176]    train-error:0.000000    test-error:0.017003 
## [177]    train-error:0.000000    test-error:0.017003 
## [178]    train-error:0.000000    test-error:0.017003 
## [179]    train-error:0.000000    test-error:0.017003 
## [180]    train-error:0.000000    test-error:0.017003 
## [181]    train-error:0.000000    test-error:0.017003 
## [182]    train-error:0.000000    test-error:0.017003 
## [183]    train-error:0.000000    test-error:0.017003 
## [184]    train-error:0.000000    test-error:0.017003 
## [185]    train-error:0.000000    test-error:0.017003 
## [186]    train-error:0.000000    test-error:0.017003 
## [187]    train-error:0.000000    test-error:0.017003 
## [188]    train-error:0.000000    test-error:0.017003 
## [189]    train-error:0.000000    test-error:0.017003 
## [190]    train-error:0.000000    test-error:0.017003 
## [191]    train-error:0.000000    test-error:0.017003 
## [192]    train-error:0.000000    test-error:0.017003 
## [193]    train-error:0.000000    test-error:0.017003 
## [194]    train-error:0.000000    test-error:0.017003 
## [195]    train-error:0.000000    test-error:0.017003 
## [196]    train-error:0.000000    test-error:0.017003 
## [197]    train-error:0.000000    test-error:0.017003 
## [198]    train-error:0.000000    test-error:0.017003 
## [199]    train-error:0.000000    test-error:0.017003 
## [200]    train-error:0.000000    test-error:0.017003bpreds = predict(btree.fit, mattest)
head(bpreds)## [1] -8.689881e-03 -9.687281e-04 -1.731492e-04 -1.818596e-06  9.990979e-01
## [6] -2.331969e-03
mean(as.integer(hr_test$left)-1 == (bpreds >0.5))
## [1] 0.9829966
That brings our accuracy up to 98.30%. This is something to write home about.