<- c("C50", "gmodels", "tidyverse", "openxlsx",
my_libraries "rattle", "rpart", "rpart.plot")
lapply(my_libraries, library, character.only = TRUE)
Contents
- Train (and graph) dividend payer with rpart(), rpart.plot and C5.0
- Loan default train
- Loan default prediction
- Adding penalty matrix to make false negatives costly
- Trees are random but not too fragile
To write a PQ set for decision trees, I experimented below. First the libraries:
Predicting dividend
GARP’s motivating example is a super simple (n = 20) dataset of public companies that either pay or do not pay a Dividend. The my20firms dataframe (shown below) is slightly altered to achieve a tree that I liked better for purposes of a practice question:
# my20firms <- garp_data
# my20firms$Dividend[1] <- 0
# my20firms$Dividend[9] <- 0
# my20firms$Dividend[12] <- 1
# my20firms$Dividend[13] <- 0
# my20firms$Dividend[15] <- 0
# colnames(my20firms)[colnames(my20firms) == "Retail_investor"] <- "Retail"
# colnames(my20firms)[colnames(my20firms) == "Large_cap"] <- "LargeCap"
# write.xlsx(my20firms, file = "dividendExampleModified_v3.xlsx")
# my20firms <- read.xlsx("dividendExampleModified_v3.xlsx")
# saveRDS(my20firms, file = "my20firms-rds.RDS")
my20firms <- readRDS("my20firms-rds.RDS")
my20firms
Dividend Earnings LargeCap Retail Tech
1 0 0 1 40 1
2 1 1 1 30 0
3 1 1 1 20 0
4 0 0 0 80 1
5 1 0 1 20 0
6 0 1 0 30 1
7 0 1 0 40 0
8 1 0 1 60 0
9 0 1 1 20 1
10 0 1 1 40 0
11 0 0 0 20 1
12 1 0 1 70 0
13 0 1 0 30 1
14 1 0 1 70 0
15 0 0 1 50 1
16 1 0 1 60 1
17 1 1 1 30 0
18 0 1 0 30 1
19 0 0 0 40 0
20 1 1 1 50 0
fit2 <- rpart(Dividend ~ ., data = my20firms,
              parms = list(split = "gini"),
              control = rpart.control(minsplit = 1,
                                      minbucket = 1,
                                      maxdepth = 4))
# summary(fit2) printout is too long
print(fit2)
n= 20
node), split, n, deviance, yval
* denotes terminal node
1) root 20 4.9500000 0.4500000
2) LargeCap< 0.5 7 0.0000000 0.0000000 *
3) LargeCap>=0.5 13 2.7692310 0.6923077
6) Tech>=0.5 4 0.7500000 0.2500000
12) Retail< 55 3 0.0000000 0.0000000 *
13) Retail>=55 1 0.0000000 1.0000000 *
7) Tech< 0.5 9 0.8888889 0.8888889
14) Earnings>=0.5 5 0.8000000 0.8000000
28) Retail>=35 2 0.5000000 0.5000000 *
29) Retail< 35 3 0.0000000 1.0000000 *
15) Earnings< 0.5 4 0.0000000 1.0000000 *
printcp(fit2)
Regression tree:
rpart(formula = Dividend ~ ., data = my20firms, parms = list(split = "gini"),
control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 4))
Variables actually used in tree construction:
[1] Earnings LargeCap Retail Tech
Root node error: 4.95/20 = 0.2475
n= 20
CP nsplit rel error xerror xstd
1 0.440559 0 1.00000 1.11610 0.056479
2 0.228352 1 0.55944 1.12013 0.298518
3 0.151515 2 0.33109 1.04063 0.309021
4 0.039282 3 0.17957 0.81948 0.360160
5 0.010000 5 0.10101 0.80808 0.361385
rpart.plot(fit2, yesno = 2, left=FALSE, type=2, branch.lty = 3, nn= TRUE,
box.palette = "BuGn", leaf.round=0)
# converting the target to factor
my20firms$Dividend <- as_factor(my20firms$Dividend)

fit3 <- rpart(Dividend ~ ., data = my20firms,
              parms = list(split = "gini"),
              control = rpart.control(minsplit = 1,
                                      minbucket = 1,
                                      maxdepth = 4))
print(fit3)
n= 20
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 20 9 0 (0.5500000 0.4500000)
2) LargeCap< 0.5 7 0 0 (1.0000000 0.0000000) *
3) LargeCap>=0.5 13 4 1 (0.3076923 0.6923077)
6) Tech>=0.5 4 1 0 (0.7500000 0.2500000)
12) Retail< 55 3 0 0 (1.0000000 0.0000000) *
13) Retail>=55 1 0 1 (0.0000000 1.0000000) *
7) Tech< 0.5 9 1 1 (0.1111111 0.8888889) *
rpart.plot(fit3, yesno = 2, left=FALSE, type=2, branch.lty = 3, nn= TRUE,
box.palette = "BuGn", leaf.round=0)
I had to refresh my knowledge of decision trees, and for that I depended on the awesome book () that I will review in the future (almost done!). He uses the C5.0 algorithm (per the C50 package), and I just wanted to see its defaults:
tree_c5 <- C5.0(Dividend ~ ., data = my20firms)
plot(tree_c5)
# set MinCases = 1
tree_c5_v2 <- C5.0(Dividend ~ .,
                   control = C5.0Control(minCases = 1),
                   data = my20firms)
plot(tree_c5_v2)
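To inspect the package defaults directly (if I recall correctly, minCases defaults to 2, which is why the second fit relaxes it to 1):

str(C5.0Control())   # print C5.0's default control settings, including minCases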
Loan default examples
Now I will switch datasets and use the same loan default dataset used in the book, but I will train the tree with the more familiar rpart() function. The result is similar but not identical (and please note the difference is not due to sampling variation: my test sample is the same); a C5.0 comparison sketch follows the rpart output below.
set.seed(9829)
train_sample <- sample(1000, 900)

credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

# split the data frames
credit_train <- credit[train_sample, ]
credit_test  <- credit[-train_sample, ]

credit_train$credit_history <- credit_train$credit_history |>
  fct_relevel("critical", "poor", "good", "very good", "perfect")
tree_credit_train <- rpart(default ~ ., data = credit_train,
                           parms = list(split = "gini"),
                           control = rpart.control(minsplit = 1,
                                                   minbucket = 1,
                                                   maxdepth = 4))
rpart.plot(tree_credit_train, yesno = 2, left=FALSE, type=2, branch.lty = 3, nn= TRUE,
box.palette = c("palegreen", "pink"), leaf.round=0, extra = 101, digits = 4)
print(tree_credit_train)
n= 900
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 900 265 no (0.7055556 0.2944444)
2) checking_balance=> 200 DM,unknown 415 55 no (0.8674699 0.1325301) *
3) checking_balance=< 0 DM,1 - 200 DM 485 210 no (0.5670103 0.4329897)
6) credit_history=critical,poor,good 426 167 no (0.6079812 0.3920188)
12) months_loan_duration< 27.5 324 105 no (0.6759259 0.3240741)
24) amount< 9899.5 317 98 no (0.6908517 0.3091483) *
25) amount>=9899.5 7 0 yes (0.0000000 1.0000000) *
13) months_loan_duration>=27.5 102 40 yes (0.3921569 0.6078431)
26) dependents>=1.5 14 4 no (0.7142857 0.2857143) *
27) dependents< 1.5 88 30 yes (0.3409091 0.6590909) *
7) credit_history=very good,perfect 59 16 yes (0.2711864 0.7288136)
14) age< 22.5 3 0 no (1.0000000 0.0000000) *
15) age>=22.5 56 13 yes (0.2321429 0.7678571) *
printcp(tree_credit_train)
Classification tree:
rpart(formula = default ~ ., data = credit_train, parms = list(split = "gini"),
control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 4))
Variables actually used in tree construction:
[1] age amount checking_balance
[4] credit_history dependents months_loan_duration
Root node error: 265/900 = 0.29444
n= 900
CP nsplit rel error xerror xstd
1 0.050943 0 1.00000 1.00000 0.051599
2 0.026415 3 0.81509 0.83396 0.048726
3 0.022642 4 0.78868 0.82642 0.048577
4 0.011321 5 0.76604 0.81887 0.048425
5 0.010000 6 0.75472 0.81887 0.048425
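For reference, here is the C5.0 comparison mentioned above: a minimal sketch that fits the book's algorithm to the same 900-row training set (I am not reproducing the book's exact call):

credit_c5 <- C5.0(default ~ ., data = credit_train)   # C5.0 counterpart for comparison
summary(credit_c5)                                    # similar, but not identical, to the rpart tree printed above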
Default prediction
Because there is a 10% (n = 100) test set, we can test the decision tree. It’s not great. In terms of the mistakes, notice that 28/35 actual defaulters were incorrectly predicted to repay; that’s terrible. Compare this to only 7/65 actual repayers who were predicted to default.
tree_credit_pred <- predict(tree_credit_train, credit_test, type = "class")
CrossTable(credit_test$default, tree_credit_pred,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual default', 'predicted default'))
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 100
| predicted default
actual default | no | yes | Row Total |
---------------|-----------|-----------|-----------|
no | 58 | 7 | 65 |
| 0.580 | 0.070 | |
---------------|-----------|-----------|-----------|
yes | 28 | 7 | 35 |
| 0.280 | 0.070 | |
---------------|-----------|-----------|-----------|
Column Total | 86 | 14 | 100 |
---------------|-----------|-----------|-----------|
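A quick numeric summary of that table (a sketch; conf is just my own name for the base-R crosstab):

conf <- table(actual = credit_test$default, predicted = tree_credit_pred)
sum(diag(conf)) / sum(conf)             # overall accuracy: (58 + 7) / 100 = 0.65
conf["yes", "no"] / sum(conf["yes", ])  # false negative rate: 28 / 35 = 0.80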
Adding a loss (aka, penalty, cost) matrix
It’s really easy to impose a penalty matrix. We will make a false negative three times more costly than a false positive. As desired, the false negatives flip with a huge improvement: the updated model correctly traps 28/35 defaulters, with only 7/35 false negatives. But this comes with an equally huge trade-off: false positives jump from 7/65 to 27/65 borrowers who are predicted to default but actually repay.
penalty_matrix <- matrix(c(0, 3,   # first column (predicted "no"): actual no = 0, actual yes (false negative) = 3
                           1, 0),  # second column (predicted "yes"): actual no (false positive) = 1, actual yes = 0
                         ncol = 2) # matrix() fills column-wise; rpart reads rows as actual, columns as predicted
rownames(penalty_matrix) <- colnames(penalty_matrix) <- c("No", "Yes")
tree_credit_cost_train <- rpart(default ~ ., data = credit_train,
                                parms = list(split = "gini", loss = penalty_matrix),
                                control = rpart.control(minsplit = 1,
                                                        minbucket = 1,
                                                        maxdepth = 4))
tree_credit_cost_pred <- predict(tree_credit_cost_train, credit_test, type = "class")
CrossTable(credit_test$default, tree_credit_cost_pred,
prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
dnn = c('actual default', 'predicted default'))
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 100
| predicted default
actual default | no | yes | Row Total |
---------------|-----------|-----------|-----------|
no | 38 | 27 | 65 |
| 0.380 | 0.270 | |
---------------|-----------|-----------|-----------|
yes | 7 | 28 | 35 |
| 0.070 | 0.280 | |
---------------|-----------|-----------|-----------|
Column Total | 45 | 55 | 100 |
---------------|-----------|-----------|-----------|
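To quantify the flip, a quick before-and-after check of sensitivity on the 35 actual defaulters (a sketch reusing the two prediction vectors created above):

mean(tree_credit_pred[credit_test$default == "yes"] == "yes")       # 7/35 = 0.20 without the penalty
mean(tree_credit_cost_pred[credit_test$default == "yes"] == "yes")  # 28/35 = 0.80 with the penalty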
Can I easily randomize?
I’m interested in the fact that decision trees have random qualities (aside from sampling variation). Below I set a different seed and switched the split criterion from Gini to entropy (information). But the ultimate tree is the same: the splits and node counts are identical, and only the cross-validated errors (xerror, which depend on the randomly assigned CV folds) and the recorded split improvements (which depend on the criterion) differ.
# different seed and switch gini to information; aka, entropy
set.seed(448)
tree_credit_train_2 <- rpart(default ~ ., data = credit_train,
                             parms = list(split = "information"),
                             control = rpart.control(minsplit = 1,
                                                     minbucket = 1,
                                                     maxdepth = 4))
rpart.plot(tree_credit_train_2, yesno = 2, left=FALSE, type=2, branch.lty = 3, nn= TRUE,
box.palette = c("palegreen", "pink"), leaf.round=0, extra = 101, digits = 4)
print(tree_credit_train_2)
n= 900
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 900 265 no (0.7055556 0.2944444)
2) checking_balance=> 200 DM,unknown 415 55 no (0.8674699 0.1325301) *
3) checking_balance=< 0 DM,1 - 200 DM 485 210 no (0.5670103 0.4329897)
6) credit_history=critical,poor,good 426 167 no (0.6079812 0.3920188)
12) months_loan_duration< 27.5 324 105 no (0.6759259 0.3240741)
24) amount< 9899.5 317 98 no (0.6908517 0.3091483) *
25) amount>=9899.5 7 0 yes (0.0000000 1.0000000) *
13) months_loan_duration>=27.5 102 40 yes (0.3921569 0.6078431)
26) dependents>=1.5 14 4 no (0.7142857 0.2857143) *
27) dependents< 1.5 88 30 yes (0.3409091 0.6590909) *
7) credit_history=very good,perfect 59 16 yes (0.2711864 0.7288136)
14) age< 22.5 3 0 no (1.0000000 0.0000000) *
15) age>=22.5 56 13 yes (0.2321429 0.7678571) *
printcp(tree_credit_train_2)
Classification tree:
rpart(formula = default ~ ., data = credit_train, parms = list(split = "information"),
control = rpart.control(minsplit = 1, minbucket = 1, maxdepth = 4))
Variables actually used in tree construction:
[1] age amount checking_balance
[4] credit_history dependents months_loan_duration
Root node error: 265/900 = 0.29444
n= 900
CP nsplit rel error xerror xstd
1 0.050943 0 1.00000 1.00000 0.051599
2 0.026415 3 0.81509 0.87547 0.049518
3 0.022642 4 0.78868 0.86038 0.049236
4 0.011321 5 0.76604 0.86415 0.049307
5 0.010000 6 0.75472 0.86038 0.049236
identical(tree_credit_train, tree_credit_train_2)
[1] FALSE
all.equal(tree_credit_train, tree_credit_train_2)
[1] "Component \"call\": target, current do not match when deparsed"
[2] "Component \"cptable\": Mean relative difference: 0.04736418"
[3] "Component \"parms\": Component \"split\": Mean relative difference: 1"
[4] "Component \"splits\": Attributes: < Component \"dimnames\": Component 1: 4 string mismatches >"
[5] "Component \"splits\": Mean relative difference: 5.545437"
[6] "Component \"csplit\": Attributes: < Component \"dim\": Mean relative difference: 0.04761905 >"
[7] "Component \"csplit\": Numeric: lengths (126, 120) differ"
[8] "Component \"variable.importance\": Names: 2 string mismatches"
[9] "Component \"variable.importance\": Mean relative difference: 0.1894674"