Logistic regression

Simulated insurance dataset (via simdata) and visualization of marginal effects (via ggeffects)
code
analysis
Author

David Harper, CFA, FRM

Published

October 29, 2023

Simulate data with simdata package

The delightful simdata package allows us to specify a correlation matrix and transform (via a transformation function) a random multivariate normal sample into a set of desired univariate (but correlated!) distributions.
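
Under the hood this works like a Gaussian copula: each standard normal column is pushed through pnorm() to a uniform, then through the quantile function of the target distribution, which preserves rank correlation (the final Pearson correlations come out close to, but not exactly, the specified values). A minimal standalone sketch of that transform, with the Beta(2, 5) shape parameters mirroring the loyalty transform defined below:

z <- rnorm(10000)                          # one standard normal column
u <- pnorm(z)                              # probability integral transform -> Uniform(0, 1)
loyalty_like <- qbeta(u, 2, 5) * 30        # inverse CDF of the target marginal, scaled to 0-30
cor(z, loyalty_like, method = "spearman")  # rank correlation with z is preserved (= 1)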

library(tidyverse)
library(gt)

library(patchwork)
library(GGally); library(ggcorrplot)
library(ggeffects) # amazing package plots marginal effects

library(simdata)
library(matrixcalc); library(mbend)
library(gmodels) # CrossTable()

library(skimr)

# cor_from_upper() in the simdata package builds a correlation
# matrix from upper-triangle entries specified as c(row, col, rho)
correlation_matrix = cor_from_upper(
    8,
    rbind(c(1,8,-0.20), # loyalty
          c(2,8,-0.16), # bundle
          c(3,8,0.12),  # jump (in price)
          c(4,8,0.15),  # premium
          c(5,8,-0.07), # age
          c(6,8,-0.05), # income
          c(7,8,0))     # mobile
)

# we require a positive definite correlation matrix;
# is.positive.definite(correlation_matrix) is TRUE here, so bend() is just a safety net
if (!is.positive.definite(correlation_matrix)) {
    correlation_matrix <- bend(correlation_matrix)$bent |> round(5)
}
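
Positive definite just means all eigenvalues of the correlation matrix are strictly positive; when hand-specified correlations violate that, mbend::bend() adjusts the matrix to a nearby valid one. A quick way to inspect the condition directly:

# all eigenvalues must be > 0 for a valid (positive definite) correlation matrix
eigen(correlation_matrix, symmetric = TRUE, only.values = TRUE)$values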

ggcorrplot(correlation_matrix,
           colors = c("red","white", "darkgreen"))

transformation <- simdata::function_list(
    loyalty = function(z) qbeta(pnorm(z[,1]), shape1 = 2, shape2 = 5) * 30, # Beta(2,5) scaled to 0-30 yrs
    bundle_b = function(z) z[,2] > qnorm(0.7),     # bundle: TRUE with 30% probability
    pricejump_b = function(z) z[,3] > qnorm(0.8),  # price jump: above 80th percentile, so 20% probability
    premium = function(z) pnorm(z[,4]) * (2000 - 300) + 300, # premium: uniform on $300 to $2,000
    age = function(z) pmax(18, pmin(80, z[,5] * 10 + 40)),   # age: normal(40, 10), clipped to 18-80
    income = function(z) exp(z[,6] + 4),           # income: lognormal
    mobile_b = function(z) z[,7] > 0,              # mobile: TRUE with 50% probability
    churn = function(z) z[,8] > qnorm(.8)          # churn: ~20% base rate
)

# the multivariate normal design specification
sim_design = simdata::simdesign_mvtnorm(
  relations = correlation_matrix,
  transform_initial = transformation,
  prefix_final = NULL
)

sim_data = simdata::simulate_data(sim_design, n_obs = 1000, seed = 51493)

sim_data$churn <- as.factor(sim_data$churn)
sim_data$loyalty <- round(sim_data$loyalty, 1)
sim_data$bundle_b <- as.factor(sim_data$bundle_b) #ok
sim_data$pricejump_b <- as.factor(sim_data$pricejump_b) #ok
sim_data$premium <- round(sim_data$premium/10)*10
sim_data$age <- round(sim_data$age)
sim_data$income <- round(sim_data$income/10)*10
sim_data$mobile_b <- as.factor(sim_data$mobile_b) #ok

# skip v1 (fit on the full dataset); instead we split into train/test sets below
# model_sim_v1 <- glm(formula = churn ~ .,
#        family = binomial(link = "logit"), data = sim_data)
# summary(model_sim_v1)

set.seed(7553695)
train_sample <- sample(1000, 900)
sim_train <- sim_data[train_sample, ]
sim_test <- sim_data[-train_sample, ]

data_scenario_range <- data.frame(
    loyalty = c(25,20,15,10,5,1),
    bundle_b = as.factor(c(TRUE,TRUE,TRUE,FALSE,FALSE,FALSE)),
    pricejump_b = as.factor(c(FALSE,FALSE,FALSE,FALSE,TRUE,TRUE)),
    premium = c(300,500,900,1100,1600,2000),
    age = c(70,55,40,29,24,21),
    income = c(200,150,120,100,80,60),
    mobile_b = as.factor(c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE))
)

data_feature_means <- data.frame(
    loyalty = mean(sim_train$loyalty),
    bundle_b = as.factor(FALSE),
    pricejump_b = as.factor(FALSE),
    premium = mean(sim_train$premium),
    age = mean(sim_train$age),
    income = mean(sim_train$income),
    mobile_b = as.factor(TRUE)
)

ggpairs(sim_train, columns = 1:6, lower = "blank")

skim(sim_train)
Data summary
Name sim_train
Number of rows 900
Number of columns 8
_______________________
Column type frequency:
factor 4
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable   n_missing  complete_rate  ordered  n_unique  top_counts
bundle_b                0              1  FALSE           2  FAL: 632, TRU: 268
pricejump_b             0              1  FALSE           2  FAL: 720, TRU: 180
mobile_b                0              1  FALSE           2  TRU: 463, FAL: 437
churn                   0              1  FALSE           2  FAL: 726, TRU: 174

Variable type: numeric

skim_variable  n_missing  complete_rate     mean      sd     p0    p25     p50     p75    p100  hist
loyalty                0              1     8.49    4.83    0.2    4.8     7.9    11.5    24.9  ▆▇▅▂▁
premium                0              1  1137.60  486.65  300.0  720.0  1120.0  1552.5  2000.0  ▇▇▆▆▇
age                    0              1    40.08    9.52   18.0   33.0    40.0    47.0    71.0  ▂▇▇▃▁
income                 0              1    87.72  106.29    0.0   30.0    50.0   100.0  1040.0  ▇▁▁▁▁

The regression results

model_sim_v2 <- glm(formula = churn ~ .,
        family = binomial(link = "logit"), data = sim_train)
summary(model_sim_v2)

Call:
glm(formula = churn ~ ., family = binomial(link = "logit"), data = sim_train)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -0.2202193  0.4613275  -0.477  0.63311    
loyalty         -0.1179751  0.0214055  -5.511 3.56e-08 ***
bundle_bTRUE    -0.6553063  0.2151626  -3.046  0.00232 ** 
pricejump_bTRUE  0.4985342  0.2054056   2.427  0.01522 *  
premium          0.0005120  0.0001806   2.834  0.00459 ** 
age             -0.0196931  0.0092299  -2.134  0.03287 *  
income          -0.0008402  0.0009273  -0.906  0.36491    
mobile_bTRUE    -0.0086937  0.1764525  -0.049  0.96070    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 883.84  on 899  degrees of freedom
Residual deviance: 815.98  on 892  degrees of freedom
AIC: 831.98

Number of Fisher Scoring iterations: 5
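
The estimates are on the log-odds scale, so exponentiating them gives odds ratios; exp(coef) of roughly 0.89 for loyalty, for instance, means each additional year of loyalty multiplies the odds of churn by about 0.89. A quick sketch using Wald intervals (confint.default) rather than profile-likelihood intervals:

# odds ratios with Wald 95% confidence intervals, on the odds-ratio scale
round(exp(cbind(OR = coef(model_sim_v2), confint.default(model_sim_v2))), 3)
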
predicted_probs_range <- predict(model_sim_v2, newdata = data_scenario_range, type = "response")
predicted_probs_means <- predict(model_sim_v2, newdata = data_feature_means, type = "response")
round(predicted_probs_range,5)
      1       2       3       4       5       6 
0.00534 0.01495 0.04387 0.18364 0.48978 0.67269 
round(predicted_probs_means,5)
      1 
0.18077 
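
As a sanity check, the scenario probabilities can be rebuilt by hand: multiply each scenario's feature row by the fitted coefficients to get the log-odds z, then apply the inverse logit, plogis(z) = 1/(1 + exp(-z)).

# rebuild the design matrix for the scenarios and apply the coefficients manually
X <- model.matrix(~ ., data = data_scenario_range)
z <- as.vector(X %*% coef(model_sim_v2)[colnames(X)])  # linear predictor (log-odds)
round(plogis(z), 5)                                    # matches predicted_probs_range above
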
coef_table <- coef(summary(model_sim_v2)) 
coef_tbl  <-  as_tibble(coef_table)
Coeff_labels <- c("(Intercept)", "Loyalty, yrs", "Bundle?(T)", "Price Jumped?(T)", "Premium, $000s", 
                 "Age, yrs", "Income, $000s","Mobile?(T)")
coef_tbl <- cbind(Coeff_labels, coef_tbl)

# Using gt() to render a table
coef_tbl_gt <- coef_tbl |> gt() |> 
    opt_table_font(stack = "humanist") |>
    fmt_number(columns = everything(),
               decimals = 3)
coef_tbl_gt |> 
    data_color(
        columns = 'Pr(>|z|)', 
        palette = c("darkseagreen1", "darkseagreen3", "darkseagreen4"),
        domain = c(0,0.05),
        na_color = "lightgrey"
    )
Coeff_labels       Estimate  Std. Error  z value  Pr(>|z|)
(Intercept)          −0.220       0.461   −0.477     0.633
Loyalty, yrs         −0.118       0.021   −5.511     0.000
Bundle?(T)           −0.655       0.215   −3.046     0.002
Price Jumped?(T)      0.499       0.205    2.427     0.015
Premium, $000s        0.001       0.000    2.834     0.005
Age, yrs             −0.020       0.009   −2.134     0.033
Income, $000s        −0.001       0.001   −0.906     0.365
Mobile?(T)           −0.009       0.176   −0.049     0.961

Evaluate with a confusion matrix

predict_test_probs <- predict(model_sim_v2, sim_test, type = "response")

predict_test_class <- as.factor(ifelse(predict_test_probs > 0.40, "TRUE", "FALSE"))
CrossTable(sim_test$churn, predict_test_class,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("Actual", "Predicted"))

 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
             | Predicted 
      Actual |     FALSE |      TRUE | Row Total | 
-------------|-----------|-----------|-----------|
       FALSE |        75 |         2 |        77 | 
             |     0.750 |     0.020 |           | 
-------------|-----------|-----------|-----------|
        TRUE |        21 |         2 |        23 | 
             |     0.210 |     0.020 |           | 
-------------|-----------|-----------|-----------|
Column Total |        96 |         4 |       100 | 
-------------|-----------|-----------|-----------|

 
predict_test_class <- as.factor(ifelse(predict_test_probs > 0.32, "TRUE", "FALSE"))
CrossTable(sim_test$churn, predict_test_class,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("Actual", "Predicted"))

 
   Cell Contents
|-------------------------|
|                       N |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  100 

 
             | Predicted 
      Actual |     FALSE |      TRUE | Row Total | 
-------------|-----------|-----------|-----------|
       FALSE |        67 |        10 |        77 | 
             |     0.670 |     0.100 |           | 
-------------|-----------|-----------|-----------|
        TRUE |        18 |         5 |        23 | 
             |     0.180 |     0.050 |           | 
-------------|-----------|-----------|-----------|
Column Total |        85 |        15 |       100 | 
-------------|-----------|-----------|-----------|
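
The two tables make the threshold trade-off concrete: lowering the cutoff from 0.40 to 0.32 catches more churners (5 of 23 instead of 2 of 23) at the cost of more false alarms and slightly lower overall accuracy (77% down to 72%). A small helper, written just for this post, summarizes each cutoff:

# helper for illustration: accuracy, sensitivity (recall on churners), and specificity at a cutoff
confusion_stats <- function(threshold) {
    pred   <- predict_test_probs > threshold
    actual <- sim_test$churn == "TRUE"
    c(threshold   = threshold,
      accuracy    = mean(pred == actual),
      sensitivity = sum(pred & actual) / sum(actual),
      specificity = sum(!pred & !actual) / sum(!actual))
}
round(rbind(confusion_stats(0.40), confusion_stats(0.32)), 3)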

 

Visualization

I think the most typical plot shows one predictor (i.e., one feature) on the x-axis while holding the other features constant. In this case, I will use loyalty.

# test set but varying loyalty while others constant
test_vary_loyalty <- data.frame(
    churn = sim_test$churn,
    churnn = as.numeric(sim_test$churn)-1,
    loyalty = sim_test$loyalty,
    bundle_b = as.factor(FALSE), # the rest the same
    pricejump_b = as.factor(FALSE),
    premium = mean(sim_test$premium),
    age = mean(sim_test$age),
    income = mean(sim_test$income),
    mobile_b = as.factor(TRUE)
)
    
test_vary_loyalty$predicted_probs <- predict(model_sim_v2, newdata = test_vary_loyalty, type = "response")

p1 <- ggplot(test_vary_loyalty, aes(x = loyalty, y = predicted_probs)) +
    geom_line(color = "red", linewidth = 1) +
    geom_jitter(aes(x = loyalty, y = churnn), color = "black", size = 2, width = 0.02, height = 0.02) +
    labs(y = "Probability of Churn", x = "Loyalty, yrs") +
    theme_minimal()

Visualize the sigmoid

But I also wanted to try plotting the sigmoid (a.k.a. logistic) function, which is given by p = 1/[1 + exp(-z)], where z is the linear combination of the features (the log-odds). You'll notice that I appended (row-bound) five artificial observations merely to extend the sigmoid's visible range.
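
Before the manual construction below, note that R exposes both pieces directly: predict(..., type = "link") returns z, and plogis() is the inverse logit, so the two prediction types are related exactly by the sigmoid formula.

# the "link" prediction is the log-odds z; plogis(z) = 1/(1 + exp(-z)) recovers the probability
z_link <- predict(model_sim_v2, newdata = sim_test, type = "link")
p_resp <- predict(model_sim_v2, newdata = sim_test, type = "response")
all.equal(unname(plogis(z_link)), unname(p_resp))  # TRUE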

sim_train2 <- sim_train
last_rows_n <- 5
last_row <- tail(sim_train2, n = last_rows_n)
last_row$loyalty = rep(0,last_rows_n)
last_row$bundle_b = rep(FALSE, last_rows_n)
last_row$pricejump_b = rep(TRUE, last_rows_n)
last_row$premium = c(3000, 5000, 9000, 12000, 14000) # deliberately extreme premiums
last_row$age = rep(18, last_rows_n)
last_row$income = rep(40, last_rows_n)
last_row$mobile_b = rep(FALSE, last_rows_n)
last_row$churn = rep(TRUE, last_rows_n)
sim_train2 <- rbind(sim_train2, last_row)
sim_train2f <- sim_train2

sim_train2$bundle_b <- as.numeric(sim_train2$bundle_b) - 1
sim_train2$pricejump_b <- as.numeric(sim_train2$pricejump_b) - 1
sim_train2$mobile_b <- as.numeric(sim_train2$mobile_b) - 1

# Linear combination (logit) for each observation; the predictor columns are in
# the same order as the model coefficients, so the matrix product lines up
sim_train2$logit <- as.matrix(sim_train2[, setdiff(names(sim_train2), "churn")]) %*% coef(model_sim_v2)[-1] + coef(model_sim_v2)[1]

# Prediction
sim_train2$predicted_probs <- predict(model_sim_v2, newdata = sim_train2f, type = "response")
sim_train2$churn <- as.numeric(sim_train2$churn) - 1

p2 <- ggplot(sim_train2, aes(x = logit, y = predicted_probs)) +
    geom_jitter(alpha = 0.5, color = "purple", size = 2, width = 0.02, height = 0.02) +
    geom_jitter(aes(x = logit, y = churn), color = "black", size = 2, width = 0.02, height = 0.02) +
    # Next plots the sigmoid function
    geom_line(aes(y = 1 / (1 + exp(-logit))), color = "red", linewidth = 1) +
    labs(y = "Probability of Churn", x = "Logit (Linear Combination)") +
    coord_cartesian(xlim = c(-5,5)) +
    theme_minimal() 

p1 + p2

Visualize the marginal effects

p3 <- plot(ggpredict(model_sim_v2,c("loyalty", "pricejump_b")))
p4 <- plot(ggpredict(model_sim_v2,c("loyalty", "bundle_b")))
p3 <- p3 + coord_cartesian(ylim = c(0,.6))
p4 <- p4 + coord_cartesian(ylim = c(0,.6))
p3 + p4

what_is <- ggpredict(model_sim_v2,c("loyalty", "pricejump_b"))
what_is
# Predicted probabilities of churn

# pricejump_b = FALSE

loyalty | Predicted |       95% CI
----------------------------------
      0 |      0.38 | [0.29, 0.48]
      5 |      0.25 | [0.20, 0.31]
     10 |      0.16 | [0.12, 0.20]
     15 |      0.09 | [0.06, 0.14]
     20 |      0.05 | [0.03, 0.10]
     25 |      0.03 | [0.01, 0.07]

# pricejump_b = TRUE

loyalty | Predicted |       95% CI
----------------------------------
      0 |      0.50 | [0.38, 0.62]
      5 |      0.36 | [0.27, 0.45]
     10 |      0.23 | [0.17, 0.32]
     15 |      0.15 | [0.09, 0.22]
     20 |      0.09 | [0.05, 0.16]
     25 |      0.05 | [0.02, 0.11]

Adjusted for:
* bundle_b =   FALSE
*  premium = 1137.60
*      age =   40.08
*   income =   87.72
* mobile_b =   FALSE

Numerical examples to help explain coefficient interpretation

int_test <- data_feature_means
int_test$loyalty = 0
int_test
  loyalty bundle_b pricejump_b premium      age   income mobile_b
1       0    FALSE       FALSE  1137.6 40.07667 87.72222     TRUE
predict(model_sim_v2, newdata = int_test, type = "response")
        1 
0.3753352 
int_test$pricejump_b = as.factor(TRUE)
predict(model_sim_v2, newdata = int_test, type = "response")
        1 
0.4972846 
int_test$loyalty = 1
predict(model_sim_v2, newdata = int_test, type = "response")
        1 
0.4678353 
int_test$loyalty = 10
predict(model_sim_v2, newdata = int_test, type = "response")
        1 
0.2331494 
int_test$pricejump_b = as.factor(FALSE)
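
These numbers tie directly back to the coefficient table. With loyalty at 0, flipping pricejump_b from FALSE to TRUE moves the probability from 0.3753 to 0.4973; on the odds scale that is a factor of exp(0.4985) ≈ 1.65. Likewise, one extra loyalty year (0 to 1) multiplies the odds by exp(-0.1180) ≈ 0.89. A tiny odds() helper, defined just for this check, makes the comparison explicit:

odds <- function(p) p / (1 - p)              # helper for this example only
odds(0.4972846) / odds(0.3753352)            # ~1.646, the price-jump odds ratio
exp(coef(model_sim_v2)["pricejump_bTRUE"])   # same value, from the coefficient
odds(0.4678353) / odds(0.4972846)            # ~0.889, one additional loyalty year
exp(coef(model_sim_v2)["loyalty"])           # same value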