Simulated insurance dataset (via simdata) and visualization of marginal effects (via ggeffects)
code
analysis
Author
David Harper, CFA, FRM
Published
October 29, 2023
Simulate data with simdata package
The delightful simdata package allows us to specify a correlation matrix and tranform (via transformation function) the random multivarate normal distribution into the set of desired univariate (but correlated!) distributions.
library(tidyverse)library(gt)library(patchwork)library(GGally); library(ggcorrplot)library(ggeffects) # amazing package plots marginal effectslibrary(simdata)library(matrixcalc); library(mbend)library(gmodels) # CrossTable()library(skimr)# this function in the simdata package builds a correlation# matrix by specifying c(col, row, rho)correlation_matrix =cor_from_upper(8,rbind(c(1,8,-0.20), # loyaltyc(2,8,-0.16), # bundlec(3,8,0.12), # jump (in price)c(4,8,0.15), # premiumc(5,8,-0.07), # agec(6,8,-0.05), # incomec(7,8,0)) # mobile)# we require positive definite matrix# is.positive.definite(correlation_matrix) = TRUEif (!is.positive.definite(correlation_matrix)) { correlation_matrix <-bend(correlation_matrix)$bent |>round(5)}ggcorrplot(correlation_matrix,colors =c("red","white", "darkgreen"))
Cell Contents
|-------------------------|
| N |
| N / Table Total |
|-------------------------|
Total Observations in Table: 100
| Predicted
Actual | FALSE | TRUE | Row Total |
-------------|-----------|-----------|-----------|
FALSE | 67 | 10 | 77 |
| 0.670 | 0.100 | |
-------------|-----------|-----------|-----------|
TRUE | 18 | 5 | 23 |
| 0.180 | 0.050 | |
-------------|-----------|-----------|-----------|
Column Total | 85 | 15 | 100 |
-------------|-----------|-----------|-----------|
Visualization
I think the most typical plot shows one predictor (ie., one feature) on the X axis while holding the other features constant. In this case, I will use Loyalty.
# test set but varying loyalty while others constanttest_vary_loyalty <-data.frame(churn = sim_test$churn,churnn =as.numeric(sim_test$churn)-1,loyalty = sim_test$loyalty,bundle_b =as.factor(FALSE), # the rest the samepricejump_b =as.factor(FALSE),premium =mean(sim_test$premium),age =mean(sim_test$age),income =mean(sim_test$income),mobile_b =as.factor(TRUE))test_vary_loyalty$predicted_probs <-predict(model_sim_v2, newdata = test_vary_loyalty, type ="response")p1 <-ggplot(test_vary_loyalty, aes(x = loyalty, y = predicted_probs)) +geom_line(color ="red", linewidth =1) +geom_jitter(aes(x = loyalty, y = churnn), color ="black", size =2, width =0.02, height =0.02) +labs(y ="Probability of Churn", x ="Loyalty, yrs") +theme_minimal()
Visualize sigmoid
But I also wanted to try plotting the sigmoid (aka, logistic) function which is given by p = 1/[1+exp(-z)] where z is the linear combination of all features. You’ll notice that I appended (binded) five artificial observations merely to extend the sigmoid.
sim_train2 <- sim_trainlast_rows_n <-5last_row <-tail(sim_train2, n = last_rows_n)last_row$loyalty =rep(0,last_rows_n)last_row$bundle_b =rep(FALSE, last_rows_n)last_row$pricejump_b =rep(TRUE, last_rows_n)last_row$premium =c(3000,5000,9000,12000,14000)last_row$age =rep(18, 5)last_row$income =rep(40,5)last_row$mobile_b =rep(FALSE, 5)last_row$churn =rep(TRUE, 5)sim_train2 <-rbind(sim_train2, last_row)sim_train2f <- sim_train2sim_train2$bundle_b <-as.numeric(sim_train2$bundle_b) -1sim_train2$pricejump_b <-as.numeric(sim_train2$pricejump_b) -1sim_train2$mobile_b <-as.numeric(sim_train2$mobile_b) -1# Linear combination (logit) for each observationsim_train2$logit <-as.matrix(sim_train2[, setdiff(names(sim_train2), "churn")]) %*%coef(model_sim_v2)[-1] +coef(model_sim_v2)[1]# Predictionsim_train2$predicted_probs <-predict(model_sim_v2, newdata = sim_train2f, type ="response")sim_train2$churn <-as.numeric(sim_train2$churn) -1p2 <-ggplot(sim_train2, aes(x = logit, y = predicted_probs)) +geom_jitter(alpha =0.5, color ="purple", size =2, width =0.02, height =0.02) +geom_jitter(aes(x = logit, y = churn), color ="black", size =2, width =0.02, height =0.02) +# Next plots the sigmoid functiongeom_line(aes(y =1/ (1+exp(-logit))), color ="red", linewidth =1) +labs(y ="Probability of Churn", x ="Logit (Linear Combination)") +coord_cartesian(xlim =c(-5,5)) +theme_minimal() p1 + p2