Display system information and load the tidyverse and faraway packages.

sessionInfo()
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.31   R6_2.5.1        jsonlite_1.8.4  evaluate_0.20  
##  [5] cachem_1.0.7    rlang_1.1.0     cli_3.6.0       rstudioapi_0.14
##  [9] jquerylib_0.1.4 bslib_0.4.2     rmarkdown_2.20  tools_4.2.3    
## [13] xfun_0.37       yaml_2.3.7      fastmap_1.1.1   compiler_4.2.3 
## [17] htmltools_0.5.4 knitr_1.42      sass_0.4.5
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(faraway)

The faraway package contains the datasets used in the ELMR (Extending the Linear Model with R) book.
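
A quick way to browse what the package ships:

# list all datasets bundled with faraway
data(package = "faraway")
# open the codebook for the dataset used below
help(wcgs, package = "faraway")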

Heart disease example

The dataframe wcgs in the faraway package contains data from the Western Collaborative Group Study.

wcgs %>% head(10)
##    age height weight sdp dbp chol behave cigs dibep chd  typechd timechd
## 1   49     73    150 110  76  225     A2   25     A  no     none    1664
## 2   42     70    160 154  84  177     A2   20     A  no     none    3071
## 3   42     69    160 110  78  181     B3    0     B  no     none    3071
## 4   41     68    152 124  78  132     B4   20     B  no     none    3064
## 5   59     70    150 144  86  255     B3   20     B yes infdeath    1885
## 6   44     72    204 150  90  182     B4    0     B  no     none    3102
## 7   44     72    164 130  84  155     B4    0     B  no     none    3074
## 8   40     71    150 138  60  140     A2    0     A  no     none    3071
## 9   43     72    190 146  76  149     B3   25     B  no     none    3064
## 10  42     70    175 132  90  325     A2    0     A  no     none    1032
##      arcus
## 1   absent
## 2  present
## 3   absent
## 4   absent
## 5  present
## 6   absent
## 7   absent
## 8   absent
## 9   absent
## 10 present

We convert the dataframe into a tibble for nicer printing and compatibility with the tidyverse.

wcgs <- wcgs %>% 
  as_tibble() %>%
  print(width = Inf)
## # A tibble: 3,154 × 13
##      age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd 
##    <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>   
##  1    49     73    150   110    76   225 A2        25 A     no    none    
##  2    42     70    160   154    84   177 A2        20 A     no    none    
##  3    42     69    160   110    78   181 B3         0 B     no    none    
##  4    41     68    152   124    78   132 B4        20 B     no    none    
##  5    59     70    150   144    86   255 B3        20 B     yes   infdeath
##  6    44     72    204   150    90   182 B4         0 B     no    none    
##  7    44     72    164   130    84   155 B4         0 B     no    none    
##  8    40     71    150   138    60   140 A2         0 A     no    none    
##  9    43     72    190   146    76   149 B3        25 B     no    none    
## 10    42     70    175   132    90   325 A2         0 A     no    none    
##    timechd arcus  
##      <int> <fct>  
##  1    1664 absent 
##  2    3071 present
##  3    3071 absent 
##  4    3064 absent 
##  5    1885 present
##  6    3102 absent 
##  7    3074 absent 
##  8    3071 absent 
##  9    3064 absent 
## 10    1032 present
## # … with 3,144 more rows

For now, we focus on just three variables:
- chd, whether the person develops coronary heart disease or not,
- height, height of the person in inches,
- cigs, number of cigarettes smoked per day.

wcgs %>%
  select(chd, height, cigs) %>%
  summary()
##   chd           height           cigs     
##  no :2897   Min.   :60.00   Min.   : 0.0  
##  yes: 257   1st Qu.:68.00   1st Qu.: 0.0  
##             Median :70.00   Median : 0.0  
##             Mean   :69.78   Mean   :11.6  
##             3rd Qu.:72.00   3rd Qu.:20.0  
##             Max.   :78.00   Max.   :99.0

We use side-by-side boxplots to summarize the relationship between the qualitative variable chd and the quantitative variables, first height

ggplot(data = wcgs) +
  geom_boxplot(mapping = aes(x = chd, y = height))

and then the number of cigarettes smoked per day

ggplot(data = wcgs) +
  geom_boxplot(mapping = aes(x = chd, y = cigs))

It seems that smoking more cigarettes is associated with heart disease, but height is not. How can we analyze this formally? If we use linear regression (a straight line) for the analysis, the fitted line eventually extends beyond the [0, 1] range, making interpretation as a probability hard.

ggplot(data = wcgs) +
  geom_point(mapping = aes(x = cigs, y = chd))

Logistic regression

In logistic regression, we model the success probability as \(p = \text{logit}^{-1}(\eta)\), where \(\eta = \mathbf{x}^T \boldsymbol{\beta}\) is the linear predictor and \(\text{logit}(p) = \log\{p / (1 - p)\}\) is the link function. The inverse link looks like this:

ggplot(data = tibble(x = 0), mapping = aes(x = x)) + # null data
  stat_function(fun = ilogit) + # ilogit is from faraway
  xlim(-6, 6) + 
  labs(x = expression(eta), y = "p", title = "Logistic function (inverse link)")

# curve(ilogit(x), -6, 6, xlab = expression(eta), ylab = "p")
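
As a small sanity check (both logit and ilogit are exported by faraway), the two functions should invert each other:

eta <- c(-2, 0, 2)
# logit(ilogit(eta)) should recover eta (up to floating point)
all.equal(logit(ilogit(eta)), eta)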

Fitting logistic regression

(lmod <- glm(chd ~ height + cigs, family = binomial, wcgs))
## 
## Call:  glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Coefficients:
## (Intercept)       height         cigs  
##    -4.50161      0.02521      0.02313  
## 
## Degrees of Freedom: 3153 Total (i.e. Null);  3151 Residual
## Null Deviance:       1781 
## Residual Deviance: 1749  AIC: 1755

Inspect the contents of the result lmod:

str(lmod)
## List of 30
##  $ coefficients     : Named num [1:3] -4.5016 0.0252 0.0231
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "height" "cigs"
##  $ residuals        : Named num [1:3154] -1.12 -1.1 -1.06 -1.1 10.72 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ fitted.values    : Named num [1:3154] 0.1107 0.0933 0.0594 0.0891 0.0933 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ effects          : Named num [1:3154] 36.178 -1.071 5.724 -0.312 3.124 ...
##   ..- attr(*, "names")= chr [1:3154] "(Intercept)" "height" "cigs" "" ...
##  $ R                : num [1:3, 1:3] -15.3 0 0 -1068.1 -38 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##  $ rank             : int 3
##  $ qr               :List of 5
##   ..$ qr   : num [1:3154, 1:3] -15.276 0.019 0.0155 0.0186 0.019 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:3154] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##   ..$ rank : int 3
##   ..$ qraux: num [1:3] 1.02 1 1.02
##   ..$ pivot: int [1:3] 1 2 3
##   ..$ tol  : num 1e-11
##   ..- attr(*, "class")= chr "qr"
##  $ family           :List of 12
##   ..$ family    : chr "binomial"
##   ..$ link      : chr "logit"
##   ..$ linkfun   :function (mu)  
##   ..$ linkinv   :function (eta)  
##   ..$ variance  :function (mu)  
##   ..$ dev.resids:function (y, mu, wt)  
##   ..$ aic       :function (y, n, mu, wt, dev)  
##   ..$ mu.eta    :function (eta)  
##   ..$ initialize: language {     if (NCOL(y) == 1) { ...
##   ..$ validmu   :function (mu)  
##   ..$ valideta  :function (eta)  
##   ..$ simulate  :function (object, nsim)  
##   ..- attr(*, "class")= chr "family"
##  $ linear.predictors: Named num [1:3154] -2.08 -2.27 -2.76 -2.32 -2.27 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ deviance         : num 1749
##  $ aic              : num 1755
##  $ null.deviance    : num 1781
##  $ iter             : int 5
##  $ weights          : Named num [1:3154] 0.0985 0.0846 0.0559 0.0811 0.0846 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ prior.weights    : Named num [1:3154] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ df.residual      : int 3151
##  $ df.null          : int 3153
##  $ y                : Named num [1:3154] 0 0 0 0 1 0 0 0 0 0 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ converged        : logi TRUE
##  $ boundary         : logi FALSE
##  $ model            :'data.frame':   3154 obs. of  3 variables:
##   ..$ chd   : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
##   ..$ height: int [1:3154] 73 70 69 68 70 72 72 71 72 70 ...
##   ..$ cigs  : int [1:3154] 25 20 0 20 20 0 0 0 25 0 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language chd ~ height + cigs
##   .. .. ..- attr(*, "variables")= language list(chd, height, cigs)
##   .. .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:3] "chd" "height" "cigs"
##   .. .. .. .. ..$ : chr [1:2] "height" "cigs"
##   .. .. ..- attr(*, "term.labels")= chr [1:2] "height" "cigs"
##   .. .. ..- attr(*, "order")= int [1:2] 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(chd, height, cigs)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:3] "factor" "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:3] "chd" "height" "cigs"
##  $ call             : language glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
##  $ formula          :Class 'formula'  language chd ~ height + cigs
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ terms            :Classes 'terms', 'formula'  language chd ~ height + cigs
##   .. ..- attr(*, "variables")= language list(chd, height, cigs)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "chd" "height" "cigs"
##   .. .. .. ..$ : chr [1:2] "height" "cigs"
##   .. ..- attr(*, "term.labels")= chr [1:2] "height" "cigs"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(chd, height, cigs)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "factor" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "chd" "height" "cigs"
##  $ data             : tibble [3,154 × 13] (S3: tbl_df/tbl/data.frame)
##   ..$ age    : int [1:3154] 49 42 42 41 59 44 44 40 43 42 ...
##   ..$ height : int [1:3154] 73 70 69 68 70 72 72 71 72 70 ...
##   ..$ weight : int [1:3154] 150 160 160 152 150 204 164 150 190 175 ...
##   ..$ sdp    : int [1:3154] 110 154 110 124 144 150 130 138 146 132 ...
##   ..$ dbp    : int [1:3154] 76 84 78 78 86 90 84 60 76 90 ...
##   ..$ chol   : int [1:3154] 225 177 181 132 255 182 155 140 149 325 ...
##   ..$ behave : Factor w/ 4 levels "A1","A2","B3",..: 2 2 3 4 3 4 4 2 3 2 ...
##   ..$ cigs   : int [1:3154] 25 20 0 20 20 0 0 0 25 0 ...
##   ..$ dibep  : Factor w/ 2 levels "A","B": 1 1 2 2 2 2 2 1 2 1 ...
##   ..$ chd    : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
##   ..$ typechd: Factor w/ 4 levels "angina","infdeath",..: 3 3 3 3 2 3 3 3 3 3 ...
##   ..$ timechd: int [1:3154] 1664 3071 3071 3064 1885 3102 3074 3071 3064 1032 ...
##   ..$ arcus  : Factor w/ 2 levels "absent","present": 1 2 1 1 2 1 1 1 1 2 ...
##  $ offset           : NULL
##  $ control          :List of 3
##   ..$ epsilon: num 1e-08
##   ..$ maxit  : num 25
##   ..$ trace  : logi FALSE
##  $ method           : chr "glm.fit"
##  $ contrasts        : NULL
##  $ xlevels          : Named list()
##  - attr(*, "class")= chr [1:2] "glm" "lm"
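
Many of these components have standard accessor functions, which are safer than reaching into the list with $:

# equivalent to lmod$coefficients, lmod$deviance, and lmod$fitted.values
coef(lmod)
deviance(lmod)
fitted(lmod) %>% head(10)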

Summary of the result:

(lmod_sm <- summary(lmod))
## 
## Call:
## glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0041  -0.4425  -0.3630  -0.3499   2.4357  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.50161    1.84186  -2.444   0.0145 *  
## height       0.02521    0.02633   0.957   0.3383    
## cigs         0.02313    0.00404   5.724 1.04e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1781.2  on 3153  degrees of freedom
## Residual deviance: 1749.0  on 3151  degrees of freedom
## AIC: 1755
## 
## Number of Fisher Scoring iterations: 5
str(lmod_sm)
## List of 17
##  $ call          : language glm(formula = chd ~ height + cigs, family = binomial, data = wcgs)
##  $ terms         :Classes 'terms', 'formula'  language chd ~ height + cigs
##   .. ..- attr(*, "variables")= language list(chd, height, cigs)
##   .. ..- attr(*, "factors")= int [1:3, 1:2] 0 1 0 0 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:3] "chd" "height" "cigs"
##   .. .. .. ..$ : chr [1:2] "height" "cigs"
##   .. ..- attr(*, "term.labels")= chr [1:2] "height" "cigs"
##   .. ..- attr(*, "order")= int [1:2] 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(chd, height, cigs)
##   .. ..- attr(*, "dataClasses")= Named chr [1:3] "factor" "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:3] "chd" "height" "cigs"
##  $ family        :List of 12
##   ..$ family    : chr "binomial"
##   ..$ link      : chr "logit"
##   ..$ linkfun   :function (mu)  
##   ..$ linkinv   :function (eta)  
##   ..$ variance  :function (mu)  
##   ..$ dev.resids:function (y, mu, wt)  
##   ..$ aic       :function (y, n, mu, wt, dev)  
##   ..$ mu.eta    :function (eta)  
##   ..$ initialize: language {     if (NCOL(y) == 1) { ...
##   ..$ validmu   :function (mu)  
##   ..$ valideta  :function (eta)  
##   ..$ simulate  :function (object, nsim)  
##   ..- attr(*, "class")= chr "family"
##  $ deviance      : num 1749
##  $ aic           : num 1755
##  $ contrasts     : NULL
##  $ df.residual   : int 3151
##  $ null.deviance : num 1781
##  $ df.null       : int 3153
##  $ iter          : int 5
##  $ deviance.resid: Named num [1:3154] -0.484 -0.442 -0.35 -0.432 2.178 ...
##   ..- attr(*, "names")= chr [1:3154] "1" "2" "3" "4" ...
##  $ coefficients  : num [1:3, 1:4] -4.5016 0.0252 0.0231 1.8419 0.0263 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##   .. ..$ : chr [1:4] "Estimate" "Std. Error" "z value" "Pr(>|z|)"
##  $ aliased       : Named logi [1:3] FALSE FALSE FALSE
##   ..- attr(*, "names")= chr [1:3] "(Intercept)" "height" "cigs"
##  $ dispersion    : num 1
##  $ df            : int [1:3] 3 3151 3
##  $ cov.unscaled  : num [1:3, 1:3] 3.392458 -0.048431 -0.000114 -0.048431 0.000693 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##  $ cov.scaled    : num [1:3, 1:3] 3.392458 -0.048431 -0.000114 -0.048431 0.000693 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##   .. ..$ : chr [1:3] "(Intercept)" "height" "cigs"
##  - attr(*, "class")= chr "summary.glm"

Interpretation

# dataframe
wcgs %>%
  select(chd, height, cigs) %>%
  head(10)
## # A tibble: 10 × 3
##    chd   height  cigs
##    <fct>  <int> <int>
##  1 no        73    25
##  2 no        70    20
##  3 no        69     0
##  4 no        68    20
##  5 yes       70    20
##  6 no        72     0
##  7 no        72     0
##  8 no        71     0
##  9 no        72    25
## 10 no        70     0
# response
lmod$y %>% head(10)
##  1  2  3  4  5  6  7  8  9 10 
##  0  0  0  0  1  0  0  0  0  0
# predictors
model.matrix(lmod) %>% head(10)
##    (Intercept) height cigs
## 1            1     73   25
## 2            1     70   20
## 3            1     69    0
## 4            1     68   20
## 5            1     70   20
## 6            1     72    0
## 7            1     72    0
## 8            1     71    0
## 9            1     72   25
## 10           1     70    0
library(gtsummary)
lmod %>%
  tbl_regression() %>%
  bold_labels() %>%
  bold_p(t = 0.05)
Characteristic  log(OR)^1  95% CI^1       p-value
height          0.03       -0.03, 0.08    0.3
cigs            0.02       0.02, 0.03     <0.001
^1 OR = Odds Ratio, CI = Confidence Interval

Summarize on the odds-ratio (exponentiated) scale:

lmod %>%
  tbl_regression(intercept = TRUE, exponentiate = TRUE) %>%
  bold_labels() %>%
  bold_p(t = 0.05)
Characteristic  OR^1   95% CI^1      p-value
(Intercept)     0.01   0.00, 0.40    0.015
height          1.03   0.97, 1.08    0.3
cigs            1.02   1.02, 1.03    <0.001
^1 OR = Odds Ratio, CI = Confidence Interval
# same as lmod$coefficients
# coef(lmod) is a named numeric vector
(beta_hat <- unname(coef(lmod)))
## [1] -4.50161397  0.02520779  0.02312740
exp(beta_hat)
## [1] 0.01109108 1.02552819 1.02339691

How do we interpret the effect of smoking a pack a day (20 cigarettes) on heart disease? The odds of heart disease are multiplied by

exp(beta_hat[3] * 20)
## [1] 1.588115
For a 68-inch-tall man who smokes a pack a day, the estimated probability of heart disease is

(p1 <- ilogit(sum(beta_hat * c(1, 68, 20))))
## [1] 0.08907868

while for a non-smoker of the same height it is

(p2 <- ilogit(sum(beta_hat * c(1, 68, 0))))
## [1] 0.05800425

Then the relative risk is

p1 / p2
## [1] 1.535727
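
Because heart disease is rare in this cohort, the relative risk (1.54) is close to the odds ratio (1.59). Since height is held fixed, the odds ratio computed from the two fitted probabilities reproduces exp(beta_hat[3] * 20) exactly:

# odds ratio from the two fitted probabilities; equals exp(beta_hat[3] * 20)
(p1 / (1 - p1)) / (p2 / (1 - p2))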

Inference (analysis of deviance)

lmod$null.deviance - lmod$deviance
## [1] 32.19451

giving the p-value

pchisq(lmod$null.deviance - lmod$deviance, 2, lower.tail = FALSE)
## [1] 1.02106e-07

Therefore our model gives a significantly better fit than the null (intercept-only) model.
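
The same test can be reproduced with anova() by fitting the intercept-only model explicitly (a quick sketch):

# null model: intercept only
lmod0 <- glm(chd ~ 1, family = binomial, data = wcgs)
anova(lmod0, lmod, test = "Chi")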

# fit a model without height
lmodc <- glm(chd ~ cigs, family = binomial, wcgs)
anova(lmodc, lmod, test = "Chi")
## Analysis of Deviance Table
## 
## Model 1: chd ~ cigs
## Model 2: chd ~ height + cigs
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3152       1750                     
## 2      3151       1749  1  0.92025   0.3374
drop1(lmod, test = "Chi")
## Single term deletions
## 
## Model:
## chd ~ height + cigs
##        Df Deviance    AIC     LRT Pr(>Chi)    
## <none>      1749.0 1755.0                     
## height  1   1750.0 1754.0  0.9202   0.3374    
## cigs    1   1780.1 1784.1 31.0695 2.49e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lmod_sm$coefficients
##                Estimate  Std. Error   z value     Pr(>|z|)
## (Intercept) -4.50161397 1.841862743 -2.444055 1.452321e-02
## height       0.02520779 0.026327385  0.957474 3.383281e-01
## cigs         0.02312740 0.004040183  5.724344 1.038339e-08

In general, the deviance-based (likelihood ratio) test is preferred over the Wald z-test.
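
For height, the two tests happen to nearly agree here: the squared Wald z statistic is close to the likelihood ratio statistic 0.9202 from drop1() above.

# squared Wald z statistic for height (compare to the LRT value 0.9202)
lmod_sm$coefficients["height", "z value"]^2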

Confidence intervals

Wald 95% confidence intervals, based on the asymptotic normality of \(\widehat{\boldsymbol{\beta}}\):

tibble(
  `coef`  = beta_hat,
  `2.5%`  = beta_hat - 1.96 * lmod_sm$coefficients[, 2],
  `97.5%` = beta_hat + 1.96 * lmod_sm$coefficients[, 2])
## # A tibble: 3 × 3
##      coef  `2.5%` `97.5%`
##     <dbl>   <dbl>   <dbl>
## 1 -4.50   -8.11   -0.892 
## 2  0.0252 -0.0264  0.0768
## 3  0.0231  0.0152  0.0310

or from the profile likelihood:

confint(lmod)
## Waiting for profiling to be done...
##                   2.5 %      97.5 %
## (Intercept) -8.13475465 -0.91297018
## height      -0.02619902  0.07702835
## cigs         0.01514949  0.03100534
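
To report these intervals on the odds-ratio scale, as in the tbl_regression(exponentiate = TRUE) table above, exponentiate the endpoints:

# profile-likelihood intervals on the odds-ratio scale
exp(confint(lmod))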

Diagnostics

Fitted values come on two scales. The first is the scale of the linear predictor, \(\eta = \mathbf{x}^T \widehat{\boldsymbol{\beta}}\):

linpred  <- predict(lmod)
linpred %>% head(10)
##         1         2         3         4         5         6         7         8 
## -2.083261 -2.274521 -2.762277 -2.324936 -2.274521 -2.686653 -2.686653 -2.711861 
##         9        10 
## -2.108468 -2.737069

The second is on the scale of the response, \(p = \text{logit}^{-1}(\eta)\):

predprob <- predict(lmod, type = "response")
predprob %>% head(10)
##          1          2          3          4          5          6          7 
## 0.11073449 0.09325523 0.05939705 0.08907868 0.09325523 0.06376553 0.06376553 
##          8          9         10 
## 0.06227708 0.10827647 0.06082112
# same as residuals(lmod, type = "response")
rawres <- lmod$y - predprob

The plot of raw residuals against the linear predictor is not very informative.

wcgs %>%
  mutate(rawres = rawres, linpred = linpred) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = rawres)) +
  labs(x = "Linear predictor", y = "Raw residuals")

We do not expect the raw residuals to have equal variance, because the variance of a binary outcome is \(p(1 - p)\).
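
Dividing the raw residuals by the estimated binomial standard deviation gives the Pearson residuals, which residuals() computes directly:

# Pearson residuals: (y - p_hat) / sqrt(p_hat * (1 - p_hat))
pearres <- residuals(lmod, type = "pearson")
# should agree with the manual calculation
all.equal(pearres, rawres / sqrt(predprob * (1 - predprob)))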

# deviance residuals are the default type
devres <- residuals(lmod)
devres %>% head(10)
##          1          2          3          4          5          6          7 
## -0.4844779 -0.4424800 -0.3499548 -0.4319693  2.1782631 -0.3630133 -0.3630133 
##          8          9         10 
## -0.3586106 -0.4787466 -0.3542579

Sanity check:

sqrt(-2 * (lmod$y * log(predprob) + (1 - lmod$y) * log(1 - predprob))) %>%
  head(10)
##         1         2         3         4         5         6         7         8 
## 0.4844779 0.4424800 0.3499548 0.4319693 2.1782631 0.3630133 0.3630133 0.3586106 
##         9        10 
## 0.4787466 0.3542579

The plot of deviance residuals against the linear predictor:

wcgs %>%
  mutate(devres = devres, linpred = linpred) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = devres)) +
  labs(x = "Linear predictor", y = "Deviance residuals")

Again we see the residuals are clustered into two lines: the upper one corresponding to \(y_i=1\) and the lower one to \(y_i=0\). We can improve this plot by binning: divide the range of the linear predictor into about 100 bins containing roughly equal numbers of points, then plot the average residual against the average linear predictor within each bin.

wcgs %>%
  mutate(devres = devres, linpred = linpred) %>% 
  group_by(cut(linpred, breaks = unique(quantile(linpred, (1:100)/101)))) %>%
  summarize(devres = mean(devres), 
            linpred = mean(linpred)) %>%
  ggplot() +
  geom_point(mapping = aes(x = linpred, y = devres)) + 
  labs(x = "Linear predictor", y = "Binned deviance residual")

Question: is there a concern that the deviance residuals are not centered around 0?

wcgs %>%
  mutate(devres = devres) %>%
  group_by(height) %>%
  summarize(devres = mean(devres)) %>%
  ggplot() +
  geom_point(mapping = aes(x = height, y = devres)) + 
  labs(x = "height", y = "Binned deviance residual")

wcgs %>%
  mutate(devres = devres) %>%
  group_by(cigs) %>%
  summarize(devres = mean(devres),
            count = n()) %>%
  ggplot() +
  geom_point(mapping = aes(x = cigs, y = devres, size=sqrt(count))) + 
  labs(x = "cigs", y = "Binned deviance residual")

# QQ plot of the deviance residuals (not expected to be normal for binary data)
qqnorm(devres)

# half-normal plot of the leverages (halfnorm is from faraway)
halfnorm(hatvalues(lmod))

We see two high leverage cases, who smoke an unusual number of cigarettes per day!

wcgs %>%
  slice(c(2527, 2695)) %>%
  print(width = Inf)
## # A tibble: 2 × 13
##     age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd timechd
##   <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>     <int>
## 1    52     71    168   120    80   251 A1        99 A     no    none       2956
## 2    47     64    158   116    76   206 A1        80 A     no    none       2114
##   arcus  
##   <fct>  
## 1 present
## 2 absent
# half-normal plot of the Cook's distances
halfnorm(cooks.distance(lmod))

wcgs %>%
  slice(c(953, 2082)) %>%
  print(width = Inf)
## # A tibble: 2 × 13
##     age height weight   sdp   dbp  chol behave  cigs dibep chd   typechd timechd
##   <int>  <int>  <int> <int> <int> <int> <fct>  <int> <fct> <fct> <fct>     <int>
## 1    57     63    155   128    88   196 A2         0 A     yes   silent     2349
## 2    49     77    210   138    86   235 B3         0 B     yes   angina     3048
##   arcus 
##   <fct> 
## 1 absent
## 2 absent

Goodness of fit

Hosmer-Lemeshow statistic

  • Intuitively, if we divide the observations into \(J\) bins according to the linear predictor \(\eta\), then \(y_j / n_j\) (the observed proportion of “successes”) in the \(j\)-th bin should be close to the average predicted probability in that bin, as the binned plot below shows.
wcgs_binned <- wcgs %>%
  mutate(predprob = predict(lmod, type = "response"), 
         linpred  = predict(lmod, type = "link"),
         bin      = cut(linpred, breaks = unique(quantile(linpred, (1:100) / 101)))) %>%
  group_by(bin) %>%
  summarize(y       = sum(ifelse(chd == "yes", 1, 0)), 
            avgpred = mean(predprob), 
            count   = n()) %>%
  mutate(se_fit = sqrt(avgpred * (1 - avgpred) / count))
wcgs_binned %>%
  ggplot(mapping = aes(x = avgpred, y = y / count)) + 
  geom_point() +
  geom_linerange(mapping = aes(ymin = y / count - 2 * se_fit,
                               ymax = y / count + 2 * se_fit), alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1) +
  labs(x = "Predicted probability", y = "Observed proportion")

  • The Hosmer-Lemeshow test formalizes this idea by testing the statistic \[ X_{\text{HL}}^2 = \sum_{j=1}^J \frac{(y_j - m_j \widehat{p}_j)^2}{m_j \widehat{p}_j (1 - \widehat{p}_j)} \] against the \(\chi^2\) distribution with \(J-1\) degrees of freedom.
# Hosmer-Lemeshow test statistic
(hlstat <- with(wcgs_binned, sum((y - count * avgpred)^2 / (count * avgpred * (1 - avgpred)))))
## [1] 64.87001
# J
nrow(wcgs_binned)
## [1] 52
# p-value
pchisq(hlstat, nrow(wcgs_binned) - 1, lower.tail = FALSE)
## [1] 0.09172918

We see a moderate p-value (0.092), which indicates no significant lack of fit.

ROC curve

  • Logistic regression is often used as a tool for classification.

  • If we choose a threshold, say 0.2, then the predicted probabilities give a classification rule \[ \text{case $i$ is a} \begin{cases} \text{"success"} & \text{if } \widehat{p}_i \ge 0.2 \\ \text{"failure"} & \text{if } \widehat{p}_i < 0.2 \end{cases}. \]

wcgs %>% 
  mutate(predprob = predict(lmod, type = "response")) %>% 
  mutate(predout = ifelse(predprob >= 0.2, "yes", "no")) %>%
  xtabs(~ chd + predout, data = .)
##      predout
## chd     no  yes
##   no  2886   11
##   yes  254    3
  • With this classification rule, we see the error rate is about
(11 + 254) / (2886 + 254 + 11 + 3)
## [1] 0.08402029

The sensitivity is \[ \frac{\text{TP}}{\text{TP + FN}} = \frac{3}{257} = 1.17\% \] and the specificity is \[ \frac{\text{TN}}{\text{FP + TN}} = \frac{2886}{11 + 2886} = 99.62\% \]
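
These quantities can be verified from the counts in the cross-tabulation above:

# confusion-matrix counts at threshold 0.2 (from the table above)
tp <- 3; fn <- 254; fp <- 11; tn <- 2886
tp / (tp + fn) # sensitivity, about 0.0117
tn / (fp + tn) # specificity, about 0.9962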

  • If we lower the threshold, then we increase the sensitivity but decrease the specificity. If we plot sensitivity against 1-specificity by varying the threshold, then we get the receiver operating characteristic (ROC) curve.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
lmod_roc <- roc(chd ~ predprob, wcgs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
ggroc(lmod_roc)

The area under the curve (AUC) is a measure of the overall performance of the classifier; a larger AUC means better classification performance.

auc(lmod_roc)
## Area under the curve: 0.6007
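
If a single operating point must be chosen, pROC can suggest the threshold maximizing Youden's J (sensitivity + specificity - 1); a sketch:

# threshold (on the predicted probability) maximizing Youden's J
coords(lmod_roc, x = "best", best.method = "youden")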

Model selection by AIC

wcgs <- wcgs %>%
  # 1 in = 0.0254 m, 1 lb = 0.4536 kg
  # note: the conventional BMI formula is 703 * weight (lb) / height (in)^2
  mutate(bmi = 703 * weight / height)
biglm <- glm(chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep + cigs + arcus,
             family = binomial, data = wcgs)

and then do a sequential backward search using the step() function:

stats::step(biglm, trace = TRUE, direction = "backward") %>%
  tbl_regression() %>%
  bold_labels()
## Start:  AIC=1591.05
## chd ~ age + height + weight + bmi + sdp + dbp + chol + dibep + 
##     cigs + arcus
## 
##          Df Deviance    AIC
## - dbp     1   1569.1 1589.1
## - weight  1   1569.3 1589.3
## - bmi     1   1569.5 1589.5
## - height  1   1569.5 1589.5
## <none>        1569.0 1591.0
## - arcus   1   1571.1 1591.1
## - sdp     1   1576.8 1596.8
## - dibep   1   1590.4 1610.4
## - cigs    1   1592.0 1612.0
## - age     1   1593.7 1613.7
## - chol    1   1619.8 1639.8
## 
## Step:  AIC=1589.06
## chd ~ age + height + weight + bmi + sdp + chol + dibep + cigs + 
##     arcus
## 
##          Df Deviance    AIC
## - weight  1   1569.4 1587.4
## - bmi     1   1569.5 1587.5
## - height  1   1569.5 1587.5
## <none>        1569.1 1589.1
## - arcus   1   1571.2 1589.2
## - sdp     1   1586.2 1604.2
## - dibep   1   1590.4 1608.4
## - cigs    1   1592.5 1610.5
## - age     1   1593.7 1611.7
## - chol    1   1619.9 1637.9
## 
## Step:  AIC=1587.36
## chd ~ age + height + bmi + sdp + chol + dibep + cigs + arcus
## 
##          Df Deviance    AIC
## - height  1   1570.3 1586.3
## <none>        1569.4 1587.4
## - arcus   1   1571.5 1587.5
## - bmi     1   1574.4 1590.4
## - sdp     1   1586.7 1602.7
## - dibep   1   1590.7 1606.7
## - cigs    1   1592.8 1608.8
## - age     1   1594.0 1610.0
## - chol    1   1620.0 1636.0
## 
## Step:  AIC=1586.32
## chd ~ age + bmi + sdp + chol + dibep + cigs + arcus
## 
##         Df Deviance    AIC
## <none>       1570.3 1586.3
## - arcus  1   1572.6 1586.6
## - bmi    1   1577.3 1591.3
## - sdp    1   1587.2 1601.2
## - dibep  1   1591.9 1605.9
## - age    1   1594.2 1608.2
## - cigs   1   1594.6 1608.6
## - chol   1   1620.0 1634.0
Characteristic  log(OR)^1  95% CI^1       p-value
age             0.06       0.04, 0.08     <0.001
bmi             0.00       0.00, 0.00     0.008
sdp             0.02       0.01, 0.03     <0.001
chol            0.01       0.01, 0.01     <0.001
dibep
  A (reference)
  B             -0.66      -1.0, -0.38    <0.001
cigs            0.02       0.01, 0.03     <0.001
arcus
  absent (reference)
  present       0.22       -0.06, 0.50    0.13
^1 OR = Odds Ratio, CI = Confidence Interval

Model selection by lasso

(wcgs <- wcgs %>% 
  select(-c(behave, typechd, timechd)) %>%
  drop_na())
## # A tibble: 3,140 × 11
##      age height weight   sdp   dbp  chol  cigs dibep chd   arcus     bmi
##    <int>  <int>  <int> <int> <int> <int> <int> <fct> <fct> <fct>   <dbl>
##  1    49     73    150   110    76   225    25 A     no    absent  1445.
##  2    42     70    160   154    84   177    20 A     no    present 1607.
##  3    42     69    160   110    78   181     0 B     no    absent  1630.
##  4    41     68    152   124    78   132    20 B     no    absent  1571.
##  5    59     70    150   144    86   255    20 B     yes   present 1506.
##  6    44     72    204   150    90   182     0 B     no    absent  1992.
##  7    44     72    164   130    84   155     0 B     no    absent  1601.
##  8    40     71    150   138    60   140     0 A     no    absent  1485.
##  9    43     72    190   146    76   149    25 B     no    absent  1855.
## 10    42     70    175   132    90   325     0 A     no    present 1758.
## # … with 3,130 more rows

Split the data into 80% training cases and 20% validation cases.

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-6
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:faraway':
## 
##     melanoma
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(themis) # provides up/down-sampling methods for the data
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:Matrix':
## 
##     update
## The following objects are masked from 'package:gtsummary':
## 
##     all_double, all_factor, all_integer, all_logical, all_numeric
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
# set seed for reproducibility
set.seed(200)

# list = FALSE requests the result as a matrix (of row positions), not a list
training_samples <- wcgs$chd %>%
  createDataPartition(p = 0.8, list = FALSE)
# (train_data <- wcgs %>%
#   slice(training_samples[, 1]))
# (val_data   <- wcgs %>%
#   slice(-training_samples[, 1]))

The glmnet package takes a matrix (of predictors) and a vector (of responses) as input. We use the model.matrix() function to create them. glmnet adds an intercept by default, so we drop the intercept term when forming the x matrix.

# X and y from original data
x_all <- model.matrix(
  chd ~ - 1 + age + height + weight + bmi + sdp + dbp + chol + dibep + cigs + arcus, 
  data = wcgs)
y_all <- ifelse(wcgs$chd == "yes", 1, 0)
# training X and y
x_train <- x_all[training_samples[, 1], ]
y_train <- y_all[training_samples[, 1]]
# validation X and y
x_val <- x_all[-training_samples[, 1], ]
y_val <- y_all[-training_samples[, 1]]

Fit the lasso regression and plot the solution path:

lasso_fit <- glmnet(x_train, y_train, family = "binomial", alpha = 1)
summary(lasso_fit)
##            Length Class     Mode     
## a0          49    -none-    numeric  
## beta       539    dgCMatrix S4       
## df          49    -none-    numeric  
## dim          2    -none-    numeric  
## lambda      49    -none-    numeric  
## dev.ratio   49    -none-    numeric  
## nulldev      1    -none-    numeric  
## npasses      1    -none-    numeric  
## jerr         1    -none-    numeric  
## offset       1    -none-    logical  
## classnames   2    -none-    character
## call         5    -none-    call     
## nobs         1    -none-    numeric
plot(lasso_fit, xvar = "lambda", label = TRUE)

# predict validation case probabilities at different \lambda values and calculate test deviance
pred_val <- predict(lasso_fit, newx = x_val, type = "response", s = lasso_fit$lambda)
dev_val <- -2 * colSums(y_val * log(pred_val) + (1 - y_val) * log(1 - pred_val))
tibble(lambda = lasso_fit$lambda, dev_val = dev_val) %>%
  ggplot() + 
  geom_point(mapping = aes(x = lambda, y = dev_val)) + 
  scale_x_log10() +
  labs(y = "Binomial deviance on validation set", x = "Lambda")

From the graph, it seems that \(\lambda = 0.005\) yields the model that performs best on the validation set. Alternatively, cv.glmnet tunes \(\lambda\) by cross-validation on the full data:

set.seed(200)
cv_lasso <- cv.glmnet(x_all, y_all, alpha = 1, family = "binomial", 
                      type.measure = "auc")
plot(cv_lasso)

The plot displays the cross-validation performance (deviance by default; we chose AUC here) against the log of lambda. The left dashed vertical line indicates that the optimal value of log(lambda) is approximately -5.2, which maximizes the average cross-validated AUC. This lambda value gives the most accurate model. The exact value of lambda and the corresponding model can be viewed as follows:

cv_lasso$lambda.min
## [1] 0.005696827
coef(cv_lasso, cv_lasso$lambda.min)
## 12 x 1 sparse Matrix of class "dgCMatrix"
##                         s1
## (Intercept)  -1.040951e+01
## age           4.954011e-02
## height        .           
## weight        5.334625e-03
## bmi           .           
## sdp           1.537576e-02
## dbp           .           
## chol          9.228292e-03
## dibepA        5.162308e-01
## dibepB       -5.494421e-15
## cigs          1.656021e-02
## arcuspresent  9.149817e-02

We see that this model differs from the best model chosen by AIC by replacing the predictor bmi with weight.
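
As a quick check (a sketch reusing the validation split and the pROC package loaded earlier), we can score the held-out cases at the selected penalty:

# predicted probabilities on the validation set at lambda.min
pred_min <- predict(cv_lasso, newx = x_val, s = "lambda.min", type = "response")
# validation AUC
auc(roc(y_val, as.vector(pred_min)))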

Try tidymodels

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.1     ✔ rsample      1.1.1
## ✔ dials        1.2.0     ✔ tune         1.1.0
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.1.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.0.4     ✔ yardstick    1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ recipes::all_double()    masks gtsummary::all_double()
## ✖ recipes::all_factor()    masks gtsummary::all_factor()
## ✖ recipes::all_integer()   masks gtsummary::all_integer()
## ✖ recipes::all_logical()   masks gtsummary::all_logical()
## ✖ recipes::all_numeric()   masks gtsummary::all_numeric()
## ✖ scales::discard()        masks purrr::discard()
## ✖ Matrix::expand()         masks tidyr::expand()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ recipes::fixed()         masks stringr::fixed()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ caret::lift()            masks purrr::lift()
## ✖ Matrix::pack()           masks tidyr::pack()
## ✖ yardstick::precision()   masks caret::precision()
## ✖ yardstick::recall()      masks caret::recall()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::spec()        masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step()          masks stats::step()
## ✖ Matrix::unpack()         masks tidyr::unpack()
## ✖ recipes::update()        masks Matrix::update(), stats::update()
## • Learn how to get started at https://www.tidymodels.org/start/
# For reproducibility
set.seed(203)

data_split <- initial_split(
  wcgs, 
  # stratify by chd
  strata = "chd", 
  prop = 0.75)

data_split
## <Training/Testing/Total>
## <2355/785/3140>
wcgs_other <- training(data_split)
dim(wcgs_other)
## [1] 2355   11
wcgs_test <- testing(data_split)
dim(wcgs_test)
## [1] 785  11
logit_recipe <- 
  recipe(
    chd ~  age + height + weight + bmi + sdp + dbp + chol + dibep + cigs + arcus, 
    data = wcgs_other
  ) %>%
  # mean imputation for chol
  step_impute_mean(chol) %>%
  # mode imputation for arcus
  step_impute_mode(arcus) %>%
  # create traditional dummy variables
  step_dummy(all_nominal_predictors()) %>%
  # zero-variance filter
  step_zv(all_numeric_predictors()) %>% 
  # center and scale numeric data
  step_normalize(all_numeric_predictors()) %>%
  # estimate the means and standard deviations
  prep(training = wcgs_other, retain = TRUE)
logit_recipe
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 10
## 
## ── Training information
## Training data contained 2355 data points and no incomplete rows.
## 
## ── Operations
## • Mean imputation for: chol | Trained
## • Mode imputation for: arcus | Trained
## • Dummy variables from: dibep, arcus | Trained
## • Zero variance filter removed: <none> | Trained
## • Centering and scaling for: age, height, weight, bmi, sdp, dbp, ... | Trained
  • Model
logit_mod <- 
  logistic_reg(
    penalty = tune(), 
    mixture = 1 # tune()
  ) %>% 
  set_engine("glmnet", standardize = FALSE)
logit_mod
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Engine-Specific Arguments:
##   standardize = FALSE
## 
## Computational engine: glmnet
  • Workflow
logit_wf <- workflow() %>%
  add_recipe(logit_recipe) %>%
  add_model(logit_mod)
logit_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
## 
## • step_impute_mean()
## • step_impute_mode()
## • step_dummy()
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = tune()
##   mixture = 1
## 
## Engine-Specific Arguments:
##   standardize = FALSE
## 
## Computational engine: glmnet
  • Tuning grid
param_grid <- grid_regular(
  penalty(range = c(-6, 3)), 
  # mixture(),
  levels = 100 # c(100, 5)
  )
param_grid
## # A tibble: 100 × 1
##       penalty
##         <dbl>
##  1 0.000001  
##  2 0.00000123
##  3 0.00000152
##  4 0.00000187
##  5 0.00000231
##  6 0.00000285
##  7 0.00000351
##  8 0.00000433
##  9 0.00000534
## 10 0.00000658
## # … with 90 more rows
  • Cross validation
# Set cross-validation partitions
set.seed(200)

folds <- vfold_cv(wcgs_other, v = 5)
folds
## #  5-fold cross-validation 
## # A tibble: 5 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1884/471]> Fold1
## 2 <split [1884/471]> Fold2
## 3 <split [1884/471]> Fold3
## 4 <split [1884/471]> Fold4
## 5 <split [1884/471]> Fold5
  • Fit cross-validation
system.time({
logit_fit <- logit_wf %>%
  tune_grid(
    resamples = folds,
    grid = param_grid,
    metrics = metric_set(roc_auc, accuracy)
    )
})
##    user  system elapsed 
##   2.061   0.084   2.149
logit_fit
## # Tuning results
## # 5-fold cross-validation 
## # A tibble: 5 × 4
##   splits             id    .metrics           .notes          
##   <list>             <chr> <list>             <list>          
## 1 <split [1884/471]> Fold1 <tibble [200 × 5]> <tibble [0 × 3]>
## 2 <split [1884/471]> Fold2 <tibble [200 × 5]> <tibble [0 × 3]>
## 3 <split [1884/471]> Fold3 <tibble [200 × 5]> <tibble [0 × 3]>
## 4 <split [1884/471]> Fold4 <tibble [200 × 5]> <tibble [0 × 3]>
## 5 <split [1884/471]> Fold5 <tibble [200 × 5]> <tibble [0 × 3]>
logit_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "roc_auc") %>%
  ggplot(mapping = aes(x = penalty, y = mean)) +
  geom_point() +
  labs(x = "Penalty", y = "CV AUC") +
  scale_x_log10()
## # A tibble: 200 × 7
##       penalty .metric  .estimator  mean     n std_err .config               
##         <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                 
##  1 0.000001   accuracy binary     0.921     5 0.00489 Preprocessor1_Model001
##  2 0.000001   roc_auc  binary     0.739     5 0.0240  Preprocessor1_Model001
##  3 0.00000123 accuracy binary     0.921     5 0.00489 Preprocessor1_Model002
##  4 0.00000123 roc_auc  binary     0.739     5 0.0240  Preprocessor1_Model002
##  5 0.00000152 accuracy binary     0.921     5 0.00489 Preprocessor1_Model003
##  6 0.00000152 roc_auc  binary     0.739     5 0.0240  Preprocessor1_Model003
##  7 0.00000187 accuracy binary     0.921     5 0.00489 Preprocessor1_Model004
##  8 0.00000187 roc_auc  binary     0.739     5 0.0240  Preprocessor1_Model004
##  9 0.00000231 accuracy binary     0.921     5 0.00489 Preprocessor1_Model005
## 10 0.00000231 roc_auc  binary     0.739     5 0.0240  Preprocessor1_Model005
## # … with 190 more rows

best_logit <- logit_fit %>%
  select_best("roc_auc")
best_logit
## # A tibble: 1 × 2
##   penalty .config               
##     <dbl> <chr>                 
## 1 0.00123 Preprocessor1_Model035

Finalize model

  • Final workflow
final_wf <- logit_wf %>%
  finalize_workflow(best_logit)
final_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 5 Recipe Steps
## 
## • step_impute_mean()
## • step_impute_mode()
## • step_dummy()
## • step_zv()
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.00123284673944207
##   mixture = 1
## 
## Engine-Specific Arguments:
##   standardize = FALSE
## 
## Computational engine: glmnet
# Fit the whole training set, then predict the test cases
final_fit <- 
  final_wf %>%
  last_fit(data_split)
final_fit
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits             id               .metrics .notes   .predictions .workflow 
##   <list>             <chr>            <list>   <list>   <list>       <list>    
## 1 <split [2355/785]> train/test split <tibble> <tibble> <tibble>     <workflow>
# Test metrics
final_fit %>% 
  collect_metrics()
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.910 Preprocessor1_Model1
## 2 roc_auc  binary         0.758 Preprocessor1_Model1
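
The fitted workflow (recipe plus glmnet model) can be pulled out of final_fit for later use; a sketch:

# extract the fitted workflow
final_wf_fit <- extract_workflow(final_fit)
# predicted class probabilities for a few test cases
predict(final_wf_fit, new_data = slice(wcgs_test, 1:5), type = "prob")
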
  • Selected variables’ coefficients
tidy(extract_model(final_fit)) %>%
  filter(lambda > 0.001 & lambda < 0.0011)
## Warning: `extract_model()` was deprecated in tune 0.1.6.
## ℹ Please use `extract_fit_engine()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 9 × 5
##   term           step estimate  lambda dev.ratio
##   <chr>         <dbl>    <dbl>   <dbl>     <dbl>
## 1 (Intercept)      41  -2.80   0.00106     0.109
## 2 age              41   0.293  0.00106     0.109
## 3 height           41  -0.0209 0.00106     0.109
## 4 bmi              41   0.216  0.00106     0.109
## 5 sdp              41   0.217  0.00106     0.109
## 6 chol             41   0.471  0.00106     0.109
## 7 cigs             41   0.290  0.00106     0.109
## 8 dibep_B          41  -0.294  0.00106     0.109
## 9 arcus_present    41   0.118  0.00106     0.109