Regression
data(birthwt, package = "MASS")
birthwt <- birthwt %>%
mutate(smoke = factor(smoke, labels = c("Non-smoker", "Smoker")),
race = factor(race, labels = c("White", "African American", "Other"))) %>%
var_labels(bwt = 'Birth weight (g)',
smoke = 'Smoking status',
race = 'Race')
birthwt %>%
group_by(race, smoke) %>%
summarise(
n = n(),
Mean = mean(bwt, na.rm = TRUE),
SD = sd(bwt, na.rm = TRUE),
Median = median(bwt, na.rm = TRUE),
CV = rel_dis(bwt)
)
`summarise()` regrouping output by 'race' (override with `.groups` argument)
# A tibble: 6 x 7
# Groups: race [3]
race smoke n Mean SD Median CV
<fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
1 White Non-smoker 44 3429. 710. 3593 0.207
2 White Smoker 52 2827. 626. 2776. 0.222
3 African American Non-smoker 16 2854. 621. 2920 0.218
4 African American Smoker 10 2504 637. 2381 0.254
5 Other Non-smoker 55 2816. 709. 2807 0.252
6 Other Smoker 12 2757. 810. 3146. 0.294
birthwt %>%
gen_bst_df(bwt ~ race|smoke)
Birth weight (g) | LowerCI | UpperCI | Race | Smoking status |
3428.75 | 3214.98 | 3623.23 | White | Non-smoker |
2826.85 | 2661.95 | 2988.81 | White | Smoker |
2854.50 | 2565.29 | 3141.09 | African American | Non-smoker |
2504.00 | 2089.21 | 2867.61 | African American | Smoker |
2815.78 | 2632.03 | 3009.99 | Other | Non-smoker |
2757.17 | 2300.54 | 3144.49 | Other | Smoker |
birthwt %>%
bar_error(bwt ~ race, fill = ~ smoke) %>%
axis_labs() %>%
gf_labs(fill = "Smoking status:")
No summary function supplied, defaulting to `mean_se()`

model_norm <- lm(bwt ~ smoke + race, data = birthwt)
model_norm %>% Anova() %>% tidy()
term | sumsq | df | statistic | p.value |
smoke | 7322574.73 | 1.00 | 15.46 | 0.00 |
race | 8712354.03 | 2.00 | 9.20 | 0.00 |
Residuals | 87631355.83 | 185.00 | | |
model_norm %>% tidy()
term | estimate | std.error | statistic | p.value |
(Intercept) | 3334.95 | 91.78 | 36.34 | 0.00 |
smokeSmoker | -428.73 | 109.04 | -3.93 | 0.00 |
raceAfrican American | -450.36 | 153.12 | -2.94 | 0.00 |
raceOther | -452.88 | 116.48 | -3.89 | 0.00 |
model_norm %>%
glm_coef(labels = model_labels(model_norm))
Parameter | Coefficient | Pr(>|t|) |
Constant | 3334.95 (3036.11, 3633.78) | < 0.001 |
Smoking status: Smoker | -428.73 (-667.02, -190.44) | < 0.001 |
Race: African American | -450.36 (-750.31, -150.4) | 0.003 |
Race: Other | -452.88 (-775.17, -130.58) | 0.006 |
Compare models
model_norm %>%
glm_coef(se_rob = TRUE, labels = model_labels(model_norm))
Parameter | Coefficient | Pr(>|t|) |
Constant | 3334.95 (3036.11, 3633.78) | < 0.001 |
Smoking status: Smoker | -428.73 (-667.02, -190.44) | < 0.001 |
Race: African American | -450.36 (-750.31, -150.4) | 0.003 |
Race: Other | -452.88 (-775.17, -130.58) | 0.006 |
model_norm %>% glance()
r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual |
0.12 | 0.11 | 688.25 | 8.68 | 0.00 | 4 | -1501.11 | 3012.22 | 3028.43 | 87631355.83 | 185 |
model_norm %>%
plot_model("pred", terms = ~race|smoke, dot.size = 1.5, title = "")

emmip(model_norm, smoke ~ race) %>%
gf_labs(y = get_label(birthwt$bwt), x = "", col = "Smoking status")

Logistic regression
data(diet, package = "Epi")
diet <- diet %>%
mutate(
chd = factor(chd, labels = c("No CHD", "CHD"))
) %>%
var_labels(
chd = "Coronary Heart Disease",
fibre = "Fibre intake (10 g/day)"
)
diet %>% estat(~ fibre|chd)
| Coronary Heart Disease | N | Min. | Max. | Mean | Median | SD | CV |
Fibre intake (10 g/day) | No CHD | 288.00 | 0.60 | 5.35 | 1.75 | 1.69 | 0.58 | 0.33 |
| CHD | 45.00 | 0.76 | 2.43 | 1.49 | 1.51 | 0.40 | 0.27 |
diet %>%
gf_boxploth(chd ~ fibre, fill = "indianred3", alpha = 0.7) %>%
axis_labs()
Warning: Removed 4 rows containing non-finite values (stat_boxploth).

model_binom <- glm(chd ~ fibre, data = diet, family = binomial)
model_binom %>%
glm_coef(labels = model_labels(model_binom))
Parameter | Odds ratio | Pr(>|z|) |
Constant | 0.95 (0.3, 3.01) | 0.93 |
Fibre intake (10 g/day) | 0.33 (0.16, 0.67) | 0.00 |
model_binom %>%
plot_model("pred", terms = "fibre [all]", title = "")

Matched Case-Control Studies: Conditional Logistic Regression
data(bdendo, package = "Epi")
bdendo <- bdendo %>%
mutate(
cancer = factor(d, labels = c('Control', 'Case')),
gall = factor(gall, labels = c("No GBD", "GBD")),
est = factor(est, labels = c("No oestrogen", "Oestrogen"))
) %>%
var_labels(
cancer = 'Endometrial cancer',
gall = 'Gall bladder disease',
est = 'Oestrogen'
)
bdendo %>%
mutate(
cancer = relevel(cancer, ref = "Case"),
est = relevel(est, ref = "Oestrogen"),
gall = relevel(gall, ref = "GBD")
) %>%
copy_labels(bdendo) %>%
cross_tab(cancer ~ est + gall)
Endometrial cancer | levels | Case | Control | Total |
Total N (%) | | 63 (20.0) | 252 (80.0) | 315 |
Oestrogen | Oestrogen | 56 (88.9) | 127 (50.4) | 183 (58.1) |
| No oestrogen | 7 (11.1) | 125 (49.6) | 132 (41.9) |
Gall bladder disease | GBD | 17 (27.0) | 24 (9.5) | 41 (13.0) |
| No GBD | 46 (73.0) | 228 (90.5) | 274 (87.0) |
library(survival)
model_clogit <- clogit(cancer == 'Case' ~ est * gall + strata(set), data = bdendo)
model_clogit %>%
glm_coef(labels = c("Oestrogen/No oestrogen", "GBD/No GBD",
"Oestrogen:GBD Interaction"))
Parameter | Odds ratio | Pr(>|z|) |
Oestrogen/No oestrogen | 14.88 (4.49, 49.36) | < 0.001 |
GBD/No GBD | 18.07 (3.2, 102.01) | 0.001 |
Oestrogen:GBD Interaction | 0.13 (0.02, 0.9) | 0.039 |
require(ggeffects)
Loading required package: ggeffects
bdendo_pred <- ggemmeans(model_clogit, terms = c('gall', 'est'))
bdendo_pred %>%
gf_pointrange(predicted + conf.low + conf.high ~ x|group, col = ~ x) %>%
gf_labs(y = "P(cancer)", x = "", col = get_label(bdendo$gall))

Ordinal Logistic Regression
library(ordinal)
Attaching package: 'ordinal'
The following object is masked from 'package:dplyr':
slice
data(housing, package = "MASS")
housing <- housing %>%
var_labels(
Sat = "Satisfaction",
Infl = "Perceived influence",
Type = "Type of rental",
Cont = "Contact"
)
model_clm <- clm(Sat ~ Infl + Type + Cont, weights = Freq, data = housing)
model_clm %>%
glm_coef(labels = model_labels(model_clm, intercept = FALSE))
Parameter | Ordinal OR | Pr(>|Z|) |
Perceived influence: Low | 0.61 (0.48, 0.78) | < 0.001 |
Perceived influence: Medium | 2 (1.56, 2.55) | < 0.001 |
Contact: Low | 1.76 (1.44, 2.16) | < 0.001 |
Perceived influence: High | 3.63 (2.83, 4.66) | < 0.001 |
Contact: High | 0.56 (0.45, 0.71) | < 0.001 |
Type of rental: Apartment | 0.69 (0.51, 0.94) | 0.018 |
Type of rental: Atrium | 0.34 (0.25, 0.45) | < 0.001 |
Type of rental: Terrace | 1.43 (1.19, 1.73) | < 0.001 |
model_clm %>%
plot_model(type = "pred", terms = c("Infl", "Cont"),
dot.size = 1, title = "") %>%
gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

model_clm %>%
plot_model(type = "pred", terms = c("Infl", "Type"),
dot.size = 1, title = "") %>%
gf_theme(axis.text.x = element_text(angle = 45, hjust = 1))

emmip(model_clm, Infl ~ Type |Cont) %>%
gf_labs(x = "Type of rental", col = "Perceived influence")

Poisson Regression
data(quine, package = "MASS")
levels(quine$Eth) <- c("Aboriginal", "White")
levels(quine$Sex) <- c("Female", "Male")
quine <- quine %>%
var_labels(
Days = "Number of absent days",
Eth = "Ethnicity",
Age = "Age group"
)
EDA
quine %>%
group_by(Eth, Sex, Age) %>%
summarise(
n = n(),
Mean = mean(Days, na.rm = TRUE),
SD = sd(Days, na.rm = TRUE),
Median = median(Days, na.rm = TRUE),
CV = rel_dis(Days)
)
`summarise()` regrouping output by 'Eth', 'Sex' (override with `.groups` argument)
# A tibble: 16 x 8
# Groups: Eth, Sex [4]
Eth Sex Age n Mean SD Median CV
<fct> <fct> <fct> <int> <dbl> <dbl> <dbl> <dbl>
1 Aboriginal Female F0 5 17.6 17.4 11 0.987
2 Aboriginal Female F1 15 18.9 16.3 13 0.865
3 Aboriginal Female F2 9 32.6 27.3 20 0.839
4 Aboriginal Female F3 9 14.6 14.9 10 1.02
5 Aboriginal Male F0 8 11.5 7.23 12 0.629
6 Aboriginal Male F1 5 9.6 4.51 7 0.469
7 Aboriginal Male F2 11 30.9 17.8 32 0.576
8 Aboriginal Male F3 7 27.1 10.4 28 0.382
9 White Female F0 5 19.8 9.68 20 0.489
10 White Female F1 17 7.76 6.48 6 0.834
11 White Female F2 10 5.7 4.97 4 0.872
12 White Female F3 10 13.5 11.5 12 0.851
13 White Male F0 9 13.6 20.9 7 1.54
14 White Male F1 9 5.56 5.39 5 0.970
15 White Male F2 10 15.2 12.9 12 0.848
16 White Male F3 7 27.3 22.9 27 0.840
Model
model_pois <- glm(Days ~ Eth + Sex + Age, family = poisson, data = quine)
model_pois %>%
glm_coef(labels = model_labels(model_pois), se_rob = TRUE)
Parameter | Rate ratio | Pr(>|z|) |
Constant | 17.66 (10.66, 29.24) | < 0.001 |
Ethnicity: White | 0.59 (0.39, 0.88) | 0.01 |
Sex: Male | 1.11 (0.77, 1.6) | 0.57 |
Age group: F1 | 0.8 (0.42, 1.5) | 0.482 |
Age group: F2 | 1.42 (0.85, 2.36) | 0.181 |
Age group: F3 | 1.35 (0.77, 2.34) | 0.292 |
model_pois %>% glance()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
2073.53 | 145 | -1165.49 | 2342.98 | 2360.88 | 1742.50 | 140 |
Negative-binomial
deviance(model_pois) / df.residual(model_pois)
[1] 12.44646
Thus, we have over-dispersion. One option is to use a negative binomial distribution.
library(MASS)
Attaching package: 'MASS'
The following objects are masked _by_ '.GlobalEnv':
birthwt, housing, quine
The following object is masked from 'package:dplyr':
select
model_negbin <- glm.nb(Days ~ Eth + Sex + Age, data = quine)
model_negbin %>%
glm_coef(labels = model_labels(model_negbin), se_rob = TRUE)
Parameter | Rate ratio | Pr(>|z|) |
Constant | 20.24 (12.24, 33.47) | < 0.001 |
Ethnicity: White | 0.57 (0.38, 0.84) | 0.005 |
Sex: Male | 1.07 (0.74, 1.53) | 0.73 |
Age group: F1 | 0.69 (0.39, 1.23) | 0.212 |
Age group: F2 | 1.2 (0.7, 2.03) | 0.507 |
Age group: F3 | 1.29 (0.73, 2.28) | 0.385 |
model_negbin %>% glance()
null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual |
192.24 | 145 | -547.83 | 1109.65 | 1130.54 | 167.86 | 140 |
age group is a factor with more than two levels and is significant:
model_negbin %>% Anova()
LR Chisq | Df | Pr(>Chisq) |
12.66 | 1.00 | 0.00 |
0.15 | 1.00 | 0.70 |
9.48 | 3.00 | 0.02 |
model_negbin %>%
plot_model(type = "pred", terms = c("Age", "Eth"),
dot.size = 1.5, title = "")

emmip(model_negbin, Eth ~ Age|Sex) %>%
gf_labs(y = "Number of absent days", x = "Age group", col = "Ethnicity")
## Adjusting CIs and p-values for multiple comparisons
multiple(model_negbin, ~ Age|Eth)$df
contrast | Eth | ratio | SE | z.ratio | p.value | lower.CL | upper.CL |
F0 / F1 | Aboriginal | 1.44 | 0.34 | 1.57 | 0.40 | 0.79 | 2.62 |
F0 / F2 | Aboriginal | 0.84 | 0.19 | -0.77 | 0.86 | 0.46 | 1.52 |
F0 / F3 | Aboriginal | 0.78 | 0.19 | -1.04 | 0.72 | 0.42 | 1.45 |
F1 / F2 | Aboriginal | 0.58 | 0.12 | -2.65 | 0.04 | 0.34 | 0.98 |
F1 / F3 | Aboriginal | 0.54 | 0.12 | -2.89 | 0.02 | 0.31 | 0.93 |
F2 / F3 | Aboriginal | 0.93 | 0.20 | -0.34 | 0.99 | 0.53 | 1.63 |
F0 / F1 | White | 1.44 | 0.34 | 1.57 | 0.40 | 0.79 | 2.62 |
F0 / F2 | White | 0.84 | 0.19 | -0.77 | 0.86 | 0.46 | 1.52 |
F0 / F3 | White | 0.78 | 0.19 | -1.04 | 0.72 | 0.42 | 1.45 |
F1 / F2 | White | 0.58 | 0.12 | -2.65 | 0.04 | 0.34 | 0.98 |
F1 / F3 | White | 0.54 | 0.12 | -2.89 | 0.02 | 0.31 | 0.93 |
F2 / F3 | White | 0.93 | 0.20 | -0.34 | 0.99 | 0.53 | 1.63 |
multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
gf_labs(x = "IRR")

multiple(model_negbin, ~ Age|Eth)$fig_ci %>%
gf_labs(x = "IRR")

Survival Analysis
effect of thiotepa versus placebo on the development of bladder cancer.
library(survival)
data(bladder)
bladder <- bladder %>%
mutate(times = stop,
rx = factor(rx, labels=c("Placebo", "Thiotepa"))
) %>%
var_labels(times = "Survival time",
rx = "Treatment")
Parametric method
model_surv <- survreg(Surv(times, event) ~ rx, data = bladder)
model_surv %>%
glm_coef(labels = c("Treatment: Thiotepa/Placebo", "Scale"), se_rob = TRUE)
Parameter | Survival time ratio | Pr(>|z|) |
Treatment: Thiotepa/Placebo | 1.64 (0.89, 3.04) | 0.116 |
Scale | 1 (0.85, 1.18) | 0.992 |
model_exp <- survreg(Surv(times, event) ~ rx, data = bladder, dist = "exponential")
model_exp %>%
glm_coef(labels = c("Treatment: Thiotepa/Placebo"), se_rob = TRUE)
Parameter | Survival time ratio | Pr(>|z|) |
Treatment: Thiotepa/Placebo | 1.64 (0.85, 3.16) | 0.139 |
Interpretation: Patients receiving Thiotepa live on average 64% more than those in the Placebo group.
Using naive standard errors
model_exp %>%
glm_coef(labels = c("Treatment: Thiotepa/Placebo"))
Parameter | Survival time ratio | Pr(>|z|) |
Treatment: Thiotepa/Placebo | 1.64 (0.85, 3.16) | 0.139 |
model_exp %>%
plot_model(type = "pred", terms = ~ rx, dot.size = 1.5, title = "") %>%
gf_labs(y = "Survival time", x = "Treatment", title = "")
### Cox proportional hazards regression
model_cox <- coxph(Surv(times, event) ~ rx, data = bladder)
model_cox %>%
glm_coef(labels = c("Treatment: Thiotepa/Placebo"))
Parameter | Hazard ratio | Pr(>|z|) |
Treatment: Thiotepa/Placebo | 0.64 (0.44, 0.94) | 0.02 |
model_cox %>%
plot_model(type = "pred", terms = ~ rx, dot.size = 1.5,
title = "") %>%
gf_labs(x = "Treatment", title = "")

Mixed Linear Regression Models
Continuous outcomes
relationship between sex and age on the distance from the pituitary to the pterygomaxillary fissure (mm).
library(nlme)
Attaching package: 'nlme'
The following objects are masked from 'package:ordinal':
ranef, VarCorr
The following object is masked from 'package:dplyr':
collapse
data(Orthodont)
Orthodont <- Orthodont %>%
var_labels(
distance = "Pituitary distance (mm)",
age = "Age (years)"
)
model_lme <- lme(distance ~ Sex * I(age - mean(age, na.rm = TRUE)), random = ~ 1|Subject,
method = "ML", data = Orthodont)
model_lme %>%
glm_coef(labels = c(
"Constant",
"Sex: female-male",
"Age (years)",
"Sex:Age interaction"
))
Parameter | Coefficient | Pr(>|t|) |
Constant | 24.97 (24.03, 25.9) | < 0.001 |
Sex: female-male | -2.32 (-3.78, -0.86) | 0.005 |
Age (years) | 0.78 (0.63, 0.94) | < 0.001 |
Sex:Age interaction | -0.3 (-0.54, -0.07) | 0.015 |
model_lme %>%
plot_model("pred", terms = age ~ Sex,
show.data = TRUE, jitter = 0.1, dot.size = 1.5) %>%
gf_labs(y = get_label(Orthodont$distance), x = "Age (years)", title = "")
