Show figure code
<- bayestestR::simulate_correlation(n=500, r=0.7)
df |>
df ggplot(aes(x=V1, y=V2)) +
geom_point() +
geom_smooth(method="lm", formula = 'y ~ x', se=FALSE, linewidth=2, color="red") +
theme_bw()
2. On Regression Parameters in a Frequentist Framework
y
given x
with a testPearson's product-moment correlation
Parameter1 | Parameter2 | r | 95% CI | t(30) | p
----------------------------------------------------------------
mtcars$wt | mtcars$qsec | -0.17 | [-0.49, 0.19] | -0.97 | 0.339
Alternative hypothesis: true correlation is not equal to 0
ggplot(df, aes(x=V1, y=V2)) +
geom_point(alpha=0) +
geom_vline(xintercept = 0, linetype="dotted") +
geom_abline(intercept = 1, slope = 3, color="red", linewidth=2) +
geom_segment(aes(x = 0, y = 1, xend = 1, yend = 1), linewidth=1,
color="green", linetype="dashed") +
geom_segment(aes(x = 1, y = 1, xend = 1, yend = 4), linewidth=1,
color="green", linetype="solid", arrow=arrow(length = unit(0.1, "inches"))) +
geom_segment(aes(x = 0, y = 0, xend = 0, yend = 1), linewidth=1,
color="blue", linetype="solid", arrow=arrow(length = unit(0.1, "inches"))) +
geom_point(aes(x = 0, y = 0), color="purple", size=8, shape="+") +
labs(x="x", y="y") +
theme_bw() +
coord_cartesian(xlim = c(-1, 1.5), ylim = c(-3, 4))
mpg
qsec
mtcars
(built-in in R)lm()
function (linear model) with the response ~ predictor
formula to specify the modelsummary()
method to get the results
Call:
lm(formula = mpg ~ qsec, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
Call:
lm(formula = mpg ~ qsec10, data = df)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.114 10.030 -0.510 0.6139
qsec10 14.121 5.592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
p <- mtcars |>
ggplot(aes(x=qsec, y=mpg)) +
geom_point(alpha=0.7, size=6) +
geom_smooth(method="lm", formula = 'y ~ x', se=FALSE, linewidth=2) +
geom_segment(aes(x = qsec, y = mpg, xend = qsec, yend = predict(lm(mpg ~ qsec, data=mtcars))),
color="red", linetype="dotted", linewidth=1) +
theme_bw() +
labs(x="x", y="y", title="The residuals are the vertical distances between each point and the line.")
p
Ronald Fisher (1890 – 1962), “a genius who almost single-handedly created the foundations for modern statistical science”, introduced MLE in 1922
p <- mtcars |>
ggplot(aes(x=qsec, y=mpg)) +
geom_point(alpha=0.7, size=6) +
geom_smooth(method="lm", formula = 'y ~ x', se=FALSE, linewidth=2) +
geom_segment(aes(x = qsec, y = mpg, xend = qsec, yend = predict(lm(mpg ~ qsec, data=mtcars))),
color="red", linetype="dotted", linewidth=1) +
theme_bw() +
labs(x="x", y="y", title="The residuals are the vertical distances between each point and the line.")
p2 <- data.frame(Error=insight::get_residuals(lm(mpg ~ qsec, data=mtcars))) |>
ggplot(aes(x=Error)) +
geom_histogram(bins=10, fill="grey", color="black") +
geom_vline(xintercept = 0, linetype="dashed") +
geom_density(data=data.frame(Error=bayestestR::distribution_normal(n=100, sd = 2)),
aes(y=after_stat(density)*40), color="#F44336", linewidth=1, adjust=1) +
geom_density(data=data.frame(Error=bayestestR::distribution_normal(n=100, sd = 3)),
aes(y=after_stat(density)*50), color="#FF5722", linewidth=1, adjust=1) +
geom_density(data=data.frame(Error=bayestestR::distribution_normal(n=100, sd = 4)),
aes(y=after_stat(density)*60), color="#FF9800", linewidth=1, adjust=1) +
geom_point(aes(y=0), size=10, shape=16, alpha=0.3) +
theme_bw() +
coord_flip() +
labs(y = "Density")
p + theme(plot.title = element_blank()) | p2
model <- lm(mpg ~ qsec, data=mtcars)
p <- mtcars |>
ggplot(aes(x=qsec, y=mpg)) +
geom_smooth(method="lm", formula = 'y ~ x', se=FALSE, linewidth=2) +
theme_bw()
# Function to add normal distribution curves
add_normals <- function(p, model) {
sigma <- summary(model)$sigma # Standard deviation of residuals
n <- 100 # Number of points for each curve
for(i in 1:nrow(mtcars)) {
x_val <- mtcars$qsec[i]
y_pred <- predict(model, newdata = data.frame(qsec = x_val))
# Create a sequence of y values for the normal curve
y_seq <- seq(y_pred - 3*sigma, y_pred + 3*sigma, length.out = n)
density <- dnorm(y_seq, mean = y_pred, sd = sigma)
# Adjust density to match the scale of the plot
max_width <- 1 # Max width of areas
density_scaled <- (density / max(density)) * max_width
# Create a dataframe for each path
path_df <- data.frame(x = x_val + density_scaled, y = y_seq)
path_dfv <- data.frame(x=path_df$x[1], ymin=min(path_df$y), ymax=max(path_df$y))
# Add the path to the plot
p <- p +
geom_segment(data = path_dfv, aes(x = x, xend=x, y = ymin, yend=ymax),
color = "#FF9800", linewidth = 0.7, alpha=0.8, linetype="dotted") +
geom_path(data = path_df, aes(x = x, y = y),
color = "#FF9800", linewidth = 1, alpha=0.8)
}
p
}
# Create the final plot
p <- add_normals(p, model) +
geom_segment(aes(x = qsec, y = mpg, xend = qsec, yend = predict(lm(mpg ~ qsec, data=mtcars))),
color="red", linetype="solid", linewidth=1) +
geom_point(alpha=0.8, size=6)
p
Call:
lm(formula = mpg ~ qsec, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
Call:
glm(formula = mpg ~ qsec, family = "gaussian", data = mtcars)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 30.95518)
Null deviance: 1126.05 on 31 degrees of freedom
Residual deviance: 928.66 on 30 degrees of freedom
AIC: 204.59
Number of Fisher Scoring iterations: 2
Call:
lm(formula = mpg ~ qsec, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
Call:
lm(formula = mpg ~ qsec, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
# Plot t-distribution
x <- seq(-5, 5, length.out = 1000)
y <- dt(x, df=30)
df <- data.frame(x = x, y = y)
t_value <- insight::get_statistic(model)[2, 2]
df |>
ggplot(aes(x = x, y = y)) +
geom_area(color="black", fill="grey") +
geom_segment(aes(x = t_value, y = 0, xend = t_value, yend = dt(t_value, df=30)),
color="red", linetype="solid", linewidth=1) +
geom_point(aes(x = t_value, y = dt(t_value, df=30)), color="red", size=3) +
theme_minimal() +
labs(x = "\nt-values - standardized coefficient under the null hypothesis", y = "Probability",
title = paste0("t-Distribution (df=30); ", "t-value = ", round(t_value, 2), "\n"))
df$Probability <- ifelse(df$x < -t_value, "< -t", "Smaller")
df$Probability <- ifelse(df$x > t_value, "> +t", df$Probability)
df |>
ggplot(aes(x = x, y = y)) +
geom_line() +
geom_area(aes(x = x, y = y, fill = Probability), alpha = 0.5) +
geom_segment(aes(x = t_value, y = 0, xend = t_value, yend = dt(t_value, df=30)),
color="red", linetype="solid", linewidth=1) +
geom_point(aes(x = t_value, y = dt(t_value, df=30)), color="red", size=3) +
theme_minimal() +
scale_fill_manual(values = c("red", "red", "grey")) +
labs(x = "\nt-values - standardized coefficient under the null hypothesis", y = "Probability",
title = paste0("t-Distribution (df=30); ", "t-value = ", round(t_value, 2), "\n"))
Call:
lm(formula = mpg ~ qsec, data = mtcars)
Residuals:
Min 1Q Median 3Q Max
-9.8760 -3.4539 -0.7203 2.2774 11.6491
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.1140 10.0295 -0.510 0.6139
qsec 1.4121 0.5592 2.525 0.0171 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.564 on 30 degrees of freedom
Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
# Get 95% ETI of the t distribution
ci <- qt(c(0.025, 0.975), df=30, lower.tail = TRUE) + t_value
# ci * parameters::parameters(model)$SE[2] # Indeed shows the right CI
df |>
ggplot(aes(x = x, y = y)) +
geom_area(fill="grey", alpha=0.5) +
geom_area(aes(x = x + t_value, y = y), color="red", fill="red", alpha=0.5) +
geom_segment(aes(x = t_value, y = 0, xend = t_value, yend = dt(t_value, df=30)),
color="red", linetype="solid", linewidth=1) +
geom_point(aes(x = t_value, y = dt(t_value, df=30)), color="red", size=3) +
# Horizontal segment indicating the CI range
geom_segment(aes(x = ci[1], y = 0.05, xend = ci[2], yend = 0.05),
color="orange", linetype="solid", linewidth=1, arrow = arrow(ends='both')) +
geom_vline(xintercept = ci[1], color="orange", linetype="dotted", linewidth=0.5) +
geom_vline(xintercept = ci[2], color="orange", linetype="dotted", linewidth=0.5) +
theme_minimal() +
labs(x = "\nt-values - standardized coefficient under the null hypothesis", y = "Probability",
title = paste0("Shifted t-Distribution (df=30); ", "location = ", round(t_value, 2), "(t-value)\n"))
Thank you!