Least Squares Simulation
Example to show least squares minimization
This little example is meant to show that least squares really does minimize the criterion, $ \sum \left( Y - \hat{Y} \right)^2 $.
In this example, we will generate some data so that we know what the truth is. After generating the data, we will try out a range of candidate values for the slope and y-intercept of the line. For each combination of y-intercept and slope, we will compute the sum of squared errors shown above.
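Written out for a candidate y-intercept $b_0$ and slope $b_1$, the criterion evaluated over the $n$ simulated observations is

$$ SSE(b_0, b_1) = \sum_{i=1}^{n} \left( y_i - b_0 - b_1 x_i \right)^2 $$

so every candidate line gets a single number summarizing how far its predictions fall from the observed data.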
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(simglm)
theme_set(theme_bw(base_size = 18))
set.seed(2021)
sim_arguments <- list(
  formula = y ~ x,
  fixed = list(x = list(var_type = 'continuous', mean = 100, sd = 20)),
  error = list(variance = 100),
  sample_size = 1000,
  reg_weights = c(5, .5)
)
sim_data <- simulate_fixed(data = NULL, sim_arguments) %>%
  simulate_error(sim_arguments) %>%
  generate_response(sim_arguments)
head(sim_data)
## X.Intercept. x level1_id error fixed_outcome random_effects
## 1 1 97.55080 1 8.0959540 53.77540 0
## 2 1 111.04913 2 0.9941633 60.52457 0
## 3 1 106.97299 3 -23.4302131 58.48650 0
## 4 1 107.19264 4 6.7652598 58.59632 0
## 5 1 117.96107 5 -36.1147374 63.98054 0
## 6 1 61.54861 6 -1.6416799 35.77430 0
## y
## 1 61.87135
## 2 61.51873
## 3 35.05628
## 4 65.36158
## 5 27.86580
## 6 34.13262
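For reference, with reg_weights = c(5, .5) and an error variance of 100, the model behind sim_data is $y = 5 + 0.5 x + e$ with $e \sim N(0, 10^2)$ and $x \sim N(100, 20^2)$, which you can verify from the fixed_outcome, error, and y columns above. A minimal base-R sketch of the same data-generating model (for illustration only; this is not the code simglm runs internally):

# not run: base-R equivalent of the simulated model, for illustration
n <- 1000
x <- rnorm(n, mean = 100, sd = 20)       # fixed predictor: mean 100, sd 20
e <- rnorm(n, mean = 0, sd = sqrt(100))  # error variance of 100
y <- 5 + 0.5 * x + e                     # reg_weights = c(5, .5)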
library(ggformula)
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
## Loading required package: scales
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
## Loading required package: ggridges
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
gf_point(y ~ x, data = sim_data, size = 4) %>%
  gf_smooth(method = 'lm')
sim_lm <- lm(y ~ x, data = sim_data)
coef(sim_lm)
## (Intercept) x
## 5.9653484 0.4941165
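As a quick check, the same estimates follow from the closed-form least squares formulas $\hat{\beta}_1 = \operatorname{cov}(x, y) / \operatorname{var}(x)$ and $\hat{\beta}_0 = \bar{y} - \hat{\beta}_1 \bar{x}$; a short sketch using only base R:

# sketch: closed-form least squares estimates, should match coef(sim_lm)
b1_hat <- with(sim_data, cov(x, y) / var(x))
b0_hat <- with(sim_data, mean(y) - b1_hat * mean(x))
c(b0_hat, b1_hat)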
y_intercept <- seq(0, 15, by = .25)
slope <- seq(0, 1.5, by = .01)
conditions <- rbind(expand.grid(y_intercept = y_intercept,
                                slope = slope),
                    coef(sim_lm))
tail(conditions)
## y_intercept slope
## 9207 14.000000 1.5000000
## 9208 14.250000 1.5000000
## 9209 14.500000 1.5000000
## 9210 14.750000 1.5000000
## 9211 15.000000 1.5000000
## 9212 5.965348 0.4941165
dim(conditions)
## [1] 9212 2
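The grid pairs 61 candidate y-intercepts (0 to 15 by .25) with 151 candidate slopes (0 to 1.5 by .01), giving 61 × 151 = 9211 combinations; the least squares estimates appended by rbind() make up the final, 9212th row.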
gf_point(y ~ x, data = sim_data, size = 4) %>%
  gf_smooth(method = 'lm') %>%
  gf_abline(slope = ~slope, intercept = ~y_intercept, data = slice(conditions, 1),
            linetype = 2, size = 2) %>%
  gf_abline(slope = ~slope, intercept = ~y_intercept, data = slice(conditions, 855),
            linetype = 2, color = 'lightgreen', size = 2) %>%
  gf_refine(coord_cartesian(xlim = c(0, 160), ylim = c(0, 120)))
sum_square_error <- function(conditions, sim_data) {
  # predicted values for one candidate y-intercept and slope
  fitted <- conditions[['y_intercept']] + conditions[['slope']] * sim_data[['x']]
  deviation <- sim_data[['y']] - fitted
  # return the residual standard error, sqrt(SSE / (n - 2)); this is a monotone
  # function of the sum of squared errors, so it is minimized at the same values
  sqrt(sum(deviation^2) / (nrow(sim_data) - 2))
}
sum_square_error(conditions[1892, ], sim_data)
## [1] 26.74862
summary(sim_lm)$sigma
## [1] 10.18153
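These two numbers differ because row 1892 of the grid (y-intercept 0, slope .31) is not the least squares solution. Evaluating the function at the appended lm() coefficients, the final row of conditions, should reproduce summary(sim_lm)$sigma exactly, since the function returns the residual standard error:

# sketch: at the least squares estimates the criterion equals summary(sim_lm)$sigma
sum_square_error(conditions[nrow(conditions), ], sim_data)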
library(future)
plan(multisession)
conditions$sse <- unlist(lapply(1:nrow(conditions),
                                function(xx) sum_square_error(conditions[xx, ], sim_data)))
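One note on the parallel setup: base lapply() does not use the multisession plan set above, so this loop runs sequentially. A future-aware mapper such as future.apply::future_lapply() would be needed to spread the work across sessions; a sketch, assuming the future.apply package is installed:

# not run: parallel version of the same computation via future.apply
# conditions$sse <- unlist(future.apply::future_lapply(
#   1:nrow(conditions),
#   function(xx) sum_square_error(conditions[xx, ], sim_data)))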
head(conditions)
## y_intercept slope sse
## 1 0.00 0 57.37574
## 2 0.25 0 57.13345
## 3 0.50 0 56.89123
## 4 0.75 0 56.64908
## 5 1.00 0 56.40699
## 6 1.25 0 56.16498
plan(sequential)
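Finally, since the grid now carries the criterion for every candidate line, the row with the smallest value should match the least squares estimates from lm(), which were appended as the last row of the grid; a quick sketch to confirm:

# sketch: the grid row minimizing the criterion should equal coef(sim_lm)
conditions[which.min(conditions$sse), ]
coef(sim_lm)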