Least Squares Simulation

Example to show least squares minimization

This little example is meant as a way to show the least square really minimizes the criterion, $ \sum \left( Y - \hat{Y} \right)^2 $.

In this example, we will generate some data so that we know what the truth is. Then, upon data generation, we will compute a bunch of different values for the linear slope and y-intercept. For each combination of the y-intercept and slope, I will compute the sum of squares error depicted above.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(simglm)

theme_set(theme_bw(base_size = 18))

set.seed(2021)

sim_arguments <- list(
    formula = y ~ x,
    fixed = list(x = list(var_type = 'continuous', mean = 100, sd = 20)),
    error = list(variance = 100),
    sample_size = 1000,
    reg_weights = c(5, .5)
)

sim_data <- simulate_fixed(data = NULL, sim_arguments) %>%
  simulate_error(sim_arguments) %>%
  generate_response(sim_arguments)

head(sim_data)
##   X.Intercept.         x level1_id       error fixed_outcome random_effects
## 1            1  97.55080         1   8.0959540      53.77540              0
## 2            1 111.04913         2   0.9941633      60.52457              0
## 3            1 106.97299         3 -23.4302131      58.48650              0
## 4            1 107.19264         4   6.7652598      58.59632              0
## 5            1 117.96107         5 -36.1147374      63.98054              0
## 6            1  61.54861         6  -1.6416799      35.77430              0
##          y
## 1 61.87135
## 2 61.51873
## 3 35.05628
## 4 65.36158
## 5 27.86580
## 6 34.13262
library(ggformula)
## Loading required package: ggstance
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: ggridges
## 
## New to ggformula?  Try the tutorials: 
## 	learnr::run_tutorial("introduction", package = "ggformula")
## 	learnr::run_tutorial("refining", package = "ggformula")
gf_point(y ~ x, data = sim_data, size = 4) %>%
  gf_smooth(method = 'lm')

sim_lm <- lm (y ~ x, data = sim_data)
coef(sim_lm)
## (Intercept)           x 
##   5.9653484   0.4941165
y_intercept <- seq(0, 15, by = .25)
slope <- seq(0, 1.5, by = .01)

conditions <- rbind(expand.grid(y_intercept = y_intercept, 
                          slope = slope),
                          coef(sim_lm))

tail(conditions)
##      y_intercept     slope
## 9207   14.000000 1.5000000
## 9208   14.250000 1.5000000
## 9209   14.500000 1.5000000
## 9210   14.750000 1.5000000
## 9211   15.000000 1.5000000
## 9212    5.965348 0.4941165
dim(conditions)
## [1] 9212    2
gf_point(y ~ x, data = sim_data, size = 4) %>%
  gf_smooth(method = 'lm') %>%
  gf_abline(slope = ~slope, intercept = ~y_intercept, data = slice(conditions, 1), linetype = 2, size = 2) %>%
  gf_abline(slope = ~slope, intercept = ~y_intercept, data = slice(conditions, 855), linetype = 2, color = 'lightgreen', size = 2) %>%
  gf_refine(coord_cartesian(xlim = c(0, 160), ylim = c(0, 120)))

sum_square_error <- function(conditions, sim_data) {
    fitted <- conditions[['y_intercept']] + conditions[['slope']] * sim_data[['x']]

    deviation <- sim_data[['y']] - fitted

    sqrt((sum(deviation^2) / (nrow(sim_data) - 2)))
}

sum_square_error(conditions[1892, ], sim_data)
## [1] 26.74862
summary(sim_lm)$sigma
## [1] 10.18153
library(future)

plan(multisession)

conditions$sse <- unlist(lapply(1:nrow(conditions), function(xx) sum_square_error(conditions[xx, ], sim_data)))

head(conditions)
##   y_intercept slope      sse
## 1        0.00     0 57.37574
## 2        0.25     0 57.13345
## 3        0.50     0 56.89123
## 4        0.75     0 56.64908
## 5        1.00     0 56.40699
## 6        1.25     0 56.16498
plan(sequential)
Previous
Next