Review - Class
Review for PSQF 6243
This serves as a non-exhaustive review for the course. These are elements that I assume you have knowledge of prior to starting the course.
- Variable vs constant attributes
- Types of variables (i.e., nominal, ordinal, interval, ratio)
- Descriptive Statistics (e.g., mean, median, standard deviation, variance, percentiles)
- Higher order moments (e.g., skewness and kurtosis)
- Exploring/summarizing univariate distributions (e.g., histogram or density figure)
- What is a statistical model? Why do we use them?
- Population vs Sample
Examples
Mario Kart 64 world record data:
variable | class | description |
---|---|---|
track | character | Track name |
type | factor | Single or three lap record |
shortcut | factor | Shortcut or non-shortcut record |
player | character | Player’s name |
system_played | character | Used system (NTSC or PAL) |
date | date | World record date |
time_period | period | Time as hms period |
time | double | Time in seconds |
record_duration | double | Record duration in days |
# load some libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggformula)
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
##
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## Loading required package: scales
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
##
## Loading required package: ggridges
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
##
## The following object is masked from 'package:Matrix':
##
## mean
##
## The following object is masked from 'package:scales':
##
## rescale
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(e1071)
# Use the black-and-white ggplot2 theme with an 18 pt base font for all figures
theme_set(theme_bw(base_size = 18))

# load in some data
# Read the Mario Kart 64 world-record file and split the record date into
# separate year / month / day-of-month columns.
mariokart <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-25/records.csv"
) %>%
  mutate(
    year = year(date),
    month = month(date),
    # day-of-month; this previously called month(date), which made `day` an
    # exact copy of `month`
    day = day(date)
  )
## Rows: 2334 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): track, type, shortcut, player, system_played, time_period
## dbl (2): time, record_duration
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the record data
mariokart |>
  head()
## # A tibble: 6 × 12
## track type short…¹ player syste…² date time_…³ time recor…⁴ year
## <chr> <chr> <chr> <chr> <chr> <date> <chr> <dbl> <dbl> <dbl>
## 1 Luigi Rac… Thre… No Salam NTSC 1997-02-15 2M 12.… 133. 1 1997
## 2 Luigi Rac… Thre… No Booth NTSC 1997-02-16 2M 9.9… 130. 0 1997
## 3 Luigi Rac… Thre… No Salam NTSC 1997-02-16 2M 8.9… 129. 12 1997
## 4 Luigi Rac… Thre… No Salam NTSC 1997-02-28 2M 6.9… 127. 7 1997
## 5 Luigi Rac… Thre… No Gregg… NTSC 1997-03-07 2M 4.5… 125. 54 1997
## 6 Luigi Rac… Thre… No Rocky… NTSC 1997-04-30 2M 2.8… 123. 0 1997
## # … with 2 more variables: month <dbl>, day <dbl>, and abbreviated variable
## # names ¹shortcut, ²system_played, ³time_period, ⁴record_duration
# univariate distribution of time

# Histogram of every record time, 30 bins
gf_histogram(~ time, data = mariokart, bins = 30, color = "black") |>
  gf_labs(x = "Time (in seconds)")

# Same histogram with the bars filled by shortcut status
gf_histogram(
  ~ time,
  fill = ~ shortcut,
  data = mariokart,
  bins = 30,
  color = "black"
) |>
  gf_labs(x = "Time (in seconds)")

# Density curve of every record time
gf_density(~ time, data = mariokart) |>
  gf_labs(x = "Time (in seconds)")

# Density curves filled by shortcut status
gf_density(
  ~ time,
  color = "black",
  fill = ~ shortcut,
  data = mariokart
) |>
  gf_labs(x = "Time (in seconds)")
# Numerical summary of record time: centre, spread, shape, and the
# 10th / 50th / 90th percentiles
df_stats(
  ~ time,
  data = mariokart,
  mean, median, sd, skewness, kurtosis,
  quantile(probs = c(0.1, 0.5, 0.9))
)
## response mean median sd skewness kurtosis 10% 50% 90%
## 1 time 90.62383 86.19 66.6721 1.771732 3.844745 31.31 86.19 171.961
# Number of records for each track / shortcut combination
mariokart |>
  count(track, shortcut)
## # A tibble: 28 × 3
## track shortcut n
## <chr> <chr> <int>
## 1 Banshee Boardwalk No 83
## 2 Bowser's Castle No 69
## 3 Choco Mountain No 77
## 4 Choco Mountain Yes 71
## 5 D.K.'s Jungle Parkway No 106
## 6 D.K.'s Jungle Parkway Yes 74
## 7 Frappe Snowland No 93
## 8 Frappe Snowland Yes 87
## 9 Kalimari Desert No 102
## 10 Kalimari Desert Yes 67
## # … with 18 more rows
# Density of record times on Choco Mountain only, filled by shortcut status and
# faceted by record type with a free x-axis per panel.
# The filtered rows reach gf_density() through the pipe, so no `data =` argument
# is given: naming the unfiltered `mariokart` again would compete with the
# subset this figure is meant to show.
mariokart |>
  filter(track == "Choco Mountain") |>
  gf_density(~ time, color = "black", fill = ~ shortcut) |>
  gf_labs(x = "Time (in seconds)") |>
  # `scales` written out in full rather than relying on partial matching
  gf_facet_wrap(~ type, scales = "free_x")
# Number of records for each track / system combination
mariokart |>
  count(track, system_played)
## # A tibble: 32 × 3
## track system_played n
## <chr> <chr> <int>
## 1 Banshee Boardwalk NTSC 21
## 2 Banshee Boardwalk PAL 62
## 3 Bowser's Castle NTSC 18
## 4 Bowser's Castle PAL 51
## 5 Choco Mountain NTSC 56
## 6 Choco Mountain PAL 92
## 7 D.K.'s Jungle Parkway NTSC 47
## 8 D.K.'s Jungle Parkway PAL 133
## 9 Frappe Snowland NTSC 65
## 10 Frappe Snowland PAL 115
## # … with 22 more rows
# Density of single-lap record times on Moo Moo Farm, filled by system.
# The filtered rows reach gf_density() through the pipe, so no `data =` argument
# is given: naming the unfiltered `mariokart` again would compete with the
# subset this figure is meant to show.
mariokart |>
  filter(track == "Moo Moo Farm", type == "Single Lap") |>
  gf_density(~ time, color = "black", fill = ~ system_played) |>
  gf_labs(x = "Time (in seconds)")
# Number of records of each record type
mariokart |>
  count(type)
## # A tibble: 2 × 2
## type n
## <chr> <int>
## 1 Single Lap 1123
## 2 Three Lap 1211
# Compact listing of every column and its storage type
mariokart |>
  str()
## tibble [2,334 × 12] (S3: tbl_df/tbl/data.frame)
## $ track : chr [1:2334] "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" ...
## $ type : chr [1:2334] "Three Lap" "Three Lap" "Three Lap" "Three Lap" ...
## $ shortcut : chr [1:2334] "No" "No" "No" "No" ...
## $ player : chr [1:2334] "Salam" "Booth" "Salam" "Salam" ...
## $ system_played : chr [1:2334] "NTSC" "NTSC" "NTSC" "NTSC" ...
## $ date : Date[1:2334], format: "1997-02-15" "1997-02-16" ...
## $ time_period : chr [1:2334] "2M 12.99S" "2M 9.99S" "2M 8.99S" "2M 6.99S" ...
## $ time : num [1:2334] 133 130 129 127 125 ...
## $ record_duration: num [1:2334] 1 0 12 7 54 0 0 27 0 64 ...
## $ year : num [1:2334] 1997 1997 1997 1997 1997 ...
## $ month : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
## $ day : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
Bivariate Association
# Correlation between record time and record duration over all rows,
# rounded to two decimals
round(
  cor(time ~ record_duration, data = mariokart),
  2
)
## [1] -0.07
# Scatterplot of record time against record duration for all rows, with a
# fitted straight line overlaid
gf_point(time ~ record_duration, data = mariokart) |>
  gf_smooth(method = "lm") |>
  gf_labs(
    x = "How long the record was held",
    y = "Time (in seconds)"
  )
# Correlation between record time and record duration within every
# track / type / shortcut / system combination, together with the number of
# records in that combination, sorted from most negative to most positive.
mariokart %>%
  group_by(track, type, shortcut, system_played) %>%
  summarise(
    correlation = cor(time ~ record_duration),
    num = n(),
    # return an ungrouped table so no grouping carries into later steps and
    # summarise() does not emit its "grouped output" message
    .groups = "drop"
  ) %>%
  arrange(correlation)
## `summarise()` has grouped output by 'track', 'type', 'shortcut'. You can
## override using the `.groups` argument.
## # A tibble: 112 × 6
## # Groups: track, type, shortcut [56]
## track type shortcut system_played correlation num
## <chr> <chr> <chr> <chr> <dbl> <int>
## 1 Rainbow Road Three Lap No NTSC -1.00 3
## 2 Rainbow Road Single Lap No NTSC -0.914 4
## 3 Rainbow Road Single Lap Yes NTSC -0.914 4
## 4 Sherbet Land Three Lap Yes NTSC -0.779 5
## 5 Wario Stadium Three Lap Yes PAL -0.682 7
## 6 Luigi Raceway Three Lap Yes PAL -0.630 9
## 7 Moo Moo Farm Single Lap No NTSC -0.582 19
## 8 Choco Mountain Three Lap No PAL -0.527 31
## 9 D.K.'s Jungle Parkway Three Lap No NTSC -0.495 5
## 10 Yoshi Valley Three Lap Yes PAL -0.469 10
## # … with 102 more rows
# Scatterplot with a fitted straight line for a single combination:
# Mario Raceway, three-lap, no shortcut, NTSC
mariokart |>
  filter(
    track == "Mario Raceway",
    type == "Three Lap",
    shortcut == "No",
    system_played == "NTSC"
  ) |>
  gf_point(time ~ record_duration) |>
  gf_smooth(method = "lm") |>
  gf_labs(
    x = "How long the record was held",
    y = "Time (in seconds)"
  )
# Compact listing of every column and its storage type
mariokart |>
  str()
## tibble [2,334 × 12] (S3: tbl_df/tbl/data.frame)
## $ track : chr [1:2334] "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" ...
## $ type : chr [1:2334] "Three Lap" "Three Lap" "Three Lap" "Three Lap" ...
## $ shortcut : chr [1:2334] "No" "No" "No" "No" ...
## $ player : chr [1:2334] "Salam" "Booth" "Salam" "Salam" ...
## $ system_played : chr [1:2334] "NTSC" "NTSC" "NTSC" "NTSC" ...
## $ date : Date[1:2334], format: "1997-02-15" "1997-02-16" ...
## $ time_period : chr [1:2334] "2M 12.99S" "2M 9.99S" "2M 8.99S" "2M 6.99S" ...
## $ time : num [1:2334] 133 130 129 127 125 ...
## $ record_duration: num [1:2334] 1 0 12 7 54 0 0 27 0 64 ...
## $ year : num [1:2334] 1997 1997 1997 1997 1997 ...
## $ month : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
## $ day : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
Questions
- What is problematic about the analyses above? Why?
- What could be done to improve the analyses above?