# Review for PSQF 6243

This serves as a non-exhaustive review for the course. These are elements that I assume you have knowledge of prior to starting the course.

- Variable vs. constant attributes
- Types of variables (i.e., nominal, ordinal, interval, ratio)
- Descriptive statistics (e.g., mean, median, standard deviation, variance, percentiles)
- Higher-order moments (e.g., skewness and kurtosis)
- Exploring/summarizing univariate distributions (e.g., histogram or density figure)
- What is a statistical model? Why do we use them?
- Population vs. sample

## Examples

Mario Kart 64 world record data:

| variable | class | description |
|---|---|---|
| track | character | Track name |
| type | factor | Single or three lap record |
| shortcut | factor | Shortcut or non-shortcut record |
| player | character | Player’s name |
| system_played | character | Used system (NTSC or PAL) |
| date | date | World record date |
| time_period | period | Time as hms period |
| time | double | Time in seconds |
| record_duration | double | Record duration in days |

# load some libraries
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1
## ✔ readr   2.1.2      ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──

library(ggformula)

## Loading required package: ggstance
##
## Attaching package: 'ggstance'
##
## The following objects are masked from 'package:ggplot2':
##
##     geom_errorbarh, GeomErrorbarh
##
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
##     discard
##
## The following object is masked from 'package:readr':
##
##     col_factor
##
##
## New to ggformula?  Try the tutorials:
## 	learnr::run_tutorial("introduction", package = "ggformula")
## 	learnr::run_tutorial("refining", package = "ggformula")

library(lubridate)

##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
##     date, intersect, setdiff, union

library(mosaic)

## Registered S3 method overwritten by 'mosaic':
##   method                           from
##   fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features.  The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
##
## The following object is masked from 'package:Matrix':
##
##     mean
##
## The following object is masked from 'package:scales':
##
##     rescale
##
## The following objects are masked from 'package:dplyr':
##
##     count, do, tally
##
## The following object is masked from 'package:purrr':
##
##     cross
##
## The following object is masked from 'package:ggplot2':
##
##     stat
##
## The following objects are masked from 'package:stats':
##
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
##
## The following objects are masked from 'package:base':
##
##     max, mean, min, prod, range, sample, sum

library(e1071)

# Make the black-and-white ggplot2 theme, with a base font size of 18, the
# default for the figures drawn below.
theme_bw(base_size = 18) |>
  theme_set()

# Read the Mario Kart 64 world-record data and add the calendar parts of the
# record date as separate columns.
# NOTE(review): the statement that reads the file was lost from this rendering.
# The readr column-specification message printed below shows a comma-delimited
# file with 2334 rows and 9 columns; the URL is presumed to be the TidyTuesday
# 2021-05-25 `records.csv` file -- confirm against the course materials.
mariokart <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-25/records.csv"
) |>
  mutate(
    year = year(date),
    month = month(date),
    # Day of the month; this used month(date), which duplicated `month`.
    day = day(date)
  )

## Rows: 2334 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): track, type, shortcut, player, system_played, time_period
## dbl  (2): time, record_duration
## date (1): date
##
## ℹ Use spec() to retrieve the full column specification for this data.
## ℹ Specify the column types or set show_col_types = FALSE to quiet this message.

head(mariokart)

## # A tibble: 6 × 12
##   track      type  short…¹ player syste…² date       time_…³  time recor…⁴  year
##   <chr>      <chr> <chr>   <chr>  <chr>   <date>     <chr>   <dbl>   <dbl> <dbl>
## 1 Luigi Rac… Thre… No      Salam  NTSC    1997-02-15 2M 12.…  133.       1  1997
## 2 Luigi Rac… Thre… No      Booth  NTSC    1997-02-16 2M 9.9…  130.       0  1997
## 3 Luigi Rac… Thre… No      Salam  NTSC    1997-02-16 2M 8.9…  129.      12  1997
## 4 Luigi Rac… Thre… No      Salam  NTSC    1997-02-28 2M 6.9…  127.       7  1997
## 5 Luigi Rac… Thre… No      Gregg… NTSC    1997-03-07 2M 4.5…  125.      54  1997
## 6 Luigi Rac… Thre… No      Rocky… NTSC    1997-04-30 2M 2.8…  123.       0  1997
## # … with 2 more variables: month <dbl>, day <dbl>, and abbreviated variable
## #   names ¹​shortcut, ²​system_played, ³​time_period, ⁴​record_duration

# Univariate distribution of record time, shown four ways.

# 1. Histogram of every record (30 bins, black bar outlines).
mariokart |>
  gf_histogram(~ time, bins = 30, color = "black") |>
  gf_labs(x = "Time (in seconds)")


# 2. The same histogram with bars filled by shortcut status.
mariokart |>
  gf_histogram(~ time, fill = ~ shortcut, bins = 30, color = "black") |>
  gf_labs(x = "Time (in seconds)")


# 3. Density curve of every record.
mariokart |>
  gf_density(~ time) |>
  gf_labs(x = "Time (in seconds)")


# 4. Density curves filled by shortcut status, outlined in black.
mariokart |>
  gf_density(~ time, color = "black", fill = ~ shortcut) |>
  gf_labs(x = "Time (in seconds)")


# Descriptive statistics for record time: mean, median, standard deviation,
# skewness, kurtosis, and the 10th, 50th and 90th percentiles.
# NOTE(review): skewness() and kurtosis() are not base R; presumably they come
# from e1071, which is loaded above -- confirm.
df_stats(
  ~ time,
  data = mariokart,
  mean, median, sd, skewness, kurtosis,
  quantile(probs = c(0.1, 0.5, 0.9))
)

##   response     mean median      sd skewness kurtosis   10%   50%     90%
## 1     time 90.62383  86.19 66.6721 1.771732 3.844745 31.31 86.19 171.961

# Number of records for each combination of track and shortcut status.
mariokart |>
  count(track, shortcut)

## # A tibble: 28 × 3
##    track                 shortcut     n
##    <chr>                 <chr>    <int>
##  1 Banshee Boardwalk     No          83
##  2 Bowser's Castle       No          69
##  3 Choco Mountain        No          77
##  4 Choco Mountain        Yes         71
##  5 D.K.'s Jungle Parkway No         106
##  6 D.K.'s Jungle Parkway Yes         74
##  7 Frappe Snowland       No          93
##  8 Frappe Snowland       Yes         87
##  9 Kalimari Desert       No         102
## 10 Kalimari Desert       Yes         67
## # … with 18 more rows

# Record-time densities for Choco Mountain only, filled by shortcut status and
# facetted by record type, with each panel given its own x-axis range.
# The filtered rows are piped straight into gf_density(); no `data =` argument
# is supplied, so nothing in the call points back at the unfiltered data set.
mariokart |>
  filter(track == "Choco Mountain") |>
  gf_density(~ time, color = "black", fill = ~ shortcut) |>
  gf_labs(x = "Time (in seconds)") |>
  # `scales` is spelled out in full instead of relying on partial matching.
  gf_facet_wrap(~ type, scales = "free_x")


# Number of records for each combination of track and console system.
mariokart |>
  count(track, system_played)

## # A tibble: 32 × 3
##    track                 system_played     n
##    <chr>                 <chr>         <int>
##  1 Banshee Boardwalk     NTSC             21
##  2 Banshee Boardwalk     PAL              62
##  3 Bowser's Castle       NTSC             18
##  4 Bowser's Castle       PAL              51
##  5 Choco Mountain        NTSC             56
##  6 Choco Mountain        PAL              92
##  7 D.K.'s Jungle Parkway NTSC             47
##  8 D.K.'s Jungle Parkway PAL             133
##  9 Frappe Snowland       NTSC             65
## 10 Frappe Snowland       PAL             115
## # … with 22 more rows

# Record-time densities for single-lap Moo Moo Farm records only, filled by
# the console system the record was set on.
# The filtered rows are piped straight into gf_density(); no `data =` argument
# is supplied, so nothing in the call points back at the unfiltered data set.
mariokart |>
  filter(track == "Moo Moo Farm", type == "Single Lap") |>
  gf_density(~ time, color = "black", fill = ~ system_played) |>
  gf_labs(x = "Time (in seconds)")


# Number of records of each record type.
mariokart |>
  count(type)

## # A tibble: 2 × 2
##   type           n
##   <chr>      <int>
## 1 Single Lap  1123
## 2 Three Lap   1211

# Print the structure of the data: one line per column with its type and
# first few values.
mariokart |>
  str()

## tibble [2,334 × 12] (S3: tbl_df/tbl/data.frame)
##  $ track          : chr [1:2334] "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" ...
##  $ type           : chr [1:2334] "Three Lap" "Three Lap" "Three Lap" "Three Lap" ...
##  $ shortcut       : chr [1:2334] "No" "No" "No" "No" ...
##  $ player         : chr [1:2334] "Salam" "Booth" "Salam" "Salam" ...
##  $ system_played  : chr [1:2334] "NTSC" "NTSC" "NTSC" "NTSC" ...
##  $ date           : Date[1:2334], format: "1997-02-15" "1997-02-16" ...
##  $ time_period    : chr [1:2334] "2M 12.99S" "2M 9.99S" "2M 8.99S" "2M 6.99S" ...
##  $ time           : num [1:2334] 133 130 129 127 125 ...
##  $ record_duration: num [1:2334] 1 0 12 7 54 0 0 27 0 64 ...
##  $ year           : num [1:2334] 1997 1997 1997 1997 1997 ...
##  $ month          : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
##  $ day            : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...


# Bivariate Association

# Correlation between record time and record duration across all records
# pooled together, rounded to two decimal places.
round(cor(time ~ record_duration, data = mariokart), 2)

## [1] -0.07

# Scatterplot of record time against record duration for all records, with a
# fitted linear-model line overlaid.
mariokart |>
  gf_point(time ~ record_duration) |>
  gf_smooth(method = "lm") |>
  gf_labs(
    x = "How long the record was held",
    y = "Time (in seconds)"
  )


# Correlation between record time and record duration within each
# track x type x shortcut x system combination, together with the number of
# records in the group, sorted from the most negative correlation upward.
# summarise() leaves the result grouped by track, type and shortcut.
mariokart |>
  group_by(track, type, shortcut, system_played) |>
  summarise(
    correlation = cor(time ~ record_duration),
    num = n()
  ) |>
  arrange(correlation)

## summarise() has grouped output by 'track', 'type', 'shortcut'. You can
## override using the .groups argument.

## # A tibble: 112 × 6
## # Groups:   track, type, shortcut [56]
##    track                 type       shortcut system_played correlation   num
##    <chr>                 <chr>      <chr>    <chr>               <dbl> <int>
##  1 Rainbow Road          Three Lap  No       NTSC               -1.00      3
##  2 Rainbow Road          Single Lap No       NTSC               -0.914     4
##  3 Rainbow Road          Single Lap Yes      NTSC               -0.914     4
##  4 Sherbet Land          Three Lap  Yes      NTSC               -0.779     5
##  5 Wario Stadium         Three Lap  Yes      PAL                -0.682     7
##  6 Luigi Raceway         Three Lap  Yes      PAL                -0.630     9
##  7 Moo Moo Farm          Single Lap No       NTSC               -0.582    19
##  8 Choco Mountain        Three Lap  No       PAL                -0.527    31
##  9 D.K.'s Jungle Parkway Three Lap  No       NTSC               -0.495     5
## 10 Yoshi Valley          Three Lap  Yes      PAL                -0.469    10
## # … with 102 more rows

# Scatterplot with a fitted linear-model line for a single subgroup:
# Mario Raceway, three-lap, non-shortcut records set on NTSC.
mariokart |>
  filter(
    track == "Mario Raceway",
    type == "Three Lap",
    shortcut == "No",
    system_played == "NTSC"
  ) |>
  gf_point(time ~ record_duration) |>
  gf_smooth(method = "lm") |>
  gf_labs(
    x = "How long the record was held",
    y = "Time (in seconds)"
  )


# Print the structure of the data again: one line per column with its type
# and first few values.
mariokart |>
  str()

## tibble [2,334 × 12] (S3: tbl_df/tbl/data.frame)
##  $ track          : chr [1:2334] "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" "Luigi Raceway" ...
##  $ type           : chr [1:2334] "Three Lap" "Three Lap" "Three Lap" "Three Lap" ...
##  $ shortcut       : chr [1:2334] "No" "No" "No" "No" ...
##  $ player         : chr [1:2334] "Salam" "Booth" "Salam" "Salam" ...
##  $ system_played  : chr [1:2334] "NTSC" "NTSC" "NTSC" "NTSC" ...
##  $ date           : Date[1:2334], format: "1997-02-15" "1997-02-16" ...
##  $ time_period    : chr [1:2334] "2M 12.99S" "2M 9.99S" "2M 8.99S" "2M 6.99S" ...
##  $ time           : num [1:2334] 133 130 129 127 125 ...
##  $ record_duration: num [1:2334] 1 0 12 7 54 0 0 27 0 64 ...
##  $ year           : num [1:2334] 1997 1997 1997 1997 1997 ...
##  $ month          : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...
##  $ day            : num [1:2334] 2 2 2 2 3 4 4 4 5 5 ...


## Questions

1. What is problematic about the analyses above? Why?
2. What could be done to improve the analyses above?
Previous