21.3 {infer} for simple, high-level hypothesis testing

  • specify the relationship and, optionally, a hypothesis
  • calculate statistics from simulation or based on theoretical distributions
  • many common tests are supported, for both continuous and discrete variables

21.3.1 P-value for independence based on simulation with permutation

# Observed statistic: the sample correlation between average velocity
# (response) and average elevation gain (explanatory).
observed <- race_top_results %>%
  specify(formula = avg_velocity ~ avg_elevation_gain) %>%
  calculate(stat = "correlation")

# Show the observed correlation.
observed
## Response: avg_velocity (numeric)
## Explanatory: avg_elevation_gain (numeric)
## # A tibble: 1 × 1
##     stat
##    <dbl>
## 1 -0.561
# Null distribution of the correlation: declare independence as the null
# hypothesis, draw 1000 permutation replicates, and compute the correlation
# within each replicate.
permuted <- race_top_results %>%
  specify(formula = avg_velocity ~ avg_elevation_gain) %>%
  hypothesise(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Show the per-replicate correlations.
permuted
## Response: avg_velocity (numeric)
## Explanatory: avg_elevation_gain (numeric)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate     stat
##        <int>    <dbl>
##  1         1 -0.0162 
##  2         2  0.0118 
##  3         3 -0.0277 
##  4         4 -0.00381
##  5         5  0.00657
##  6         6 -0.0120 
##  7         7 -0.0171 
##  8         8 -0.0333 
##  9         9  0.0707 
## 10        10 -0.0279 
## # ℹ 990 more rows
# Plot the permutation distribution and shade the two-sided region
# relative to the observed correlation.
visualize(permuted) +
  shade_p_value(obs_stat = observed, direction = "two_sided")

# Two-sided p-value of the observed correlation against the permutation
# distribution.
permuted %>%
  get_p_value(obs_stat = observed, direction = "two_sided")
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

21.3.2 Confidence interval for correlation based on simulation with bootstrapping

# Bootstrap distribution of the correlation: 1000 bootstrap replicates,
# with the correlation computed within each one. No hypothesis is declared.
bootstrapped <- race_top_results %>%
  specify(formula = avg_velocity ~ avg_elevation_gain) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")

# Plot the bootstrap distribution and shade the confidence interval computed
# from it. The interval is requested with the function's default settings.
visualize(bootstrapped) +
  shade_confidence_interval(endpoints = get_confidence_interval(bootstrapped))

21.3.3 Use theory instead of simulation

# Observed t statistic for average velocity under the point null mu = 7.
observed_t <- race_top_results %>%
  specify(response = avg_velocity) %>%
  hypothesise(null = "point", mu = 7) %>%
  calculate(stat = "t")

# Plot the theoretical t distribution (no simulation) and shade the
# two-sided region relative to the observed t statistic.
race_top_results %>%
  specify(response = avg_velocity) %>%
  assume(distribution = "t") %>%
  visualize() +
  shade_p_value(obs_stat = observed_t, direction = "two_sided")

# Two-sided p-value of the observed t statistic from the same theoretical
# distribution.
race_top_results %>%
  specify(response = avg_velocity) %>%
  assume(distribution = "t") %>%
  get_p_value(obs_stat = observed_t, direction = "two_sided")
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1  0.0190

21.3.4 Linear models with multiple explanatory variables

# Model formula shared by every fit below: average velocity explained by
# the number of aid stations and the number of participants.
my_formula <- avg_velocity ~ aid_stations + participants

# Fit the model described by `my_formula` to the observed data; the result
# holds one estimate per model term.
observed_fit <- race_top_results %>%
  specify(formula = my_formula) %>%
  fit()

# Show the observed coefficient estimates.
observed_fit
## # A tibble: 3 × 2
##   term          estimate
##   <chr>            <dbl>
## 1 intercept     6.94    
## 2 aid_stations -0.0131  
## 3 participants  0.000422
# Model fits under the independence null: 1000 permutation replicates in
# which the two explanatory variables are the permuted columns, with the
# model refitted on each replicate.
permuted_fits <- race_top_results %>%
  specify(formula = my_formula) %>%
  hypothesise(null = "independence") %>%
  generate(
    reps = 1000,
    type = "permute",
    variables = c(aid_stations, participants)
  ) %>%
  fit()

# Model fits on 2000 bootstrap replicates, used for the confidence intervals.
bootstrapped_fits <- race_top_results %>%
  specify(formula = my_formula) %>%
  generate(reps = 2000, type = "bootstrap") %>%
  fit()

# Two-sided p-value for each term, comparing the observed fit against the
# permuted fits.
get_p_value(permuted_fits, obs_stat = observed_fit, direction = "two_sided")
## # A tibble: 3 × 2
##   term         p_value
##   <chr>          <dbl>
## 1 aid_stations   0.054
## 2 intercept      0.334
## 3 participants   0.006
# Plot the permuted fits and shade the two-sided region relative to the
# observed estimates.
permuted_fits %>%
  visualize() +
  shade_p_value(obs_stat = observed_fit, direction = "two_sided")

# Percentile confidence interval for each term from the bootstrapped fits.
get_confidence_interval(
  bootstrapped_fits,
  type = "percentile",
  point_estimate = observed_fit
)
## # A tibble: 3 × 3
##   term          lower_ci upper_ci
##   <chr>            <dbl>    <dbl>
## 1 aid_stations -0.0278   0.00223 
## 2 intercept     6.80     7.07    
## 3 participants  0.000123 0.000890