K-fold cross validation

kidiq <- read.csv("data/kidiq.csv")
# assign each row to a random fold
kidiq$fold <- sample(rep(1:10,ceiling(nrow(kidiq)/10)))[1:nrow(kidiq)]
kidiq <- as_tibble(kidiq) |> mutate(row_id = row_number())
kidiq$score <- 0
set.seed(33)
for( i in 1:10)
{
  fit <- stan_glm(kid_score ~ mom_hs + mom_iq, data = kidiq[kidiq$fold != i,], refresh = 0)
  testdat <- kidiq[kidiq$fold == i,]
  sigma <- as.data.frame(fit)$sigma
  
  # This for loop 'smells' , but works
  for(j in 1:nrow(testdat))
  {
    newdat <- testdat[j,]
    pred <- posterior_linpred(fit,newdata = newdat)
    testdat[j,]$score <- log(mean(dnorm(pred - newdat$kid_score, mean = 0, sd = sigma)))
  }
  kidiq <- rows_update(kidiq, testdat, by = 'row_id')
}
sum(kidiq$score)

# -1877.47
sd(kidiq$score)*sqrt(nrow(kidiq))

# 14.3
  • There exists a function kfold that can do all this for you though!
set.seed(33)
kfold_3 <- kfold(fit_3)
kfold_3
## 
## Based on 10-fold cross-validation.
## 
##            Estimate   SE
## elpd_kfold  -1876.6 14.2
## p_kfold         4.5  0.7
## kfoldic      3753.1 28.5
  • The results are compatable with loo_compare
kfold_1 <- kfold(fit_1)
loo_compare(kfold_1,kfold_3)
##       elpd_diff se_diff
## fit_3   0.0       0.0  
## fit_1 -38.7       8.4