K-fold cross validation
kidiq <- read.csv("data/kidiq.csv")
# assign each row to a random fold
kidiq$fold <- sample(rep(1:10,ceiling(nrow(kidiq)/10)))[1:nrow(kidiq)]
kidiq <- as_tibble(kidiq) |> mutate(row_id = row_number())
kidiq$score <- 0
set.seed(33)
for( i in 1:10)
{
fit <- stan_glm(kid_score ~ mom_hs + mom_iq, data = kidiq[kidiq$fold != i,], refresh = 0)
testdat <- kidiq[kidiq$fold == i,]
sigma <- as.data.frame(fit)$sigma
# This for loop 'smells' , but works
for(j in 1:nrow(testdat))
{
newdat <- testdat[j,]
pred <- posterior_linpred(fit,newdata = newdat)
testdat[j,]$score <- log(mean(dnorm(pred - newdat$kid_score, mean = 0, sd = sigma)))
}
kidiq <- rows_update(kidiq, testdat, by = 'row_id')
}
sum(kidiq$score)
# -1877.47
sd(kidiq$score)*sqrt(nrow(kidiq))
# 14.3
- There exists a function
kfold
that can do all this for you though!
set.seed(33)
kfold_3 <- kfold(fit_3)
kfold_3
##
## Based on 10-fold cross-validation.
##
## Estimate SE
## elpd_kfold -1876.6 14.2
## p_kfold 4.5 0.7
## kfoldic 3753.1 28.5
- The results are compatable with
loo_compare
kfold_1 <- kfold(fit_1)
loo_compare(kfold_1,kfold_3)
## elpd_diff se_diff
## fit_3 0.0 0.0
## fit_1 -38.7 8.4