Example: Forming a linear predictor from a multiple regression

From https://github.com/behrman/ros

Simulated data.

set.seed(33)

n <- 100
k <- 10
a <- 1
b <- 1:k
theta <- 5
sigma <- 2


data_2 <- 
  tibble(
    X = matrix(runif(n * k, min = 0, max = 1), nrow = n, ncol = k),
    z = rep(0:1, n / 2) %>% sample(),
    y = as.double(a + X %*% b + theta * z + rnorm(n, mean = 0, sd = sigma))
  )

Fit linear regression model.

set.seed(33)
fit_2 <- stan_glm(y ~ X + z, data = data_2, refresh = 0)
fit_2
## stan_glm
##  family:       gaussian [identity]
##  formula:      y ~ X + z
##  observations: 100
##  predictors:   12
## ------
##             Median MAD_SD
## (Intercept)  0.8    1.1  
## X1           0.4    0.7  
## X2           1.6    0.7  
## X3           3.0    0.7  
## X4           4.0    0.7  
## X5           4.9    0.7  
## X6           5.1    0.8  
## X7           6.8    0.7  
## X8           8.2    0.7  
## X9           9.0    0.7  
## X10         12.0    0.8  
## z            5.5    0.4  
## 
## Auxiliary parameter(s):
##       Median MAD_SD
## sigma 2.0    0.2   
## 
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg

Outcome vs. predicted value.

data_2 %>%
  mutate(pred = predict(fit_2)) %>% 
  ggplot(aes(pred, y, color = factor(z))) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  coord_fixed() +
  scale_y_continuous(breaks = scales::breaks_width(10)) +
  labs(
    title = "Outcome vs. predicted value",
    x = "Predicted value",
    y = "Outcome",
    color = "Treatment"
  )