10.2 Modeling of lung cancer risk in Pennsylvania

Bayesian spatial model to estimate the risk of lung cancer and assess its relationship with smoking in Pennsylvania, USA, in 2002.

library(tidyverse)
library(SpatialEpi)
data(pennLC)
names(pennLC)

## [1] "geo"             "data"            "smoking"         "spatial.polygon"

head(pennLC$data)

##   county cases population race gender      age
## 1  adams     0       1492    o      f Under.40
## 2  adams     0        365    o      f    40.59
## 3  adams     1         68    o      f    60.69
## 4  adams     0         73    o      f      70+
## 5  adams     0      23351    w      f Under.40
## 6  adams     5      12136    w      f    40.59

library(sf)
map <- st_as_sf(pennLC$spatial.polygon)
countynames <- sapply(slot(pennLC$spatial.polygon, "polygons"),
                      function(x){slot(x, "ID")})
map$county <- countynames
head(map)

## Simple feature collection with 6 features and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -80.51776 ymin: 39.72889 xmax: -75.53303 ymax: 41.1441
## Geodetic CRS:  +proj=longlat
##                         geometry    county
## 1 POLYGON ((-77.4467 39.96954...     adams
## 2 POLYGON ((-80.14534 40.6742... allegheny
## 3 POLYGON ((-79.21142 40.9091... armstrong
## 4 POLYGON ((-80.1568 40.85189...    beaver
## 5 POLYGON ((-78.38063 39.7288...   bedford
## 6 POLYGON ((-75.53303 40.4508...     berks

d <- group_by(pennLC$data, county) %>% 
  summarize(Y = sum(cases))
head(d)

## # A tibble: 6 × 2
##   county        Y
##   <fct>     <int>
## 1 adams        55
## 2 allegheny  1275
## 3 armstrong    49
## 4 beaver      172
## 5 bedford      37
## 6 berks       308

p1 <- map %>%
  right_join(d, by = c("county" = "county")) %>%
  ggplot()+
  geom_sf(aes(fill=Y), color="navy")+
  scale_fill_continuous(low="white", high="navy")+
  geom_sf_text(aes(label=county,alpha=Y,color="orange"), size=2)+
  coord_sf()+
  labs(title = "Lung cancer cases",
       subtitle = "Pennsylvania 1990-1994",
       caption = "Source: PennLC data",
       fill = "Cases")+
  theme_minimal()
p1

## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

10.2.1 Expected cases

$E_i = \sum_{j=1}^m r_j^{(s)} n_j^{(s)}$

where $r_j^{(s)}$ is the rate of disease in stratum $j$ and $n_j^{(s)}$ is the population in stratum $j$ .

pennLC$data <- pennLC$data[order(pennLC$data$county,
                                 pennLC$data$race, 
                                 pennLC$data$gender, 
                                 pennLC$data$age), ]

E <- expected(population = pennLC$data$population,
              cases = pennLC$data$cases, 
 # 2 races, 2 genders, and 4 age groups (2×2×4 = 16)
              n.strata = 16)
d$E <- E
head(d)

## # A tibble: 6 × 3
##   county        Y      E
##   <fct>     <int>  <dbl>
## 1 adams        55   69.6
## 2 allegheny  1275 1182. 
## 3 armstrong    49   67.6
## 4 beaver      172  173. 
## 5 bedford      37   44.2
## 6 berks       308  301.

Let’s visualize the difference between observed and expected cases.

map %>%
  right_join(d, by = c("county" = "county")) %>%
  pivot_longer(cols = c("Y", "E"), 
               names_to = "type", 
               values_to = "value") %>%
  mutate(value=round(value))%>%
  ggplot()+
  geom_sf(aes(fill=value), color="navy")+
  scale_fill_continuous(low="white", high="navy")+
  geom_sf_text(aes(label=value,
                   alpha=value,
                   color="orange"), 
               fontface="bold",
               size=2)+
  coord_sf()+
  facet_wrap(~type)+
  guides(fill = guide_colorbar(title = "Cases"),alpha="none",color="none")+
  labs(title = "Lung cancer Observed and Expected cases",
       subtitle = "Pennsylvania 1990-1994",
       caption = "Source: PennLC data",
       fill = "Type")+
  theme_minimal()+
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        legend.position = "bottom")

10.2.2 Standardized Mortality Ratios

$SMR_i = \frac{Y_i}{E_i}$

if $SMR_i > 1$ , then the county has more cases than expected, and if $SMR_i < 1$ , then the county has fewer cases than expected.

# add the smoking data
d <- dplyr::left_join(d, pennLC$smoking, by = "county")
d$SMR <- d$Y/d$E
head(d)

## # A tibble: 6 × 5
##   county        Y      E smoking   SMR
##   <fct>     <int>  <dbl>   <dbl> <dbl>
## 1 adams        55   69.6   0.234 0.790
## 2 allegheny  1275 1182.    0.245 1.08 
## 3 armstrong    49   67.6   0.25  0.725
## 4 beaver      172  173.    0.276 0.997
## 5 bedford      37   44.2   0.228 0.837
## 6 berks       308  301.    0.249 1.02

ggplot(d, aes(x=smoking, y=SMR))+
  geom_point(aes(color=smoking))+
  geom_smooth(se=FALSE)+
  labs(title = "Lung cancer SMR vs Smoking",
       subtitle = "Pennsylvania 1990-1994",
       caption = "Source: PennLC data",
       x = "Smoking",
       y = "SMR")+
  theme_minimal()

map <- dplyr::left_join(map, d, by = "county")

map %>%
  ggplot()+
  geom_sf(aes(fill=SMR), color="navy")+
  scale_fill_continuous(low="white", high="navy")+
  geom_sf_text(aes(label=county,
                   alpha=SMR,
                   color="orange"), 
               size=2)+
  coord_sf()+
  guides(fill = guide_colorbar(title = "Cases"),
         alpha="none",
         color="none")+
  labs(title = "Lung cancer SMR",
       subtitle = "Pennsylvania 1990-1994",
       caption = "Source: PennLC data",
       fill = "SMR")+
  theme_minimal()