20.4 Reading Excel spreadsheets

  • read_excel() will read the file in as a tibble.
students <- read_excel("data/students.xlsx")


students
## # A tibble: 6 × 5
##   `Student ID` `Full Name`      favourite.food     mealPlan            AGE  
##          <dbl> <chr>            <chr>              <chr>               <chr>
## 1            1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 2            2 Barclay Lynn     French fries       Lunch only          5    
## 3            3 Jayendra Lyne    N/A                Breakfast and lunch 7    
## 4            4 Leon Rossini     Anchovies          Lunch only          <NA> 
## 5            5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 6            6 Güvenç Attila    Ice cream          Lunch only          6

There are a few things we might want to address in this dataset:

  • The column names are all over the place. We can provide column names that follow a consistent format; we recommend snake_case using the col_names argument.
read_excel(
  "data/students.xlsx",
  col_names = c("student_id", "full_name", "favourite_food", "meal_plan", "age")
)
## # A tibble: 7 × 5
##   student_id full_name        favourite_food     meal_plan           age  
##   <chr>      <chr>            <chr>              <chr>               <chr>
## 1 Student ID Full Name        favourite.food     mealPlan            AGE  
## 2 1          Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 3 2          Barclay Lynn     French fries       Lunch only          5    
## 4 3          Jayendra Lyne    N/A                Breakfast and lunch 7    
## 5 4          Leon Rossini     Anchovies          Lunch only          <NA> 
## 6 5          Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 7 6          Güvenç Attila    Ice cream          Lunch only          6
  • Skiping the first row
read_excel(
  "data/students.xlsx",
  col_names = c("student_id", "full_name", "favourite_food", "meal_plan", "age"),
  skip = 1
)
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan           age  
##        <dbl> <chr>            <chr>              <chr>               <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 2          2 Barclay Lynn     French fries       Lunch only          5    
## 3          3 Jayendra Lyne    N/A                Breakfast and lunch 7    
## 4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 6          6 Güvenç Attila    Ice cream          Lunch only          6
  • Dealing with the missing data
read_excel(
  "data/students.xlsx",
  col_names = c("student_id", "full_name", "favourite_food", "meal_plan", "age"),
  skip = 1,
  na = c("", "N/A")
)
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan           age  
##        <dbl> <chr>            <chr>              <chr>               <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 2          2 Barclay Lynn     French fries       Lunch only          5    
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
## 4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 6          6 Güvenç Attila    Ice cream          Lunch only          6
  • We can also specify the column types
read_excel(
  "data/students.xlsx",
  col_names = c("student_id", "full_name", "favourite_food", "meal_plan", "age"),
  skip = 1,
  na = c("", "N/A"),
  col_types = c("numeric", "text", "text", "text", "numeric")
)
## Warning: Expecting numeric in E6 / R6C5: got 'five'
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <chr>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch    NA
## 6          6 Güvenç Attila    Ice cream          Lunch only              6
  • This introduce us to a new problem in which we need to fix
students <- read_excel(
  "data/students.xlsx",
  col_names = c("student_id", "full_name", "favourite_food", "meal_plan", "age"),
  skip = 1,
  na = c("", "N/A"),
  col_types = c("numeric", "text", "text", "text", "text")
)


students
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan           age  
##        <dbl> <chr>            <chr>              <chr>               <chr>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only          4    
## 2          2 Barclay Lynn     French fries       Lunch only          5    
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch 7    
## 4          4 Leon Rossini     Anchovies          Lunch only          <NA> 
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch five 
## 6          6 Güvenç Attila    Ice cream          Lunch only          6
students <- students |>
  mutate(
    age = if_else(age == "five", "5", age),
    age = parse_number(age)
  )


students
## # A tibble: 6 × 5
##   student_id full_name        favourite_food     meal_plan             age
##        <dbl> <chr>            <chr>              <chr>               <dbl>
## 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
## 2          2 Barclay Lynn     French fries       Lunch only              5
## 3          3 Jayendra Lyne    <NA>               Breakfast and lunch     7
## 4          4 Leon Rossini     Anchovies          Lunch only             NA
## 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
## 6          6 Güvenç Attila    Ice cream          Lunch only              6