Skip to contents

spatter() is like tidyr::spread() but for when different columns have different data types. It works on data that has come via as_cells() or tidyxl::xlsx_cells(), where each row represents one cell of a table, and the value of the cell is represented in a different column, depending on the data type.

Usage

spatter(cells, key, values = NULL, types = data_type, formatters = list())

Arguments

cells

A data frame where each row represents a cell, with columns row and col, usually a column data_type, and additional columns of cell values.

key

The name of the column whose values will become column names

values

Optional. The column of cells to use as the value of each cell. Given as a bare variable name. If omitted (the default), the type argument will be used instead.

types

Optional. The column that names, for each row of cells, which column contains the cell value. Defaults to data_type.

formatters

A named list of functions for formatting particular data types, named by the data type (the name of the column of cells that contains the cell value.

Examples

# A tidy representation of cells of mixed data types
x <- data.frame(stringsAsFactors = FALSE,
        row = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L),
        col = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L),
  data_type = c("character", "character", "character", "numeric", "character",
                "numeric", "character", "numeric"),
  character = c("Name", "Age", "Matilda", NA, "Nicholas", NA, "Olivia", NA),
    numeric = c(NA, NA, NA, 1, NA, 3, NA, 5))
x
#>   row col data_type character numeric
#> 1   1   1 character      Name      NA
#> 2   1   2 character       Age      NA
#> 3   2   1 character   Matilda      NA
#> 4   2   2   numeric      <NA>       1
#> 5   3   1 character  Nicholas      NA
#> 6   3   2   numeric      <NA>       3
#> 7   4   1 character    Olivia      NA
#> 8   4   2   numeric      <NA>       5

# How it would look in a spreadsheet
rectify(x)
#> # A tibble: 4 × 3
#>   `row/col` `1(A)`   `2(B)`
#>       <int> <chr>    <chr> 
#> 1         1 Name     Age   
#> 2         2 Matilda  1     
#> 3         3 Nicholas 3     
#> 4         4 Olivia   5     

# How it looks after treating the cells in row 1 as headers
y <- behead(x, "N", header)
y$col <- NULL # Drop the 'col' column
y
#> # A tibble: 6 × 5
#>     row data_type character numeric header
#>   <int> <chr>     <chr>       <dbl> <chr> 
#> 1     2 character Matilda        NA Name  
#> 2     2 numeric   NA              1 Age   
#> 3     3 character Nicholas       NA Name  
#> 4     3 numeric   NA              3 Age   
#> 5     4 character Olivia         NA Name  
#> 6     4 numeric   NA              5 Age   

# At this point you might want to do tidyr::spread(), but it won't work because
# you want to use both the `character` and `numeric` columns as the values.
tidyr::spread(y, header, numeric)
#> # A tibble: 6 × 5
#>     row data_type character   Age  Name
#>   <int> <chr>     <chr>     <dbl> <dbl>
#> 1     2 character Matilda      NA    NA
#> 2     2 numeric   NA            1    NA
#> 3     3 character Nicholas     NA    NA
#> 4     3 numeric   NA            3    NA
#> 5     4 character Olivia       NA    NA
#> 6     4 numeric   NA            5    NA
tidyr::spread(y, header, character)
#> # A tibble: 6 × 5
#>     row data_type numeric Age   Name    
#>   <int> <chr>       <dbl> <chr> <chr>   
#> 1     2 character      NA NA    Matilda 
#> 2     2 numeric         1 NA    NA      
#> 3     3 character      NA NA    Nicholas
#> 4     3 numeric         3 NA    NA      
#> 5     4 character      NA NA    Olivia  
#> 6     4 numeric         5 NA    NA      
spatter(y, header)
#> # A tibble: 3 × 3
#>     row   Age Name    
#>   <int> <dbl> <chr>   
#> 1     2     1 Matilda 
#> 2     3     3 Nicholas
#> 3     4     5 Olivia  

# The difference between spatter() and tidyr::spread() is that spatter()
# needs to know which data-type to use for each cell beneath the headers.  By
# default, it looks at the `data_type` column to decide, but you can change
# that with the `types` argument.
y %>%
  dplyr::select(-data_type, -numeric) %>%
  dplyr::mutate(data_type_2 = "character") %>%
  spatter(header, types = data_type_2)
#> # A tibble: 3 × 3
#>     row Age   Name    
#>   <int> <chr> <chr>   
#> 1     2 NA    Matilda 
#> 2     3 NA    Nicholas
#> 3     4 NA    Olivia  

# Alternatively you can name one specific column to use for the cell values.
y %>%
  dplyr::mutate(foo = letters[1:6]) %>%
  dplyr::select(header, row, foo) %>%
  spatter(header, values = foo)
#> # A tibble: 3 × 3
#>     row Age   Name 
#>   <int> <chr> <chr>
#> 1     2 b     a    
#> 2     3 d     c    
#> 3     4 f     e    

# The column used for the values is consumed before the spread occurs. If
# it's necessary for demarking the rows, then make a copy of it first,
# otherwise you'll get an error like "Duplicate identifiers for rows ..."
y %>%
  dplyr::mutate(row2 = row) %>%
  dplyr::select(row, header, row2) %>%
  spatter(header, values = row2)
#> # A tibble: 3 × 3
#>     row   Age  Name
#>   <int> <int> <int>
#> 1     2     2     2
#> 2     3     3     3
#> 3     4     4     4

# Like tidyr::spread(), you need to discard extraneous columns beforehand.
# Otherwise you can get more rows out than you want.
y$extra <- 11:16
spatter(y, header)
#> # A tibble: 6 × 4
#>     row extra   Age Name    
#>   <int> <int> <dbl> <chr>   
#> 1     2    11    NA Matilda 
#> 2     2    12     1 NA      
#> 3     3    13    NA Nicholas
#> 4     3    14     3 NA      
#> 5     4    15    NA Olivia  
#> 6     4    16     5 NA      

# pack() is an easy way to keep just the columns you need, without knowing
# in advance which data-type columns you need.  This examples adds a new
# column, which is then removed by the pack-unpack sequence without having to
# mention it by name.
x$extra <- 11:18
x %>%
  pack() %>%
  dplyr::select(row, col, value) %>%
  unpack()
#>   row col data_type character numeric
#> 1   1   1 character      Name      NA
#> 2   1   2 character       Age      NA
#> 3   2   1 character   Matilda      NA
#> 4   2   2   numeric      <NA>       1
#> 5   3   1 character  Nicholas      NA
#> 6   3   2   numeric      <NA>       3
#> 7   4   1 character    Olivia      NA
#> 8   4   2   numeric      <NA>       5

# spatter() automatically converts data types so that they can coexist in the
# same column.  Ordered factors in particular will always be coerced to
# unordered factors.

# You can control data type conversion by supplying custom functions, named
# by the data type of the cells they are to convert (look at the `data_type`
# column).  If your custom functions aren't sufficient to avoid the need for
# coercion, then they will be overridden.
spatter(y, header,
        formatters = list(character = ~ toupper(.), numeric = as.complex))
#> # A tibble: 6 × 4
#>     row extra Age   Name    
#>   <int> <int> <cpl> <chr>   
#> 1     2    11   NA  MATILDA 
#> 2     2    12 1+0i  NA      
#> 3     3    13   NA  NICHOLAS
#> 4     3    14 3+0i  NA      
#> 5     4    15   NA  OLIVIA  
#> 6     4    16 5+0i  NA