This vignette for the unpivotr package demonstrates unpivoting html tables of various kinds.

The HTML files are in the package directory at system.file("extdata", c("rowspan.html", "colspan.html", "nested.html"), package = "unpivotr").

##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union

Rowspan and colspan examples

If a table has cells merged across rows or columns (or both), then as_cells() does not attempt to fill the cell contents across the rows or columns. This is different from other packages, e.g. rvest. However, if merged cells cause a table not to be square, then as_cells() pads the missing cells with blanks.

Rowspan

rowspan <- system.file("extdata", "rowspan.html", package = "unpivotr")
includeHTML(rowspan)
HTML table with rowspan
Header (1:2, 1) Header (1, 2)
cell (2, 2)

# rvest
rowspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 x 2
##   `Header (1:2, 1)` `Header (1, 2)`
##   <chr>             <chr>
## 1 Header (1:2, 1)   cell (2, 2)
# unpivotr
rowspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 x 4
##     row   col data_type html
##   <int> <int> <chr>     <chr>
## 1     1     1 html      "<th rowspan=\"2\">Header (1:2, 1)</th>"
## 2     2     1 html       <NA>
## 3     1     2 html      "<th>Header (1, 2)</th>"
## 4     2     2 html      "<td>cell (2, 2)</td>"

Colspan

colspan <- system.file("extdata", "colspan.html", package = "unpivotr")
includeHTML(colspan)
HTML table with colspan
Header (1, 1:2)
cell (2, 1) cell (2, 2)

# rvest
colspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 x 2
##   `Header (1, 1:2)` `Header (1, 1:2)`
##   <chr>             <chr>
## 1 cell (2, 1)       cell (2, 2)
# unpivotr
colspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 x 4
##     row   col data_type html
##   <int> <int> <chr>     <chr>
## 1     1     1 html      "<th colspan=\"2\">Header (1, 1:2)</th>"
## 2     2     1 html      "<td>cell (2, 1)</td>"
## 3     1     2 html       <NA>
## 4     2     2 html      "<td>cell (2, 2)</td>"

Both rowspan and colspan: non-square

rowandcolspan <- system.file("extdata",
                             "row-and-colspan.html",
                             package = "unpivotr")
includeHTML(rowandcolspan)
HTML table with colspan
Header (1:2, 1:2) Header (2, 3)
cell (3, 1) cell (3, 2) cell (3, 3)

# rvest
rowandcolspan %>%
  read_html() %>%
  html_table()
## [[1]]
## # A tibble: 1 x 5
##   `Header (1:2, 1:2)` `Header (1:2, 1:2)` `Header (2, 3)` ``          ``
##   <chr>               <chr>               <chr>           <chr>       <chr>
## 1 Header (1:2, 1:2)   Header (1:2, 1:2)   cell (3, 1)     cell (3, 2) cell (3, …
# unpivotr
rowandcolspan %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 10 x 4
##      row   col data_type html
##    <int> <int> <chr>     <chr>
##  1     1     1 html      "<th colspan=\"2\" rowspan=\"2\">Header (1:2, 1:2)</th…
##  2     2     1 html       <NA>
##  3     1     2 html       <NA>
##  4     2     2 html       <NA>
##  5     1     3 html      "<th>Header (2, 3)</th>"
##  6     2     3 html      "<td>cell (3, 1)</td>"
##  7     1     4 html       <NA>
##  8     2     4 html      "<td>cell (3, 2)</td>"
##  9     1     5 html       <NA>
## 10     2     5 html      "<td>cell (3, 3)</td>"

Nested example

as_cells() never descends into cells. If there is a table inside a cell, then to parse that table use html_table again on that cell.

nested <- system.file("extdata", "nested.html", package = "unpivotr")
includeHTML(nested)
Nested HTML table
Header (1, 1) Header (1, 2)
cell (2, 1)
Header (2, 2)(1, 1) Header (2, 2)(1, 2)
cell (2, 2)(2, 1) cell (2, 2)(2, 1)

# rvest parses both tables
nested %>%
  read_html() %>%
  html_table(fill = TRUE)
## [[1]]
## # A tibble: 3 x 6
##   `Header (1, 1)`  `Header (1, 2)`              ``       ``      ``      ``
##   <chr>            <chr>                        <chr>    <chr>   <chr>   <chr>
## 1 cell (2, 1)      "Header (2, 2)(1, 1)\n     … Header … Header… cell (… cell (…
## 2 Header (2, 2)(1… "Header (2, 2)(1, 2)"        <NA>     <NA>    <NA>    <NA>
## 3 cell (2, 2)(2, … "cell (2, 2)(2, 1)"          <NA>     <NA>    <NA>    <NA>
##
## [[2]]
## # A tibble: 1 x 2
##   `Header (2, 2)(1, 1)` `Header (2, 2)(1, 2)`
##   <chr>                 <chr>
## 1 cell (2, 2)(2, 1)     cell (2, 2)(2, 1)
# unpivotr
x <-
  nested %>%
  read_html() %>%
  as_cells() %>%
  .[[1]]
x
## # A tibble: 4 x 4
##     row   col data_type html
##   <int> <int> <chr>     <chr>
## 1     1     1 html      "<th>Header (1, 1)</th>"
## 2     2     1 html      "<td>cell (2, 1)</td>"
## 3     1     2 html      "<th>Header (1, 2)</th>"
## 4     2     2 html      "<td>\n          <table>\n<tr>\n<th>Header (2, 2)(1, 1)…
# The html of the table inside a cell
cell <-
  x %>%
  dplyr::filter(row == 2, col == 2) %>%
  .$html
cell
## [1] "<td>\n          <table>\n<tr>\n<th>Header (2, 2)(1, 1)</th>\n              <th>Header (2, 2)(1, 2)</th>\n            </tr>\n<tr>\n<td>cell (2, 2)(2, 1)</td>\n              <td>cell (2, 2)(2, 1)</td>\n            </tr>\n</table>\n</td>"
# Parsing the table inside the cell
cell %>%
  read_html() %>%
  as_cells()
## [[1]]
## # A tibble: 4 x 4
##     row   col data_type html
##   <int> <int> <chr>     <chr>
## 1     1     1 html      <th>Header (2, 2)(1, 1)</th>
## 2     2     1 html      <td>cell (2, 2)(2, 1)</td>
## 3     1     2 html      <th>Header (2, 2)(1, 2)</th>
## 4     2     2 html      <td>cell (2, 2)(2, 1)</td>

URL example

A motivation for using unpivotr::as_cells() is that it extracts more than just text – it can extract whatever part of the HTML you need.

Here, we extract URLs.

urls <- system.file("extdata", "url.html", package = "unpivotr")
includeHTML(urls)
HTML table with rowspan
Scraping HTML.
Sweet as? Yeah,
right.

cell_url <- function(x) {
  if (is.na(x)) return(NA)
  x %>%
    read_html %>%
    html_nodes("a") %>%
    html_attr("href")
}

cell_text <- function(x) {
  if (is.na(x)) return(NA)
  x %>%
    read_html %>%
    html_nodes("a") %>%
    html_text()
}

urls %>%
  read_html() %>%
  as_cells() %>%
  .[[1]] %>%
  mutate(text = purrr::map(html, cell_text),
         url = purrr::map(html, cell_url)) %>%
  tidyr::unnest(text, url)
## Warning: unnest() has a new interface. See ?unnest for details.
## Try `df %>% unnest(c(text, url))`, with `mutate()` if needed
## # A tibble: 8 x 6
##     row   col data_type html                           text   url
##   <int> <int> <chr>     <chr>                          <chr>  <chr>
## 1     1     1 html      "<td colspan=\"2\">\n<a href=… Scrap… https://www.r-pro…
## 2     1     1 html      "<td colspan=\"2\">\n<a href=… HTML.  https://cran.r-pr…
## 3     2     1 html      "<td><a href=\"https://cran.r… Sweet  https://cran.r-pr…
## 4     1     2 html       <NA>                          <NA>   <NA>
## 5     2     2 html      "<td><a href=\"https://cran.r… as?    https://cran.r-pr…
## 6     1     3 html       <NA>                          <NA>   <NA>
## 7     2     3 html      "<td>\n<a href=\"https://cran… Yeah,  https://cran.r-pr…
## 8     2     3 html      "<td>\n<a href=\"https://cran… right. https://www.db.co…