This vignette for the unpivotr package demonstrates unpivoting html tables of various kinds.
The HTML files are in the package directory at system.file("extdata", c("rowspan.html", "colspan.html", "nested.html"), package = "unpivotr")
.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
If a table has cells merged across rows or columns (or both), then as_cells()
does not attempt to fill the cell contents across the rows or columns. This is different from other packages, e.g. rvest
. However, if merged cells cause a table not to be square, then as_cells()
pads the missing cells with blanks.
rowspan <- system.file("extdata", "rowspan.html", package = "unpivotr")
includeHTML(rowspan)
Header (1:2, 1) | Header (1, 2) |
---|---|
cell (2, 2) |
# rvest
rowspan %>%
read_html() %>%
html_table()
## [[1]]
## # A tibble: 1 x 2
## `Header (1:2, 1)` `Header (1, 2)`
## <chr> <chr>
## 1 Header (1:2, 1) cell (2, 2)
## [[1]]
## # A tibble: 4 x 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th rowspan=\"2\">Header (1:2, 1)</th>"
## 2 2 1 html <NA>
## 3 1 2 html "<th>Header (1, 2)</th>"
## 4 2 2 html "<td>cell (2, 2)</td>"
colspan <- system.file("extdata", "colspan.html", package = "unpivotr")
includeHTML(colspan)
Header (1, 1:2) | |
---|---|
cell (2, 1) | cell (2, 2) |
# rvest
colspan %>%
read_html() %>%
html_table()
## [[1]]
## # A tibble: 1 x 2
## `Header (1, 1:2)` `Header (1, 1:2)`
## <chr> <chr>
## 1 cell (2, 1) cell (2, 2)
## [[1]]
## # A tibble: 4 x 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th colspan=\"2\">Header (1, 1:2)</th>"
## 2 2 1 html "<td>cell (2, 1)</td>"
## 3 1 2 html <NA>
## 4 2 2 html "<td>cell (2, 2)</td>"
rowandcolspan <- system.file("extdata",
"row-and-colspan.html",
package = "unpivotr")
includeHTML(rowandcolspan)
Header (1:2, 1:2) | Header (2, 3) | |
---|---|---|
cell (3, 1) | cell (3, 2) | cell (3, 3) |
# rvest
rowandcolspan %>%
read_html() %>%
html_table()
## [[1]]
## # A tibble: 1 x 5
## `Header (1:2, 1:2)` `Header (1:2, 1:2)` `Header (2, 3)` `` ``
## <chr> <chr> <chr> <chr> <chr>
## 1 Header (1:2, 1:2) Header (1:2, 1:2) cell (3, 1) cell (3, 2) cell (3, …
## [[1]]
## # A tibble: 10 x 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th colspan=\"2\" rowspan=\"2\">Header (1:2, 1:2)</th…
## 2 2 1 html <NA>
## 3 1 2 html <NA>
## 4 2 2 html <NA>
## 5 1 3 html "<th>Header (2, 3)</th>"
## 6 2 3 html "<td>cell (3, 1)</td>"
## 7 1 4 html <NA>
## 8 2 4 html "<td>cell (3, 2)</td>"
## 9 1 5 html <NA>
## 10 2 5 html "<td>cell (3, 3)</td>"
as_cells()
never descends into cells. If there is a table inside a cell, then to parse that table use html_table
again on that cell.
nested <- system.file("extdata", "nested.html", package = "unpivotr")
includeHTML(nested)
Header (1, 1) | Header (1, 2) | ||||
---|---|---|---|---|---|
cell (2, 1) |
|
# rvest parses both tables
nested %>%
read_html() %>%
html_table(fill = TRUE)
## [[1]]
## # A tibble: 3 x 6
## `Header (1, 1)` `Header (1, 2)` `` `` `` ``
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 cell (2, 1) "Header (2, 2)(1, 1)\n … Header … Header… cell (… cell (…
## 2 Header (2, 2)(1… "Header (2, 2)(1, 2)" <NA> <NA> <NA> <NA>
## 3 cell (2, 2)(2, … "cell (2, 2)(2, 1)" <NA> <NA> <NA> <NA>
##
## [[2]]
## # A tibble: 1 x 2
## `Header (2, 2)(1, 1)` `Header (2, 2)(1, 2)`
## <chr> <chr>
## 1 cell (2, 2)(2, 1) cell (2, 2)(2, 1)
## # A tibble: 4 x 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html "<th>Header (1, 1)</th>"
## 2 2 1 html "<td>cell (2, 1)</td>"
## 3 1 2 html "<th>Header (1, 2)</th>"
## 4 2 2 html "<td>\n <table>\n<tr>\n<th>Header (2, 2)(1, 1)…
# The html of the table inside a cell
cell <-
x %>%
dplyr::filter(row == 2, col == 2) %>%
.$html
cell
## [1] "<td>\n <table>\n<tr>\n<th>Header (2, 2)(1, 1)</th>\n <th>Header (2, 2)(1, 2)</th>\n </tr>\n<tr>\n<td>cell (2, 2)(2, 1)</td>\n <td>cell (2, 2)(2, 1)</td>\n </tr>\n</table>\n</td>"
## [[1]]
## # A tibble: 4 x 4
## row col data_type html
## <int> <int> <chr> <chr>
## 1 1 1 html <th>Header (2, 2)(1, 1)</th>
## 2 2 1 html <td>cell (2, 2)(2, 1)</td>
## 3 1 2 html <th>Header (2, 2)(1, 2)</th>
## 4 2 2 html <td>cell (2, 2)(2, 1)</td>
A motivation for using unpivotr::as_cells()
is that it extracts more than just text – it can extract whatever part of the HTML you need.
Here, we extract URLs.
urls <- system.file("extdata", "url.html", package = "unpivotr")
includeHTML(urls)
Scraping HTML. | ||
Sweet | as? |
Yeah, right. |
cell_url <- function(x) {
if (is.na(x)) return(NA)
x %>%
read_html %>%
html_nodes("a") %>%
html_attr("href")
}
cell_text <- function(x) {
if (is.na(x)) return(NA)
x %>%
read_html %>%
html_nodes("a") %>%
html_text()
}
urls %>%
read_html() %>%
as_cells() %>%
.[[1]] %>%
mutate(text = purrr::map(html, cell_text),
url = purrr::map(html, cell_url)) %>%
tidyr::unnest(text, url)
## Warning: unnest() has a new interface. See ?unnest for details.
## Try `df %>% unnest(c(text, url))`, with `mutate()` if needed
## # A tibble: 8 x 6
## row col data_type html text url
## <int> <int> <chr> <chr> <chr> <chr>
## 1 1 1 html "<td colspan=\"2\">\n<a href=… Scrap… https://www.r-pro…
## 2 1 1 html "<td colspan=\"2\">\n<a href=… HTML. https://cran.r-pr…
## 3 2 1 html "<td><a href=\"https://cran.r… Sweet https://cran.r-pr…
## 4 1 2 html <NA> <NA> <NA>
## 5 2 2 html "<td><a href=\"https://cran.r… as? https://cran.r-pr…
## 6 1 3 html <NA> <NA> <NA>
## 7 2 3 html "<td>\n<a href=\"https://cran… Yeah, https://cran.r-pr…
## 8 2 3 html "<td>\n<a href=\"https://cran… right. https://www.db.co…