HTML Tables

This vignette for the unpivotr package demonstrates unpivoting html tables of various kinds.

The HTML files are in the package directory at system.file("extdata", c("rowspan.html", "colspan.html", "nested.html"), package = "unpivotr").

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rvest)
library(htmltools)
library(unpivotr)

Rowspan and colspan examples

If a table has cells merged across rows or columns (or both), then as_cells() does not attempt to fill the cell contents across the rows or columns. This is different from other packages, e.g. rvest. However, if merged cells cause a table not to be square, then as_cells() pads the missing cells with blanks.

Rowspan

rowspan <- system.file("extdata", "rowspan.html", package = "unpivotr")
includeHTML(rowspan)

HTML table with rowspan

Header (1:2, 1)	Header (1, 2)
Header (1:2, 1)	cell (2, 2)

# rvest
rowspan %>%
  read_html() %>%
  html_table()

## [[1]]
## # A tibble: 1 × 2
##   `Header (1:2, 1)` `Header (1, 2)`
##   <chr>             <chr>          
## 1 Header (1:2, 1)   cell (2, 2)

# unpivotr
rowspan %>%
  read_html() %>%
  as_cells()

## [[1]]
## # A tibble: 4 × 4
##     row   col data_type html                                    
##   <int> <int> <chr>     <chr>                                   
## 1     1     1 html      "<th rowspan=\"2\">Header (1:2, 1)</th>"
## 2     2     1 html       NA                                     
## 3     1     2 html      "<th>Header (1, 2)</th>"                
## 4     2     2 html      "<td>cell (2, 2)</td>"

Colspan

colspan <- system.file("extdata", "colspan.html", package = "unpivotr")
includeHTML(colspan)

HTML table with colspan

Header (1, 1:2)
cell (2, 1)	cell (2, 2)

# rvest
colspan %>%
  read_html() %>%
  html_table()

## [[1]]
## # A tibble: 1 × 2
##   `Header (1, 1:2)` `Header (1, 1:2)`
##   <chr>             <chr>            
## 1 cell (2, 1)       cell (2, 2)

# unpivotr
colspan %>%
  read_html() %>%
  as_cells()

## [[1]]
## # A tibble: 4 × 4
##     row   col data_type html                                    
##   <int> <int> <chr>     <chr>                                   
## 1     1     1 html      "<th colspan=\"2\">Header (1, 1:2)</th>"
## 2     2     1 html      "<td>cell (2, 1)</td>"                  
## 3     1     2 html       NA                                     
## 4     2     2 html      "<td>cell (2, 2)</td>"

Both rowspan and colspan: non-square

rowandcolspan <- system.file("extdata",
                             "row-and-colspan.html",
                             package = "unpivotr")
includeHTML(rowandcolspan)

HTML table with colspan

Header (1:2, 1:2)		Header (2, 3)
Header (1:2, 1:2)		cell (3, 1)	cell (3, 2)	cell (3, 3)

# rvest
rowandcolspan %>%
  read_html() %>%
  html_table()

## [[1]]
## # A tibble: 1 × 5
##   `Header (1:2, 1:2)` `Header (1:2, 1:2)` `Header (2, 3)` ``          ``        
##   <chr>               <chr>               <chr>           <chr>       <chr>     
## 1 Header (1:2, 1:2)   Header (1:2, 1:2)   cell (3, 1)     cell (3, 2) cell (3, …

# unpivotr
rowandcolspan %>%
  read_html() %>%
  as_cells()

## [[1]]
## # A tibble: 10 × 4
##      row   col data_type html                                                   
##    <int> <int> <chr>     <chr>                                                  
##  1     1     1 html      "<th colspan=\"2\" rowspan=\"2\">Header (1:2, 1:2)</th…
##  2     2     1 html       NA                                                    
##  3     1     2 html       NA                                                    
##  4     2     2 html       NA                                                    
##  5     1     3 html      "<th>Header (2, 3)</th>"                               
##  6     2     3 html      "<td>cell (3, 1)</td>"                                 
##  7     1     4 html       NA                                                    
##  8     2     4 html      "<td>cell (3, 2)</td>"                                 
##  9     1     5 html       NA                                                    
## 10     2     5 html      "<td>cell (3, 3)</td>"

Nested example

as_cells() never descends into cells. If there is a table inside a cell, then to parse that table use html_table again on that cell.

nested <- system.file("extdata", "nested.html", package = "unpivotr")
includeHTML(nested)

Nested HTML table

Header (1, 1)

Header (1, 2)

cell (2, 1)

Header (2, 2)(1, 1)	Header (2, 2)(1, 2)
cell (2, 2)(2, 1)	cell (2, 2)(2, 1)

# rvest parses both tables
nested %>%
  read_html() %>%
  html_table(fill = TRUE)

## [[1]]
## # A tibble: 3 × 6
##   `Header (1, 1)`     `Header (1, 2)`                    ``    ``    ``    ``   
##   <chr>               <chr>                              <chr> <chr> <chr> <chr>
## 1 cell (2, 1)         "Header (2, 2)(1, 1)\n           … Head… Head… cell… cell…
## 2 Header (2, 2)(1, 1) "Header (2, 2)(1, 2)"              NA    NA    NA    NA   
## 3 cell (2, 2)(2, 1)   "cell (2, 2)(2, 1)"                NA    NA    NA    NA   
## 
## [[2]]
## # A tibble: 1 × 2
##   `Header (2, 2)(1, 1)` `Header (2, 2)(1, 2)`
##   <chr>                 <chr>                
## 1 cell (2, 2)(2, 1)     cell (2, 2)(2, 1)

# unpivotr
x <-
  nested %>%
  read_html() %>%
  as_cells() %>%
  .[[1]]
x

## # A tibble: 4 × 4
##     row   col data_type html                                                    
##   <int> <int> <chr>     <chr>                                                   
## 1     1     1 html      "<th>Header (1, 1)</th>"                                
## 2     2     1 html      "<td>cell (2, 1)</td>"                                  
## 3     1     2 html      "<th>Header (1, 2)</th>"                                
## 4     2     2 html      "<td>\n          <table>\n<tr>\n<th>Header (2, 2)(1, 1)…

# The html of the table inside a cell
cell <-
  x %>%
  dplyr::filter(row == 2, col == 2) %>%
  .$html
cell

## [1] "<td>\n          <table>\n<tr>\n<th>Header (2, 2)(1, 1)</th>\n              <th>Header (2, 2)(1, 2)</th>\n            </tr>\n<tr>\n<td>cell (2, 2)(2, 1)</td>\n              <td>cell (2, 2)(2, 1)</td>\n            </tr>\n</table>\n</td>"

# Parsing the table inside the cell
cell %>%
  read_html() %>%
  as_cells()

## [[1]]
## # A tibble: 4 × 4
##     row   col data_type html                        
##   <int> <int> <chr>     <chr>                       
## 1     1     1 html      <th>Header (2, 2)(1, 1)</th>
## 2     2     1 html      <td>cell (2, 2)(2, 1)</td>  
## 3     1     2 html      <th>Header (2, 2)(1, 2)</th>
## 4     2     2 html      <td>cell (2, 2)(2, 1)</td>

URL example

A motivation for using unpivotr::as_cells() is that it extracts more than just text – it can extract whatever part of the HTML you need.

Here, we extract URLs.

urls <- system.file("extdata", "url.html", package = "unpivotr")
includeHTML(urls)