ragged tsv

Some of the rows have more columns than the first row of column names.

writeLines(
"a\tb\tc\td\te\tf
1\t2\t3\t4\t5\t6\t
1\t2\t3\t4\t5\t6\t7\t
1\t2\t3\t4\t5\t6\t7\t8
1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11
1\t2\t3\t4\t5\t6
",
"test.tsv"
)

Note how the last column has entries with tabs \t in it:

# 2021+ behavior
readr::with_edition(
  2,
  readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)
#> # A tibble: 5 × 6
#>       a     b     c     d     e f                   
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>               
#> 1     1     2     3     4     5 "6"                 
#> 2     1     2     3     4     5 "6\t7"              
#> 3     1     2     3     4     5 "6\t7\t8"           
#> 4     1     2     3     4     5 "6\t7\t8\t9\t10\t11"
#> 5     1     2     3     4     5 "6"

These superfluous cells would be lopped off in the past.

# 2020 behavior, before vroom's parsing engine was adopted
readr::with_edition(
  1,
  readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: 4 parsing failures.
#> row col  expected     actual       file
#>   1  -- 6 columns 7 columns  'test.tsv'
#>   2  -- 6 columns 8 columns  'test.tsv'
#>   3  -- 6 columns 8 columns  'test.tsv'
#>   4  -- 6 columns 11 columns 'test.tsv'
#> # A tibble: 5 × 6
#>       a     b     c     d     e     f
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6
#> 2     1     2     3     4     5     6
#> 3     1     2     3     4     5     6
#> 4     1     2     3     4     5     6
#> 5     1     2     3     4     5     6

A sensible solution would be read each line separate as a tsv and bind them together.

raw_lines <- readr::read_lines("test.tsv")
rows <- raw_lines |>
  tail(-1) |> 
  lapply(
    function(x) {
      readr::read_tsv(
        I(x), 
        col_types = readr::cols(.default = readr::col_character()),
        col_names = FALSE,
        show_col_types = FALSE
      )  
    }
  ) |> 
  purrr::list_rbind()
    
rows
#> # A tibble: 5 × 11
#>   X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11  
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 2 1     2     3     4     5     6     7     <NA>  <NA>  <NA>  <NA> 
#> 3 1     2     3     4     5     6     7     8     <NA>  <NA>  <NA> 
#> 4 1     2     3     4     5     6     7     8     9     10    11   
#> 5 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA>

Then do some surgery to preserve the column names we do have.

first_row <- readr::read_tsv(
  I(head(raw_lines, 2)),
  show_col_types = FALSE
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)

names(rows)[seq_along(colnames(first_row))] <- colnames(first_row)
readr::type_convert(rows)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   a = col_double(),
#>   b = col_double(),
#>   c = col_double(),
#>   d = col_double(),
#>   e = col_double(),
#>   f = col_double(),
#>   X7 = col_double(),
#>   X8 = col_double(),
#>   X9 = col_double(),
#>   X10 = col_double(),
#>   X11 = col_double()
#> )
#> # A tibble: 5 × 11
#>       a     b     c     d     e     f    X7    X8    X9   X10   X11
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 2     1     2     3     4     5     6     7    NA    NA    NA    NA
#> 3     1     2     3     4     5     6     7     8    NA    NA    NA
#> 4     1     2     3     4     5     6     7     8     9    10    11
#> 5     1     2     3     4     5     6    NA    NA    NA    NA    NA

The meltr package will read in the tsv into a long format.

m <- meltr::melt_tsv("test.tsv")
#> Registered S3 methods overwritten by 'meltr':
#>   method           from 
#>   print.date_names readr
#>   print.locale     readr
m
#> # A tibble: 50 × 4
#>      row   col data_type value
#>    <dbl> <dbl> <chr>     <chr>
#>  1     1     1 character a    
#>  2     1     2 character b    
#>  3     1     3 character c    
#>  4     1     4 character d    
#>  5     1     5 character e    
#>  6     1     6 logical   f    
#>  7     2     1 integer   1    
#>  8     2     2 integer   2    
#>  9     2     3 integer   3    
#> 10     2     4 integer   4    
#> # ℹ 40 more rows

So we can pivot this into a nice format and have the row-binding happen automatically.

m_wide <- m |> 
  dplyr::filter(row != 1) |> 
  tidyr::pivot_wider(
    id_cols = row, 
    names_from = col, 
    values_from = value
  ) |> 
  dplyr::select(-row)
m_wide
#> # A tibble: 9 × 11
#>   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 2 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 3 1     2     3     4     5     6     7     <NA>  <NA>  <NA>  <NA> 
#> 4 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 5 1     2     3     4     5     6     7     8     <NA>  <NA>  <NA> 
#> 6 1     2     3     4     5     6     7     8     9     10    11   
#> 7 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 8 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 9 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>

Let’s merge the names we do know into the column names and apply readr’s name repair strategy.

current_names <- m |> 
  dplyr::filter(row == 1) |> 
  dplyr::pull(value)

new_names <- rep("", length(names(m_wide)))
new_names[seq_along(current_names)] <- current_names
names(m_wide) <- vctrs::vec_as_names(new_names, repair = "unique_quiet")

readr::type_convert(m_wide)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   a = col_double(),
#>   b = col_double(),
#>   c = col_double(),
#>   d = col_double(),
#>   e = col_double(),
#>   f = col_double(),
#>   ...7 = col_double(),
#>   ...8 = col_double(),
#>   ...9 = col_double(),
#>   ...10 = col_double(),
#>   ...11 = col_double()
#> )
#> # A tibble: 9 × 11
#>       a     b     c     d     e     f  ...7  ...8  ...9 ...10 ...11
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 3     1     2     3     4     5     6     7    NA    NA    NA    NA
#> 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 5     1     2     3     4     5     6     7     8    NA    NA    NA
#> 6     1     2     3     4     5     6     7     8     9    10    11
#> 7     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 8    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 9    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA

^{Created on 2024-07-25 with reprex v2.1.1}

tjmahr/ragged-tsv.md