Skip to content

Instantly share code, notes, and snippets.

@tjmahr
Last active July 25, 2024 19:45
Show Gist options
  • Save tjmahr/b776ade3d5a179b130303e148c75ec6d to your computer and use it in GitHub Desktop.
Save tjmahr/b776ade3d5a179b130303e148c75ec6d to your computer and use it in GitHub Desktop.
ragged tsv

Some of the rows have more columns than the first row of column names.

writeLines(
"a\tb\tc\td\te\tf
1\t2\t3\t4\t5\t6\t
1\t2\t3\t4\t5\t6\t7\t
1\t2\t3\t4\t5\t6\t7\t8
1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11
1\t2\t3\t4\t5\t6
",
"test.tsv"
)

Note how the last column has entries with tabs \t in it:

# 2021+ behavior
readr::with_edition(
  2,
  readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)
#> # A tibble: 5 × 6
#>       a     b     c     d     e f                   
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <chr>               
#> 1     1     2     3     4     5 "6"                 
#> 2     1     2     3     4     5 "6\t7"              
#> 3     1     2     3     4     5 "6\t7\t8"           
#> 4     1     2     3     4     5 "6\t7\t8\t9\t10\t11"
#> 5     1     2     3     4     5 "6"

These superfluous cells would be lopped off in the past.

# 2020 behavior, before vroom's parsing engine was adopted
readr::with_edition(
  1,
  readr::read_tsv("test.tsv", show_col_types = FALSE)
)
#> Warning: 4 parsing failures.
#> row col  expected     actual       file
#>   1  -- 6 columns 7 columns  'test.tsv'
#>   2  -- 6 columns 8 columns  'test.tsv'
#>   3  -- 6 columns 8 columns  'test.tsv'
#>   4  -- 6 columns 11 columns 'test.tsv'
#> # A tibble: 5 × 6
#>       a     b     c     d     e     f
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6
#> 2     1     2     3     4     5     6
#> 3     1     2     3     4     5     6
#> 4     1     2     3     4     5     6
#> 5     1     2     3     4     5     6

A sensible solution would be read each line separate as a tsv and bind them together.

raw_lines <- readr::read_lines("test.tsv")
rows <- raw_lines |>
  tail(-1) |> 
  lapply(
    function(x) {
      readr::read_tsv(
        I(x), 
        col_types = readr::cols(.default = readr::col_character()),
        col_names = FALSE,
        show_col_types = FALSE
      )  
    }
  ) |> 
  purrr::list_rbind()
    
rows
#> # A tibble: 5 × 11
#>   X1    X2    X3    X4    X5    X6    X7    X8    X9    X10   X11  
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 2 1     2     3     4     5     6     7     <NA>  <NA>  <NA>  <NA> 
#> 3 1     2     3     4     5     6     7     8     <NA>  <NA>  <NA> 
#> 4 1     2     3     4     5     6     7     8     9     10    11   
#> 5 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA>

Then do some surgery to preserve the column names we do have.

first_row <- readr::read_tsv(
  I(head(raw_lines, 2)),
  show_col_types = FALSE
)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)

names(rows)[seq_along(colnames(first_row))] <- colnames(first_row)
readr::type_convert(rows)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   a = col_double(),
#>   b = col_double(),
#>   c = col_double(),
#>   d = col_double(),
#>   e = col_double(),
#>   f = col_double(),
#>   X7 = col_double(),
#>   X8 = col_double(),
#>   X9 = col_double(),
#>   X10 = col_double(),
#>   X11 = col_double()
#> )
#> # A tibble: 5 × 11
#>       a     b     c     d     e     f    X7    X8    X9   X10   X11
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 2     1     2     3     4     5     6     7    NA    NA    NA    NA
#> 3     1     2     3     4     5     6     7     8    NA    NA    NA
#> 4     1     2     3     4     5     6     7     8     9    10    11
#> 5     1     2     3     4     5     6    NA    NA    NA    NA    NA

The meltr package will read in the tsv into a long format.

m <- meltr::melt_tsv("test.tsv")
#> Registered S3 methods overwritten by 'meltr':
#>   method           from 
#>   print.date_names readr
#>   print.locale     readr
m
#> # A tibble: 50 × 4
#>      row   col data_type value
#>    <dbl> <dbl> <chr>     <chr>
#>  1     1     1 character a    
#>  2     1     2 character b    
#>  3     1     3 character c    
#>  4     1     4 character d    
#>  5     1     5 character e    
#>  6     1     6 logical   f    
#>  7     2     1 integer   1    
#>  8     2     2 integer   2    
#>  9     2     3 integer   3    
#> 10     2     4 integer   4    
#> # ℹ 40 more rows

So we can pivot this into a nice format and have the row-binding happen automatically.

m_wide <- m |> 
  dplyr::filter(row != 1) |> 
  tidyr::pivot_wider(
    id_cols = row, 
    names_from = col, 
    values_from = value
  ) |> 
  dplyr::select(-row)
m_wide
#> # A tibble: 9 × 11
#>   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`   `10`  `11` 
#>   <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 2 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 3 1     2     3     4     5     6     7     <NA>  <NA>  <NA>  <NA> 
#> 4 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 5 1     2     3     4     5     6     7     8     <NA>  <NA>  <NA> 
#> 6 1     2     3     4     5     6     7     8     9     10    11   
#> 7 1     2     3     4     5     6     <NA>  <NA>  <NA>  <NA>  <NA> 
#> 8 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
#> 9 <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>

Let’s merge the names we do know into the column names and apply readr’s name repair strategy.

current_names <- m |> 
  dplyr::filter(row == 1) |> 
  dplyr::pull(value)

new_names <- rep("", length(names(m_wide)))
new_names[seq_along(current_names)] <- current_names
names(m_wide) <- vctrs::vec_as_names(new_names, repair = "unique_quiet")

readr::type_convert(m_wide)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   a = col_double(),
#>   b = col_double(),
#>   c = col_double(),
#>   d = col_double(),
#>   e = col_double(),
#>   f = col_double(),
#>   ...7 = col_double(),
#>   ...8 = col_double(),
#>   ...9 = col_double(),
#>   ...10 = col_double(),
#>   ...11 = col_double()
#> )
#> # A tibble: 9 × 11
#>       a     b     c     d     e     f  ...7  ...8  ...9 ...10 ...11
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 3     1     2     3     4     5     6     7    NA    NA    NA    NA
#> 4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 5     1     2     3     4     5     6     7     8    NA    NA    NA
#> 6     1     2     3     4     5     6     7     8     9    10    11
#> 7     1     2     3     4     5     6    NA    NA    NA    NA    NA
#> 8    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#> 9    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA

Created on 2024-07-25 with reprex v2.1.1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment