Skip to content

Instantly share code, notes, and snippets.

@mdsumner
Last active September 2, 2024 21:35
Show Gist options
  • Save mdsumner/e124bc00b924def71f3e632a320f8984 to your computer and use it in GitHub Desktop.
Save mdsumner/e124bc00b924def71f3e632a320f8984 to your computer and use it in GitHub Desktop.

List files in, and stream a (non-spatial) table from, a package source tarball on CRAN.

(we need dev gdalraster for the full dir/vsitar listing capability, but reading from remote files or archives is available in many GDAL versions and existing supported GDAL packages on CRAN)

## root of the CRAN source-package directory
cransrc <- "https://cran.r-project.org/src/contrib"
library(gdalraster)  ## for listing dirs recursively we need gh:USDAForestService/gdalraster for now
#> GDAL 3.10.0dev-449d5f09b7, released 2024/08/26, GEOS 3.12.2, PROJ 9.4.1

## every entry in the CRAN source directory (the package .tar.gz files)
r_pkgs <- vsi_read_dir(glue::glue("/vsicurl/{cransrc}"), recursive = FALSE)
## pick out the arrow tarball and address it as /vsitar/ chained onto /vsicurl/
arrow_tarball <- grep('^arrow_', r_pkgs, value = TRUE)
src <- glue::glue("/vsitar//vsicurl/{cransrc}/{arrow_tarball}")
## full recursive listing of the tarball, narrowed to its inst/ directory
tarball_files <- vsi_read_dir(src, recursive = TRUE)
inst_files <- grep("/inst/", tarball_files, value = TRUE)
## of the Parquet files under inst/, keep the first (it is not spatial)
parq_matches <- grep('parquet$', inst_files, value = TRUE)
parq_file <- parq_matches[1]
parq_file
#> [1] "arrow/inst/v0.7.1.parquet"
## data source name: the tarball path plus the file's path inside it
dsn <- glue::glue("{src}/{parq_file}")

## now this of course works in all the GDAL packages
## terra: with proxy = TRUE the result is a SpatVectorProxy (see the printout below)
terra::vect(dsn, proxy = TRUE)
#>  class       : SpatVectorProxy
#>  geometry    : none 
#>  dimensions  : 10, 11  (geometries, attributes)
#>  extent      : NaN, NaN, NaN, NaN  (xmin, xmax, ymin, ymax)
#>  source      : v0.7.1.parquet
#>  coord. ref. :  
#>  names       : carat   cut color clarity depth table price     x     y     z
#>  type        : <num> <chr> <chr>   <chr> <num> <num> <int> <num> <num> <num>
#>  __index_level_0__
#>              <int>
## sf would read the same dsn; the call is left disabled here
#sf::read_sf(dsn)
## gdalraster: open the dsn as a GDALVector object and fetch a single feature,
## which comes back as a named list of field values (see the printout below)
v <- new(gdalraster::GDALVector, dsn)
v$getNextFeature()
#> $FID
#> integer64
#> [1] 0
#> 
#> $carat
#> [1] 0.23
#> 
#> $cut
#> [1] "Ideal"
#> 
#> $color
#> [1] "E"
#> 
#> $clarity
#> [1] "SI2"
#> 
#> $depth
#> [1] 61.5
#> 
#> $table
#> [1] 55
#> 
#> $price
#> integer64
#> [1] 326
#> 
#> $x
#> [1] 3.95
#> 
#> $y
#> [1] 3.98
#> 
#> $z
#> [1] 2.43
#> 
#> $`__index_level_0__`
#> integer64
#> [1] 0

## vapour: read the attribute fields as a list of column vectors, capped at 5 features
vapour::vapour_read_fields(dsn, limit_n = 5)
#> $carat
#> [1] 0.23 0.21 0.23 0.29 0.31
#> 
#> $cut
#> [1] "Ideal"   "Premium" "Good"    "Premium" "Good"   
#> 
#> $color
#> [1] "E" "E" "E" "I" "J"
#> 
#> $clarity
#> [1] "SI2" "SI1" "VS1" "VS2" "SI2"
#> 
#> $depth
#> [1] 61.5 59.8 56.9 62.4 63.3
#> 
#> $table
#> [1] 55 61 65 58 58
#> 
#> $price
#> [1] 326 326 327 334 335
#> 
#> $x
#> [1] 3.95 3.89 4.05 4.20 4.34
#> 
#> $y
#> [1] 3.98 3.84 4.07 4.23 4.35
#> 
#> $z
#> [1] 2.43 2.31 2.31 2.63 2.75
#> 
#> $`__index_level_0__`
#> [1] 0 1 2 3 4

Created on 2024-08-29 with reprex v2.1.0

@mdsumner
Copy link
Author

Turn that into a file finder for CRAN source tarballs:

  ## devtools::install_github("USDAForestService/gdalraster")
## search for a file in a package on CRAN
##
## Lists the files inside a package's source tarball (read remotely through
## GDAL's /vsitar/ + /vsicurl/ handlers) and returns those whose path matches
## `pattern`.
##
## package  single package name, matched literally against the "<name>_" prefix
##          of the tarball file names in `cran`
## pattern  regular expression applied to the file paths inside the tarball
## full     if TRUE (default) return complete /vsitar/ paths usable as a GDAL
##          data source name, otherwise paths relative to the tarball root
## ...      further arguments forwarded to grep() (e.g. ignore.case, fixed)
## cran     URL of the directory that holds the source tarballs
##
## Returns a vector of matching paths (zero-length when nothing matches).
## Stops with an error if `package` is not a single string or if it does not
## identify exactly one tarball in `cran`.
search_src <- function(package, pattern, full = TRUE, ..., cran = "https://cran.r-project.org/src/contrib" ) {
  if (!is.character(package) || length(package) != 1L || is.na(package) || !nzchar(package)) {
    stop("`package` must be a single non-empty string", call. = FALSE)
  }
  ## all the tar balls
  r_pkgs <- gdalraster::vsi_read_dir(glue::glue("/vsicurl/{cran}"), recursive = FALSE)
  ## the tarball we want: literal prefix match, so "." in a name such as
  ## "data.table" is not treated as a regex wildcard
  pkg <- r_pkgs[startsWith(as.character(r_pkgs), paste0(package, "_"))]
  ## fail here with a clear message rather than letting an empty or
  ## multi-element path reach the directory listing below
  if (length(pkg) != 1L) {
    stop(sprintf("expected exactly one source tarball for package '%s' in %s, found %d",
                 package, cran, length(pkg)), call. = FALSE)
  }
  tarball <- glue::glue("/vsitar//vsicurl/{cran}/{pkg}")
  pkgfiles <- gdalraster::vsi_read_dir(tarball, recursive = TRUE)
  files <- pkgfiles[grep(pattern, pkgfiles, ...) ]
  if (full) {
    files <- glue::glue("{tarball}/{files}")
  }

  files
}
## the Luxembourg shapefile shipped with terra
search_src("terra", "lux.*shp")
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/terra_1.7-78.tar.gz/terra/inst/ex/lux.shp


## all GeoPackage files in sf; `pattern` is a regular expression, not a glob,
## so anchor on the literal ".gpkg" extension instead of a leading "*"
search_src("sf", "\\.gpkg$")
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/sf_1.0-16.tar.gz/sf/inst/gpkg/tl.gpkg
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/sf_1.0-16.tar.gz/sf/inst/gpkg/b_pump.gpkg
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/sf_1.0-16.tar.gz/sf/inst/gpkg/buildings.gpkg
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/sf_1.0-16.tar.gz/sf/inst/gpkg/nospatial.gpkg
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/sf_1.0-16.tar.gz/sf/inst/gpkg/nc.gpkg

## everything whose path contains "extdata" in gdalraster
search_src("gdalraster", "extdata")
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/ynp_fires_1984_2022.gpkg
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_evt.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storm_lake.prj
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storm_lake.lcp
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_elev.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/sr_b4_20200829.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/metadata.zip
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/test.geojson
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_evc.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/sr_b5_20200829.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_tcc.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/LF20_EVC_220.csv
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/LF20_F40_220.csv
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_pts.csv
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/int64.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/LF20_EVH_220.csv
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/storml_evh.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/sr_b6_20200829.tif
#> /vsitar//vsicurl/https://cran.r-project.org/src/contrib/gdalraster_1.11.1.tar.gz/gdalraster/inst/extdata/LF20_EVT_220.csv

Created on 2024-08-29 with reprex v2.1.0

@mdsumner
Copy link
Author

we can also stream from an in-memory array

There's a helper for the GDAL MEM driver in dsn::mem(), ximage::xcontour() will draw output from vapour::gdal_raster_data(), and the Copernicus 30m DEM is available via sds::cop30().

## helper packages used below, installed from GitHub
devtools::install_github("hypertidy/dsn")
devtools::install_github("hypertidy/sds")
devtools::install_github("hypertidy/ximage")
## extent and CRS to attach to the volcano matrix
## (presumably xmin, xmax, ymin, ymax -- confirm against dsn::mem())
ex  <- c(2667400, 2668010, 6478700, 6479570)
crs <- "EPSG:27200"
## we need this orientation
v <- t(volcano[nrow(volcano):1, ncol(volcano):1])
## now point to that matrix in memory, apply the extent and crs
terra::plot(terra::rast(dsn::mem(v, extent = ex, projection = crs)))

## Copernicus DEM warped to the same extent and CRS at a resolution of 10
cl <- vapour::gdal_raster_data(
  sds::cop30(),
  target_ext = ex, target_crs = crs, target_res = 10, resample = "cubic"
)
## overlay contours on the existing plot; TRUE is spelled out because `T` is
## an ordinary variable that can be reassigned
ximage::xcontour(cl, add = TRUE)

Created on 2024-08-29 with reprex v2.0.2

@aitap
Copy link

aitap commented Aug 29, 2024

Some extra complexity (mostly taken from download.packages) will save CRAN some resources by letting available.packages() download the list of packages from a mirror and cache it for the duration of the session:

## Build the /vsitar/ path for `package` from the repository index instead of
## listing the CRAN directory. `package` is expected to be defined by the
## surrounding code (e.g. the argument of search_src()).
ap <- available.packages(type = 'source')
apkg <- ap[ap[,'Package'] == package, c('Repository', 'Package', 'Version'), drop = FALSE]
## the index may contain no such package, or more than one row for it;
## either way the path built below would be empty or ambiguous
if (nrow(apkg) != 1L) {
  stop(sprintf("expected exactly one source package named '%s', found %d",
               package, nrow(apkg)), call. = FALSE)
}
# FIXME: a repo could specify ap[,'File'] instead, but CRAN currently doesn't
tarball <- glue::glue("/vsitar//vsicurl/{apkg[,'Repository']}/{apkg[,'Package']}_{apkg[,'Version']}.tar.gz")

The package is downloaded from the mirror as well.

@mdsumner
Copy link
Author

Hey thanks, I actually realized that step was unnecessary - I use tools::CRAN_package_db() fwiw

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment