# This is a conceptual study to leverage the tidyverse syntax and lazy
# evaluation concepts (influenced by sparklyr) to approach API wrapper packages.
#
# Problem statement: API wrappers often require users to download *all* data
# to manipulate and filter the information. Some packages support custom
# queries but often complicate the syntax by adding many parameters within the
# function that may be hard for data scientists to formulate.
#
# Potential solution: API queries can be separated into three principal parts.
# First, query the API metadata to understand column names and data types.
# Second, allow users to specify query parameters without yet downloading
# the data, while still letting them see what they would expect to receive.
# Third, move the actual download to the last step, so data is only
# downloaded after the query has been fully defined. Each of these steps is
# tied together using pipes (%>%).
#
# Why this solution: First, tidyverse syntax helps break apart major types of
# data manipulation steps and ties them together using pipes. The result is a
# simpler code syntax that allows users to easily add or remove data
# manipulation logic. Second, APIs have large upfront costs. Misspecifying the
# query means users may need to wait a long time before they understand the
# mistake and need to reformulate their query. Often, users may opt to just
# download all the data, which puts the onus on local machines. Influenced by
# lazy evaluation in Spark (and the sparklyr syntax), the download of data
# is the final step of the process. The first step is limited to only
# downloading metadata.
# Simple download example
data <- read_api(data = "id") %>% # Fetches columns, data types
  select(cols) %>%                # Select desired columns
  top_n(n) %>%                    # Only download n rows
  download_api()                  # Download the data
# Prior to `download_api()`, evaluating the data frame will present a tibble
# showing column names and expected data types. This explicitly depends on the
# ability to quickly fetch or determine metadata for an API.
# Preparing to download all data
all_data <- read_api(data = "id")
str(all_data)
#> # A tibble: 0 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
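The empty preview shown above could be assembled from metadata alone. A minimal sketch using tibble, where the `meta` list stands in for whatever a parsed metadata response might contain (an assumption, since no metadata endpoint is specified here):

```r
library(tibble)

# Hypothetical parsed metadata: each column name mapped to a zero-length
# prototype of its expected type
meta <- list(Sepal.Length = double(), Sepal.Width = double(),
             Petal.Length = double(), Petal.Width = double(),
             Species      = factor())

# Zero-length columns yield a zero-row tibble: names and types, but no data
as_tibble(meta)
#> # A tibble: 0 x 5
```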
# Only some columns will be downloaded
select_data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width)
str(select_data)
#> # A tibble: 0 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
# Add download_api() to finally retrieve the data from an API
select_data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  download_api()
str(select_data)
#> # A tibble: 150 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 5.1 3.5
#> 2 4.9 3
#> 3 4.7 3.2
#> 4 4.6 3.1
#> 5 5 3.6
#> 6 5.4 3.9
#> 7 4.6 3.4
#> 8 5 3.4
#> 9 4.4 2.9
#> 10 4.9 3.1
#> # … with 140 more rows
# Not all tidy-like functions make sense in preparing to query an API. Also,
# the extent of filtering, grouping, and reshaping (e.g., spread, gather)
# that can be supported depends on specific features of the API.
data <- read_api(data = "id") %>% # Base function to retrieve metadata
  select(cols) %>%                # Select columns
  filter(criteria) %>%            # Filter rows
  arrange(criteria) %>%           # Sort rows
  top_n(int)                      # Select number of rows
# Behind the scenes, parameters are being compiled into valid REST calls
data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  filter(Sepal.Width > 0.1) %>%
  top_n(5) %>%
  download_api(print_url = TRUE)
#> https://www.example.com/api?id=id&$select=Sepal.Length,Sepal.Width&$where=Sepal.Width>0.1&$limit=5
#> # A tibble: 5 x 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 5.1 3.5
#> 2 4.9 3
#> 3 4.7 3.2
#> 4 4.6 3.1
#> 5 5 3.6
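The compilation step might look something like the sketch below. `build_query_url()` is a hypothetical helper, and the SODA-style `$select`/`$where`/`$limit` parameters are an assumption about the target API:

```r
# Hypothetical helper: assemble accumulated query parts into a REST call
build_query_url <- function(base_url, id,
                            select = NULL, where = NULL, limit = NULL) {
  parts <- paste0("id=", id)
  if (!is.null(select))
    parts <- c(parts, paste0("$select=", paste(select, collapse = ",")))
  if (!is.null(where))
    parts <- c(parts, paste0("$where=", where))
  if (!is.null(limit))
    parts <- c(parts, paste0("$limit=", limit))
  paste0(base_url, "?", paste(parts, collapse = "&"))
}

build_query_url("https://www.example.com/api", id = "id",
                select = c("Sepal.Length", "Sepal.Width"),
                where  = "Sepal.Width>0.1", limit = 5)
#> [1] "https://www.example.com/api?id=id&$select=Sepal.Length,Sepal.Width&$where=Sepal.Width>0.1&$limit=5"
```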
# Code can be combined with dplyr/tidyr functions after fetching data. One
# potential issue is naming conflicts with those packages' verbs, which
# namespacing (e.g., dplyr::mutate) avoids.
data <- read_api(data = "id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  top_n(5) %>%
  download_api() %>%
  dplyr::mutate(Sepal.Width.Inches = Sepal.Width * 0.393701)
str(data)
#> # A tibble: 5 x 3
#> Sepal.Length Sepal.Width Sepal.Width.Inches
#> <dbl> <dbl> <dbl>
#> 1 5.1 3.5 1.3779535
#> 2 4.9 3 1.1811030
#> 3 4.7 3.2 1.2598432
#> 4 4.6 3.1 1.2204731
#> 5 5 3.6 1.4173236
# Two key porcelain functions are needed: read_api() and download_api().
# read_api() is focused on retrieving metadata so users can understand what
# is available, and on building out the data frame where data will be placed.
# download_api() accumulates the arguments, compiles the download URL, then
# handles the downloading process.
# These will need to be specific to an API source
read_api(data,     # Some unique identifier for a particular data set
         base_url  # Base URL, e.g., www.example.com/api/v2
         )

download_api(url,       # Optionally, a complete REST call to fetch directly
             print_url, # Print URL on console
             page_size  # How much data to fetch with each GET
             )
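One way to implement the lazy behavior is an S3 class that only records arguments until `download_api()` runs. This is a sketch under the assumption that the package registers methods for the dplyr generics (only select/top_n shown); it requires dplyr and rlang:

```r
library(dplyr)

# The lazy query object: metadata plus accumulated query parts, no data yet
read_api <- function(data, base_url = "https://www.example.com/api") {
  structure(list(data = data, base_url = base_url,
                 select = NULL, limit = NULL),
            class = "api_query")
}

# dplyr generics dispatch here; we record arguments instead of touching data
select.api_query <- function(.data, ...) {
  .data$select <- vapply(rlang::enexprs(...), deparse, character(1))
  .data
}

top_n.api_query <- function(x, n, ...) {
  x$limit <- n
  x
}

download_api <- function(query, print_url = FALSE) {
  url <- paste0(query$base_url, "?id=", query$data,
                if (!is.null(query$select))
                  paste0("&$select=", paste(query$select, collapse = ",")),
                if (!is.null(query$limit))
                  paste0("&$limit=", query$limit))
  if (print_url) message(url)
  url  # a real implementation would GET this URL and parse it into a tibble
}

read_api("id") %>%
  select(Sepal.Length, Sepal.Width) %>%
  top_n(5) %>%
  download_api()
#> [1] "https://www.example.com/api?id=id&$select=Sepal.Length,Sepal.Width&$limit=5"
```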
# Often, API wrappers serve two audiences. First, users who are not familiar
# with APIs and appreciate a syntax that is familiar in R. Second, users who
# can write and understand native REST calls and use a wrapper mainly to
# handle data types and the GET request. The latter group can call
# `download_api()` directly. This is also handy for those who find a fully
# specified URL on the web and want to copy/paste it directly.
# This is a direct example to download data
data <- download_api(url = "http://www.example.com/api/v2?$id=id&$select=Sepal.Width,Sepal.Length&$where=Sepal.Length>1.0")

# The above is equivalent to a tidy-like query. Note that the `url` argument
# to `download_api()` cannot be supplied when it is used at the end of a
# tidy-style pipeline.
data <- read_api(data = "id") %>%
  select(Sepal.Width, Sepal.Length) %>%
  filter(Sepal.Length > 1.0) %>%
  download_api(print_url = TRUE)
#> "http://www.example.com/api/v2?$id=id&$select=Sepal.Width,Sepal.Length&$where=Sepal.Length>1.0"