bohdanszymanik · May 7, 2021 04:04
diff --git a/LiquorLicenceRegisterAddressLookup.R b/LiquorLicenceRegisterAddressLookup.R
 ##################################################################################
 # Import liquor licencing data, geocode and plot
 # Plot on a NZ map
 # Show a regional view - some demographics eg population density at SA2
 # Import the public NZ Police aggregated crime data
 # Look at crime harm regional cf liquor outlet distribution
 #

 library(readxl)
 library(rgdal)
 library(broom)
 library(dplyr)
 library(ggplot2)
 library(ggmap)
 library(memoise)
 library(naniar)
 library(stringr)
 library(purrr)
 library(tmaptools)
 library(logger)
 library(cowplot)
 library(ggrepel)
 library(sf)
 library(nngeo)
 library(rlist)
 library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf
 library(pryr)
 library(stringdist)
 library(ngram)
 library(areal)

 setwd("h:\\Documents\\wd\\erp")

 ################################################################################
 # Bringing in liquour licensing data
 #
 # read the liquor licensing licences register in two steps
 # first get the column names
 cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names()
 # then the raw data
 llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames)

 # getting a comma separated address - bit pointless having its own function?
 #createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) {
 #  paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',')
 #}
 # eg
 #llr %>% rowwise() %>% pmap_chr(createAddress)

 llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') )
 llr2 %>% slice_sample(n=50)
 llr2 %>% slice_head(n=5)


 geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
 geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
 #geocode_OSM("5 Gillies St, Kawakawa, new zealand")
 #geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords
 #typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
 #length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
 #str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
 #is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
 #geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"]


 #log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors
 log_formatter(formatter_glue)
 log_layout(layout_simple)

 log_threshold(TRACE)

 #t <- tempfile()
 t <- "mylogger.log"
 log_appender(appender_file(t))

 #readLines(t)
 #unlink(t)


 cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE)
 # cache_memory(algo="sha512")

 # geocode_OSM defaults to EPSG 4326 lat lon
 # st_crs can be used to set coordinate reference system on underlying sf objects
 # not sure if nztm is supported by sf

 lookupAddress <- memoise (
  function(address) {
    geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon
    
    if (is.null(geocode)) {
      log_trace('{address}')
      NA
    } else { 
      
      log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}')
      geocode$coords 
    }
  },
  cache = cf
 )


 lookupAddress("5 Gillies St,Kawakawa,New Zealand")
 # has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
 # drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
 # forget(lookupAddress)
 #geocode_OSM("5 Gillies St,Kawakawa,New Zealand")

 # it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out
 llr3 <- 
  llr2 %>% 
  # slice_head(n=20) %>% 
  select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>%
  rowwise %>%
  dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>%
  dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"])

 # we really need to be doing this for licences that are currently active
 # that means date today is after `Date Licence is Valid From` and before `Date Licence Expires`

 llr3Active <-
  llr3 %>% 
  # slice_head(n=20) %>% 
  filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`)
 # down from 10719 obs to 9547 observations


 ################################################################################

 # Plotting

 # first we run against the natural earth data which gives a simple global map with countries
 # then we'll go to an NZ map with regional boundaries

 library("rnaturalearth")
 library("rnaturalearthdata")

 world <- ne_countries(scale = "medium", returnclass = "sf")
 class(world)

 ggplot(data = world) +
  geom_sf() +
  coord_sf(expand=F)

 ggplot(data = world) +
  geom_sf() +
  label_graticule = "EW" +
  xlab("Longitude") + ylab("Latitude") +
  ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)"))

 # # example of colouring the earth map using an aesthetic based off population data
 # ggplot(data = world) +
 #   geom_sf(aes(fill = pop_est)) +
 #   scale_fill_viridis_c(option = "plasma", trans = "sqrt")

 # # plotting just nz out of the rnaturalearth data set
 # nz <- world[world$name == 'New Zealand',]
 # 
 # ggplot(data = nz) +
 #   geom_sf() +


 # # test by plotting one or two points
 # two ways to create tibbles on the fly - tribble is quite nice
 # tibble( lat = -35.11244, lon = 173.26267)
 # 
 # tribble (
 #   ~lat, ~lon,
 #   -35.11244, 173.26267,
 #   -35.73572, 174.33096,
 # )
 # 
 # 
 # ggplot(data = world) +
 #   geom_sf() +
 #   geom_point(data = tibble( lat = -35.11244, lon = 173.26267), 
 #              aes(x = lon, y = lat), 
 #              size = 2, 
 #              shape = 23, 
 #              fill = "darkred") +
 #   coord_sf(
 #     crs = 4326, #"EPSG:4326", or equivalently "WGS84"
 #     xlim = c(164, 179), 
 #     ylim = c(-48, -33), 
 #     expand = FALSE)

 ggplot(data = world) +
  geom_sf() +
  geom_point(data = llr3, aes(x = lon, y = lat), size = 1,
             shape = 23, fill = "darkred") +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179), 
    ylim = c(-48, -33), 
    expand = FALSE)


 ################################################################################

 # Time to bring in an NZ map with regions

 # We can use the annual boundaries published with population data collected during the census
 # With meshblocks it gets real tough without decent memory so let's use a large grain size
 # estimated resident population by Statistical Area 2
 # https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/

 # In the download dialog box you get a choice of geodatabase or shapefile for the download
 # In this example I'm using shapefile

 # unzip the shapefile to a directory and then load it like this
 # the zipped shape file doesn't make it fully clear here that it's actually SA2 regions
 # if you open it up in explorer and take a look at the txt or pdf files 
 # they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2
 shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical")

 tidy_nzdf <- tidy(shp)

 # Pulling out the polygon id to match on when trying to merge with meshblock id 
 shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))

 # Merge to get meshblock ids (again can take a couple mins)
 nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID")


 map <- ggplot() +
  geom_path(data = nz_df,
            aes(x = long, y = lat, group = group),
            colour = 'grey', fill = 'white', size = 0.2)

 print(map)

 populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) + 
  geom_polygon(aes(fill=erp96), colour='gray', size=0.1) + 
  theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) +
  scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") + 
  guides(fill=guide_legend(title="Estimated resident population"))

 # plots out with some colour aesthetics for the regions
 populationMap


 # all very nice but now we want to sum up the number of liquor licenses sitting within each region
 # then, with the population being known we can generate a normalised liquor license measure
 # then we want to compare to 
 #   crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place
 #   police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity

 # todo - filter out missing values for coords in llr3
 llr4 <-
  llr3 %>%
  # slice_head(n=20) %>% 
  filter(!is.na(lat)) %>%
  filter(!is.na(lon)) %>%
  filter(between(lat, -48, -33 )) %>% # reasonableness clipping
  filter(between(lon, 164, 179 ))     # reasonableness clipping


 licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326)
 lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326)

 ggplot() + geom_sf(data=licence_sf_points)  # plot the licence locations

 # my first attempt at working with nz region data is via the spData package
 # library(spData)
 # licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum)
 # 
 # licenceSumPlt <- ggplot(data = licenses)
 # licenceSumPlt
 # 
 # plot(licenceSum)
 # 
 # class(licence_sf_points)
 # head(licence_sf_points)
 # class(licenceSum)
 # head(licenceSum)
 # 
 # class(nz)
 # head(nz)
 # 
 # ggplot() + geom_sf(data=nz)
 # plot(nz)
 # 
 # nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df
 # 
 # m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0))
 # m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0))
 # pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2)))
 # set.seed(1985)
 # d = data.frame(matrix(runif(15), ncol = 3))
 # p = st_as_sf(x = d, coords = 1:2)
 # head(p)
 # plot(pol)
 # plot(p, add = TRUE)
 # 
 # (p_ag1 = aggregate(p, pol, mean))
 # 
 # this isn't working the way I thought.
 # how about this
 class(nz_df)
 # nz_df is just a dataframe - I want a sf class

 # I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf
 library(rmapshaper)

 # I've downloaded the SA2 dataset as a geopackage this time
 unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip")
 nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb")
 print(object.size(nz_full), units= "Kb") # 30784.2 Kb

 # make our memory issues go away... refer to line 435 for better examples

 # # we can use other native r packages for the feature simplification
 # library(rgeos)
 # # rgeos doens't work with sf objects, only sp, so you have to cast
 # 
 # nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf()
 # print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm

 class(nz_full) # good - it's an sf dataframe

 # I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84
 st_crs(licence_sf_points) # WGS 84, EPSG:4326
 st_crs(nz_full) # NZGD2000, EPSG:2193

 (nz_full_WGS84 <- st_transform(nz_full, 4326))

 # Bit confused over aggregate but I think I have it figured now
 # I'm using 'length' as the function but it's just going to generate the same length for every field
 licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length)

 # alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec
 # where I've purposely got a field in licence_sf_points that just contains 1s
 (licenseRegionCount <- nz_full_WGS84 %>%
    st_join(licence_sf_points1) %>%
    group_by(SA22020_V1_00_NAME) %>%
    summarize(count = sum(rowone, na.rm = TRUE)))
 head(licenseRegionCount)


 # this is better I think
 # The count is an explicit field

 ####################
 # I think this is somewhat working!
 ggplot() + 
  geom_sf(data = licenseDensity, aes(fill=count)) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)


 # what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479
 (intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front

 ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1]) +
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)


 licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length)



 ###########################################
 nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco")

 head(nashp)
 class(nashp)

 # generate an sf dataframe
 (nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193))

 # transform to wgs84
 (nalicences_WGS84 <- st_transform(nalicences, 4326))

 # plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames
 ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1], colour="blue", size=2) +
  geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) + 
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

 # the answer is to do it like this
 ggplot() +
  geom_sf(data = st_transform(nz_full, 4326)) +
  geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
  geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) + 
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE) 

 # this is quite slow to plot out but we can investigate what's going on
 # bench_time(expr)
 # bench_memory(expr)
 # bench::mark( some series of functions )

 benchMarkResults <- bench::mark( ggplot() +
                                   geom_sf(data = st_transform(nz_full, 4326)) +
                                   geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
                                   geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) + 
                                   coord_sf(
                                     crs = 4326, #"EPSG:4326", or equivalently "WGS84"
                                     xlim = c(164, 179),
                                     ylim = c(-48, -33),
                                     expand = FALSE) 
 )
 View(benchMarkResults)



 # the plot appears to show that all/most MoJ records have a matching NA record - and differences?
 # maybe for each item in the NA dataset look for a nearby matching MoJ record
 # and can I put a boundary region on this so I don't need to test every point to point distance?

 # possible approaches in
 # https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor
 # https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis

 # or we can just do a pair wise distance calc using st_distance

 # (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better?
 # this still calculates over the full product of points so we need a way to just get the nearest neighbour features

 # maybe just take a fraction of points to start with
 (sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better?

 # I just popped back to look at simplification again to see if I can save some memory on the polygon features
 # two examples of simplification: one using st_simplify; the other ms_simplify
 ggplot() +
  geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

 ggplot() +
  geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices
  coord_sf(
    crs = 4326, #"EPSG:4326", or equivalently "WGS84"
    xlim = c(164, 179),
    ylim = c(-48, -33),
    expand = FALSE)

 # let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want
 p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16)
 # using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies

 # let's get the structure of the created list
 str(p2p)
 # List of two, $nn and $dist
 # I'm guessing $nn identifies the neighbour somehow

 # lets histogram the distribution of distances - sample first
 distanceSample <- unlist(list.sample(p2p$dist, 100))
 ggplot() + geom_histogram(aes(distanceSample))

 # may as well do the lot
 ggplot() + geom_histogram(aes(unlist(p2p$dist)))
 # so we have vast majority of nearest neighbours matching within just a few metres

 # do we have duplicated records?
 nalicences %>%
  group_by(ORG_ID) %>%
  filter(n() > 1 )
 # answer = no

 # filtering out the Not Applicable TYPE from nalicences
 nalicencesApplicable <- nalicences %>%
                          filter(TYPE != "Not Applicable" ) %>%
                          length()


 # next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures
 # n-grams work well but we have others like cosine/jacquard etc
 # https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
 # https://cran.r-project.org/web/packages/stringdist/stringdist.pdf



 # small aside - might try and check how apportionment works in R for two data sets we can do the exact same
 # thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates
 # https://koordinates.com/layer/3825-nz-police-area-boundaries/
 # looks like I can use sf::st_interpolate_aw or, more complete, the areal package
 PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori")

 nrow(shp) # features in source
 nrow(PoliceAreasShp) # features in target

 nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp))))
 (sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) )

 sourceERP <- st_as_sf(shp)
 targetPoliceAreas <- st_as_sf(PoliceAreasShp)

 apportioned <- areal::aw_interpolate(targetPoliceAreas, 
                                     tid = "AREA_ID", 
                                     source = sourceERP, 
                                     sid = "SA22020_V1", 
                                     weight = "sum", 
                                     output = "sf", 
                                     extensive = c("erp18"))

 head(apportioned)
	##################################################################################
	# Import liquor licencing data, geocode and plot
	# Plot on a NZ map
	# Show a regional view - some demographics eg population density at SA2
	# Import the public NZ Police aggregated crime data
	# Look at crime harm regional cf liquor outlet distribution
	#

	library(readxl)
	library(rgdal)
	library(broom)
	library(dplyr)
	library(ggplot2)
	library(ggmap)
	library(memoise)
	library(naniar)
	library(stringr)
	library(purrr)
	library(tmaptools)
	library(logger)
	library(cowplot)
	library(ggrepel)
	library(sf)
	library(nngeo)
	library(rlist)
	library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf
	library(pryr)
	library(stringdist)
	library(ngram)
	library(areal)

	setwd("h:\\Documents\\wd\\erp")

	################################################################################
	# Bringing in liquour licensing data
	#
	# read the liquor licensing licences register in two steps
	# first get the column names
	cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names()
	# then the raw data
	llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames)

	# getting a comma separated address - bit pointless having its own function?
	#createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) {
	# paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',')
	#}
	# eg
	#llr %>% rowwise() %>% pmap_chr(createAddress)

	llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') )
	llr2 %>% slice_sample(n=50)
	llr2 %>% slice_head(n=5)


	geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
	geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org")
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords
	#typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords)
	#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"]


	#log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors
	log_formatter(formatter_glue)
	log_layout(layout_simple)

	log_threshold(TRACE)

	#t <- tempfile()
	t <- "mylogger.log"
	log_appender(appender_file(t))

	#readLines(t)
	#unlink(t)


	cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE)
	# cache_memory(algo="sha512")

	# geocode_OSM defaults to EPSG 4326 lat lon
	# st_crs can be used to set coordinate reference system on underlying sf objects
	# not sure if nztm is supported by sf

	lookupAddress <- memoise (
	function(address) {
	geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon

	if (is.null(geocode)) {
	log_trace('{address}')
	NA
	} else {

	log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}')
	geocode$coords
	}
	},
	cache = cf
	)


	lookupAddress("5 Gillies St,Kawakawa,New Zealand")
	# has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
	# drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand")
	# forget(lookupAddress)
	#geocode_OSM("5 Gillies St,Kawakawa,New Zealand")

	# it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out
	llr3 <-
	llr2 %>%
	# slice_head(n=20) %>%
	select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>%
	rowwise %>%
	dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>%
	dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"])

	# we really need to be doing this for licences that are currently active
	# that means date today is after `Date Licence is Valid From` and before `Date Licence Expires`

	llr3Active <-
	llr3 %>%
	# slice_head(n=20) %>%
	filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`)
	# down from 10719 obs to 9547 observations


	################################################################################

	# Plotting

	# first we run against the natural earth data which gives a simple global map with countries
	# then we'll go to an NZ map with regional boundaries

	library("rnaturalearth")
	library("rnaturalearthdata")

	world <- ne_countries(scale = "medium", returnclass = "sf")
	class(world)

	ggplot(data = world) +
	geom_sf() +
	coord_sf(expand=F)

	ggplot(data = world) +
	geom_sf() +
	label_graticule = "EW" +
	xlab("Longitude") + ylab("Latitude") +
	ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)"))

	# # example of colouring the earth map using an aesthetic based off population data
	# ggplot(data = world) +
	# geom_sf(aes(fill = pop_est)) +
	# scale_fill_viridis_c(option = "plasma", trans = "sqrt")

	# # plotting just nz out of the rnaturalearth data set
	# nz <- world[world$name == 'New Zealand',]
	#
	# ggplot(data = nz) +
	# geom_sf() +


	# # test by plotting one or two points
	# two ways to create tibbles on the fly - tribble is quite nice
	# tibble( lat = -35.11244, lon = 173.26267)
	#
	# tribble (
	# ~lat, ~lon,
	# -35.11244, 173.26267,
	# -35.73572, 174.33096,
	# )
	#
	#
	# ggplot(data = world) +
	# geom_sf() +
	# geom_point(data = tibble( lat = -35.11244, lon = 173.26267),
	# aes(x = lon, y = lat),
	# size = 2,
	# shape = 23,
	# fill = "darkred") +
	# coord_sf(
	# crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	# xlim = c(164, 179),
	# ylim = c(-48, -33),
	# expand = FALSE)

	ggplot(data = world) +
	geom_sf() +
	geom_point(data = llr3, aes(x = lon, y = lat), size = 1,
	shape = 23, fill = "darkred") +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	################################################################################

	# Time to bring in an NZ map with regions

	# We can use the annual boundaries published with population data collected during the census
	# With meshblocks it gets real tough without decent memory so let's use a large grain size
	# estimated resident population by Statistical Area 2
	# https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/

	# In the download dialog box you get a choice of geodatabase or shapefile for the download
	# In this example I'm using shapefile

	# unzip the shapefile to a directory and then load it like this
	# the zipped shape file doesn't make it fully clear here that it's actually SA2 regions
	# if you open it up in explorer and take a look at the txt or pdf files
	# they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2
	shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical")

	tidy_nzdf <- tidy(shp)

	# Pulling out the polygon id to match on when trying to merge with meshblock id
	shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID"))

	# Merge to get meshblock ids (again can take a couple mins)
	nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID")


	map <- ggplot() +
	geom_path(data = nz_df,
	aes(x = long, y = lat, group = group),
	colour = 'grey', fill = 'white', size = 0.2)

	print(map)

	populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) +
	geom_polygon(aes(fill=erp96), colour='gray', size=0.1) +
	theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) +
	scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") +
	guides(fill=guide_legend(title="Estimated resident population"))

	# plots out with some colour aesthetics for the regions
	populationMap


	# all very nice but now we want to sum up the number of liquor licenses sitting within each region
	# then, with the population being known we can generate a normalised liquor license measure
	# then we want to compare to
	# crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place
	# police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity

	# todo - filter out missing values for coords in llr3
	llr4 <-
	llr3 %>%
	# slice_head(n=20) %>%
	filter(!is.na(lat)) %>%
	filter(!is.na(lon)) %>%
	filter(between(lat, -48, -33 )) %>% # reasonableness clipping
	filter(between(lon, 164, 179 )) # reasonableness clipping


	licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326)
	lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326)

	ggplot() + geom_sf(data=licence_sf_points) # plot the licence locations

	# my first attempt at working with nz region data is via the spData package
	# library(spData)
	# licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum)
	#
	# licenceSumPlt <- ggplot(data = licenses)
	# licenceSumPlt
	#
	# plot(licenceSum)
	#
	# class(licence_sf_points)
	# head(licence_sf_points)
	# class(licenceSum)
	# head(licenceSum)
	#
	# class(nz)
	# head(nz)
	#
	# ggplot() + geom_sf(data=nz)
	# plot(nz)
	#
	# nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df
	#
	# m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0))
	# m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0))
	# pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2)))
	# set.seed(1985)
	# d = data.frame(matrix(runif(15), ncol = 3))
	# p = st_as_sf(x = d, coords = 1:2)
	# head(p)
	# plot(pol)
	# plot(p, add = TRUE)
	#
	# (p_ag1 = aggregate(p, pol, mean))
	#
	# this isn't working the way I thought.
	# how about this
	class(nz_df)
	# nz_df is just a dataframe - I want a sf class

	# I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf
	library(rmapshaper)

	# I've downloaded the SA2 dataset as a geopackage this time
	unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip")
	nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb")
	print(object.size(nz_full), units= "Kb") # 30784.2 Kb

	# make our memory issues go away... refer to line 435 for better examples

	# # we can use other native r packages for the feature simplification
	# library(rgeos)
	# # rgeos doens't work with sf objects, only sp, so you have to cast
	#
	# nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf()
	# print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm

	class(nz_full) # good - it's an sf dataframe

	# I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84
	st_crs(licence_sf_points) # WGS 84, EPSG:4326
	st_crs(nz_full) # NZGD2000, EPSG:2193

	(nz_full_WGS84 <- st_transform(nz_full, 4326))

	# Bit confused over aggregate but I think I have it figured now
	# I'm using 'length' as the function but it's just going to generate the same length for every field
	licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length)

	# alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec
	# where I've purposely got a field in licence_sf_points that just contains 1s
	(licenseRegionCount <- nz_full_WGS84 %>%
	st_join(licence_sf_points1) %>%
	group_by(SA22020_V1_00_NAME) %>%
	summarize(count = sum(rowone, na.rm = TRUE)))
	head(licenseRegionCount)


	# this is better I think
	# The count is an explicit field

	####################
	# I think this is somewhat working!
	ggplot() +
	geom_sf(data = licenseDensity, aes(fill=count)) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	# what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479
	(intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front

	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1]) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)


	licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length)



	###########################################
	nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco")

	head(nashp)
	class(nashp)

	# generate an sf dataframe
	(nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193))

	# transform to wgs84
	(nalicences_WGS84 <- st_transform(nalicences, 4326))

	# plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames
	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], colour="blue", size=2) +
	geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# the answer is to do it like this
	ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
	geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# this is quite slow to plot out but we can investigate what's going on
	# bench_time(expr)
	# bench_memory(expr)
	# bench::mark( some series of functions )

	benchMarkResults <- bench::mark( ggplot() +
	geom_sf(data = st_transform(nz_full, 4326)) +
	geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) +
	geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) +
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)
	)
	View(benchMarkResults)



	# the plot appears to show that all/most MoJ records have a matching NA record - and differences?
	# maybe for each item in the NA dataset look for a nearby matching MoJ record
	# and can I put a boundary region on this so I don't need to test every point to point distance?

	# possible approaches in
	# https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor
	# https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis

	# or we can just do a pair wise distance calc using st_distance

	# (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better?
	# this still calculates over the full product of points so we need a way to just get the nearest neighbour features

	# maybe just take a fraction of points to start with
	(sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better?

	# I just popped back to look at simplification again to see if I can save some memory on the polygon features
	# two examples of simplification: one using st_simplify; the other ms_simplify
	ggplot() +
	geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	ggplot() +
	geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices
	coord_sf(
	crs = 4326, #"EPSG:4326", or equivalently "WGS84"
	xlim = c(164, 179),
	ylim = c(-48, -33),
	expand = FALSE)

	# let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want
	p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16)
	# using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies

	# let's get the structure of the created list
	str(p2p)
	# List of two, $nn and $dist
	# I'm guessing $nn identifies the neighbour somehow

	# lets histogram the distribution of distances - sample first
	distanceSample <- unlist(list.sample(p2p$dist, 100))
	ggplot() + geom_histogram(aes(distanceSample))

	# may as well do the lot
	ggplot() + geom_histogram(aes(unlist(p2p$dist)))
	# so we have vast majority of nearest neighbours matching within just a few metres

	# do we have duplicated records?
	nalicences %>%
	group_by(ORG_ID) %>%
	filter(n() > 1 )
	# answer = no

	# filtering out the Not Applicable TYPE from nalicences
	nalicencesApplicable <- nalicences %>%
	filter(TYPE != "Not Applicable" ) %>%
	length()


	# next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures
	# n-grams work well but we have others like cosine/jacquard etc
	# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf
	# https://cran.r-project.org/web/packages/stringdist/stringdist.pdf



	# small aside - might try and check how apportionment works in R for two data sets we can do the exact same
	# thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates
	# https://koordinates.com/layer/3825-nz-police-area-boundaries/
	# looks like I can use sf::st_interpolate_aw or, more complete, the areal package
	PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori")

	nrow(shp) # features in source
	nrow(PoliceAreasShp) # features in target

	nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp))))
	(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) )

	sourceERP <- st_as_sf(shp)
	targetPoliceAreas <- st_as_sf(PoliceAreasShp)

	apportioned <- areal::aw_interpolate(targetPoliceAreas,
	tid = "AREA_ID",
	source = sourceERP,
	sid = "SA22020_V1",
	weight = "sum",
	output = "sf",
	extensive = c("erp18"))

	head(apportioned)