Last active
May 7, 2021 04:04
-
-
Save bohdanszymanik/ebae4fd663be37f9cbaa0dd79e00ee2e to your computer and use it in GitHub Desktop.
nominatum openstreetmap address lookup for the liquor licencing register
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################## | |
# Import liquor licencing data, geocode and plot | |
# Plot on a NZ map | |
# Show a regional view - some demographics eg population density at SA2 | |
# Import the public NZ Police aggregated crime data | |
# Look at crime harm regional cf liquor outlet distribution | |
# | |
library(readxl) | |
library(rgdal) | |
library(broom) | |
library(dplyr) | |
library(ggplot2) | |
library(ggmap) | |
library(memoise) | |
library(naniar) | |
library(stringr) | |
library(purrr) | |
library(tmaptools) | |
library(logger) | |
library(cowplot) | |
library(ggrepel) | |
library(sf) | |
library(nngeo) | |
library(rlist) | |
library(bench) # https://cran.r-project.org/web/packages/bench/bench.pdf | |
library(pryr) | |
library(stringdist) | |
library(ngram) | |
library(areal) | |
setwd("h:\\Documents\\wd\\erp") | |
################################################################################ | |
# Bringing in liquour licensing data | |
# | |
# read the liquor licensing licences register in two steps | |
# first get the column names | |
cNames <- read_excel("February-2021-Licences.xlsx", skip=1, n_max=1) %>% names() | |
# then the raw data | |
llr <- read_excel("February-2021-Licences.xlsx", skip=2, col_names=cNames) | |
# getting a comma separated address - bit pointless having its own function? | |
#createAddress <- function(`Premises Street`, `Premises Suburb`, `Premises City`, ...) { | |
# paste(`Premises Street`, `Premises Suburb`, `Premises City`, sep=',') | |
#} | |
# eg | |
#llr %>% rowwise() %>% pmap_chr(createAddress) | |
llr2 <- llr %>% dplyr::mutate(address = paste(`Premises Street`, `Premises Suburb`, `Premises City`, c("New Zealand"), sep=',') ) | |
llr2 %>% slice_sample(n=50) | |
llr2 %>% slice_head(n=5) | |
geocode_OSM("Klondike Ale House, 5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org") | |
geocode_OSM("5 Gillies St, Kawakawa, new zealand", server = "https://nominatim.openstreetmap.org") | |
#geocode_OSM("5 Gillies St, Kawakawa, new zealand") | |
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords | |
#typeof(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords) | |
#length(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords) | |
#str(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords) | |
#is.vector(geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords) | |
#geocode_OSM("5 Gillies St, Kawakawa, new zealand")$coords["x"] | |
#log_layout(layout_glue_colors) # warning ... unreadable in the log with default text editors | |
log_formatter(formatter_glue) | |
log_layout(layout_simple) | |
log_threshold(TRACE) | |
#t <- tempfile() | |
t <- "mylogger.log" | |
log_appender(appender_file(t)) | |
#readLines(t) | |
#unlink(t) | |
cf <- cache_filesystem(".rcache", algo = "xxhash64", compress = FALSE) | |
# cache_memory(algo="sha512") | |
# geocode_OSM defaults to EPSG 4326 lat lon | |
# st_crs can be used to set coordinate reference system on underlying sf objects | |
# not sure if nztm is supported by sf | |
lookupAddress <- memoise ( | |
function(address) { | |
geocode <- geocode_OSM(address) # returns either the address if not found, or a vector with lat lon | |
if (is.null(geocode)) { | |
log_trace('{address}') | |
NA | |
} else { | |
log_trace('{address}\t{geocode$coords["x"]}\t{geocode$coords["y"]}') | |
geocode$coords | |
} | |
}, | |
cache = cf | |
) | |
lookupAddress("5 Gillies St,Kawakawa,New Zealand") | |
# has_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand") | |
# drop_cache(lookupAddress)("5 Gillies St,Kawakawa,New Zealand") | |
# forget(lookupAddress) | |
#geocode_OSM("5 Gillies St,Kawakawa,New Zealand") | |
# it doesn't like the presence of a # char in a column in the 'address'Premises Street' data so we need to filter that out | |
llr3 <- | |
llr2 %>% | |
# slice_head(n=20) %>% | |
select(LicenceType, `Date Licence is Valid From`, `Date Licence Expires`, address) %>% | |
rowwise %>% | |
dplyr::mutate(addressFiltered = str_replace(address, "#", "")) %>% | |
dplyr::mutate(lat = lookupAddress(address)["y"], lon = lookupAddress(address)["x"]) | |
# we really need to be doing this for licences that are currently active | |
# that means date today is after `Date Licence is Valid From` and before `Date Licence Expires` | |
llr3Active <- | |
llr3 %>% | |
# slice_head(n=20) %>% | |
filter(Sys.time() >= `Date Licence is Valid From` & Sys.time() < `Date Licence Expires`) | |
# down from 10719 obs to 9547 observations | |
################################################################################ | |
# Plotting | |
# first we run against the natural earth data which gives a simple global map with countries | |
# then we'll go to an NZ map with regional boundaries | |
library("rnaturalearth") | |
library("rnaturalearthdata") | |
world <- ne_countries(scale = "medium", returnclass = "sf") | |
class(world) | |
ggplot(data = world) + | |
geom_sf() + | |
coord_sf(expand=F) | |
ggplot(data = world) + | |
geom_sf() + | |
label_graticule = "EW" + | |
xlab("Longitude") + ylab("Latitude") + | |
ggtitle("World map", subtitle = paste0("(", length(unique(world$NAME)), " countries)")) | |
# # example of colouring the earth map using an aesthetic based off population data | |
# ggplot(data = world) + | |
# geom_sf(aes(fill = pop_est)) + | |
# scale_fill_viridis_c(option = "plasma", trans = "sqrt") | |
# # plotting just nz out of the rnaturalearth data set | |
# nz <- world[world$name == 'New Zealand',] | |
# | |
# ggplot(data = nz) + | |
# geom_sf() + | |
# # test by plotting one or two points | |
# two ways to create tibbles on the fly - tribble is quite nice | |
# tibble( lat = -35.11244, lon = 173.26267) | |
# | |
# tribble ( | |
# ~lat, ~lon, | |
# -35.11244, 173.26267, | |
# -35.73572, 174.33096, | |
# ) | |
# | |
# | |
# ggplot(data = world) + | |
# geom_sf() + | |
# geom_point(data = tibble( lat = -35.11244, lon = 173.26267), | |
# aes(x = lon, y = lat), | |
# size = 2, | |
# shape = 23, | |
# fill = "darkred") + | |
# coord_sf( | |
# crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
# xlim = c(164, 179), | |
# ylim = c(-48, -33), | |
# expand = FALSE) | |
ggplot(data = world) + | |
geom_sf() + | |
geom_point(data = llr3, aes(x = lon, y = lat), size = 1, | |
shape = 23, fill = "darkred") + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
################################################################################ | |
# Time to bring in an NZ map with regions | |
# We can use the annual boundaries published with population data collected during the census | |
# With meshblocks it gets real tough without decent memory so let's use a large grain size | |
# estimated resident population by Statistical Area 2 | |
# https://datafinder.stats.govt.nz/layer/105008-estimated-resident-population-at-30-june-2018-by-statistical-area-2/data/ | |
# In the download dialog box you get a choice of geodatabase or shapefile for the download | |
# In this example I'm using shapefile | |
# unzip the shapefile to a directory and then load it like this | |
# the zipped shape file doesn't make it fully clear here that it's actually SA2 regions | |
# if you open it up in explorer and take a look at the txt or pdf files | |
# they explain what's contained... ERP, sex ratio, age groups, ethnicity, median age by sa2 | |
shp <- readOGR(dsn ="h:\\Downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-SHP", layer="estimated-resident-population-at-30-june-2018-by-statistical") | |
tidy_nzdf <- tidy(shp) | |
# Pulling out the polygon id to match on when trying to merge with meshblock id | |
shp$polyID <- sapply(slot(shp, "polygons"), function(x) slot(x, "ID")) | |
# Merge to get meshblock ids (again can take a couple mins) | |
nz_df <- merge(tidy_nzdf, shp, by.x = "id", by.y="polyID") | |
map <- ggplot() + | |
geom_path(data = nz_df, | |
aes(x = long, y = lat, group = group), | |
colour = 'grey', fill = 'white', size = 0.2) | |
print(map) | |
populationMap <- ggplot(data=nz_df, aes(x=long, y=lat, group=group)) + | |
geom_polygon(aes(fill=erp96), colour='gray', size=0.1) + | |
theme(line = element_blank(), axis.text=element_blank(),axis.title=element_blank(), panel.background = element_blank()) + | |
scale_fill_gradient2("#4d4dff", mid = "white", high = "#ff4d4d") + | |
guides(fill=guide_legend(title="Estimated resident population")) | |
# plots out with some colour aesthetics for the regions | |
populationMap | |
# all very nice but now we want to sum up the number of liquor licenses sitting within each region | |
# then, with the population being known we can generate a normalised liquor license measure | |
# then we want to compare to | |
# crime harm measure: https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz/victimisation-time-and-place | |
# police demand measure: https://www.police.govt.nz/about-us/statistics-and-publications/data-and-statistics/demand-and-activity | |
# todo - filter out missing values for coords in llr3 | |
llr4 <- | |
llr3 %>% | |
# slice_head(n=20) %>% | |
filter(!is.na(lat)) %>% | |
filter(!is.na(lon)) %>% | |
filter(between(lat, -48, -33 )) %>% # reasonableness clipping | |
filter(between(lon, 164, 179 )) # reasonableness clipping | |
licence_sf_points <- st_as_sf( llr4 %>% select(lat,lon), coords = c("lon","lat"), crs = 4326) | |
lsp <- st_as_sf( llr4 , coords = c("lon","lat"), crs = 4326) | |
ggplot() + geom_sf(data=licence_sf_points) # plot the licence locations | |
# my first attempt at working with nz region data is via the spData package | |
# library(spData) | |
# licenceSum <- aggregate(x = licence_sf_points, by = nz, FUN = sum) | |
# | |
# licenceSumPlt <- ggplot(data = licenses) | |
# licenceSumPlt | |
# | |
# plot(licenceSum) | |
# | |
# class(licence_sf_points) | |
# head(licence_sf_points) | |
# class(licenceSum) | |
# head(licenceSum) | |
# | |
# class(nz) | |
# head(nz) | |
# | |
# ggplot() + geom_sf(data=nz) | |
# plot(nz) | |
# | |
# nz %>% filter(Name == "Canterbury") # that's unexpected, no name in the df | |
# | |
# m1 = cbind(c(0, 0, 1, 0), c(0, 1, 1, 0)) | |
# m2 = cbind(c(0, 1, 1, 0), c(0, 0, 1, 0)) | |
# pol = st_sfc(st_polygon(list(m1)), st_polygon(list(m2))) | |
# set.seed(1985) | |
# d = data.frame(matrix(runif(15), ncol = 3)) | |
# p = st_as_sf(x = d, coords = 1:2) | |
# head(p) | |
# plot(pol) | |
# plot(p, add = TRUE) | |
# | |
# (p_ag1 = aggregate(p, pol, mean)) | |
# | |
# this isn't working the way I thought. | |
# how about this | |
class(nz_df) | |
# nz_df is just a dataframe - I want a sf class | |
# I'm following approach of what's shown in the spData documentation for the nz data set: https://cran.r-project.org/web/packages/spData/spData.pdf | |
library(rmapshaper) | |
# I've downloaded the SA2 dataset as a geopackage this time | |
unzip("h:\\downloads\\statsnzestimated-resident-population-at-30-june-2018-by-statistical-FGDB.zip") | |
nz_full = st_read("estimated-resident-population-at-30-june-2018-by-statistical.gdb") | |
print(object.size(nz_full), units= "Kb") # 30784.2 Kb | |
# make our memory issues go away... refer to line 435 for better examples | |
# # we can use other native r packages for the feature simplification | |
# library(rgeos) | |
# # rgeos doens't work with sf objects, only sp, so you have to cast | |
# | |
# nz_simple <- gSimplify(as(nz_full, 'Spatial'), tol = 1, topologyPreserve = T) %>% st_as_sf() | |
# print(object.size(nz_simple), units = "Kb") # 30782.6 Kb - no big change :( , maybe don't worry about this atm | |
class(nz_full) # good - it's an sf dataframe | |
# I need to get the crs the same on each sf data structure - maybe consolidate on WGS 84 | |
st_crs(licence_sf_points) # WGS 84, EPSG:4326 | |
st_crs(nz_full) # NZGD2000, EPSG:2193 | |
(nz_full_WGS84 <- st_transform(nz_full, 4326)) | |
# Bit confused over aggregate but I think I have it figured now | |
# I'm using 'length' as the function but it's just going to generate the same length for every field | |
licenceSum <- aggregate(x = licence_sf_points, by = nz_full_WGS84, FUN = length) | |
# alternative approach refer https://geocompr.robinlovelace.net/spatial-operations.html#spatial-vec | |
# where I've purposely got a field in licence_sf_points that just contains 1s | |
(licenseRegionCount <- nz_full_WGS84 %>% | |
st_join(licence_sf_points1) %>% | |
group_by(SA22020_V1_00_NAME) %>% | |
summarize(count = sum(rowone, na.rm = TRUE))) | |
head(licenseRegionCount) | |
# this is better I think | |
# The count is an explicit field | |
#################### | |
# I think this is somewhat working! | |
ggplot() + | |
geom_sf(data = licenseDensity, aes(fill=count)) + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
# what about intersection as in this stackexchange answer https://gis.stackexchange.com/a/270479 | |
(intersection <- st_intersection(x = st_transform(nz_full, 4326), y = licence_sf_points)) # nice, this has counts up front | |
ggplot() + | |
geom_sf(data = st_transform(nz_full, 4326)) + | |
geom_sf(data = intersection[1]) + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
licenceSum1 <- aggregate(x = lsp, by = st_transform(nz_full, 4326), FUN = length) | |
########################################### | |
nashp <- readOGR(dsn ="H:\\Documents\\wd\\erp\\alcoshp", layer="NA_Alco") | |
head(nashp) | |
class(nashp) | |
# generate an sf dataframe | |
(nalicences <- st_as_sf( nashp, coords = c("EASTING","NORTHING"), crs = 2193)) | |
# transform to wgs84 | |
(nalicences_WGS84 <- st_transform(nalicences, 4326)) | |
# plotting like this ggplot wont know how to create a legend refer https://stackoverflow.com/questions/38907371/ggplot-legends-when-plot-is-built-from-two-data-frames | |
ggplot() + | |
geom_sf(data = st_transform(nz_full, 4326)) + | |
geom_sf(data = intersection[1], colour="blue", size=2) + | |
geom_sf(data = nalicences_WGS84[1], colour="red", size=0.5, alpha=0.5) + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
# the answer is to do it like this | |
ggplot() + | |
geom_sf(data = st_transform(nz_full, 4326)) + | |
geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) + | |
geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
# this is quite slow to plot out but we can investigate what's going on | |
# bench_time(expr) | |
# bench_memory(expr) | |
# bench::mark( some series of functions ) | |
benchMarkResults <- bench::mark( ggplot() + | |
geom_sf(data = st_transform(nz_full, 4326)) + | |
geom_sf(data = intersection[1], aes(color="MoJ LLR"), size=2) + | |
geom_sf(data = nalicences_WGS84[1], aes(color="NA LLR"), size=0.5, alpha=0.5) + | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
) | |
View(benchMarkResults) | |
# the plot appears to show that all/most MoJ records have a matching NA record - and differences? | |
# maybe for each item in the NA dataset look for a nearby matching MoJ record | |
# and can I put a boundary region on this so I don't need to test every point to point distance? | |
# possible approaches in | |
# https://stackoverflow.com/questions/22121742/calculate-the-distance-between-two-points-of-two-datasets-nearest-neighbor | |
# https://gis.stackexchange.com/questions/127244/is-there-an-r-equivalent-of-near-in-esri-arcgis | |
# or we can just do a pair wise distance calc using st_distance | |
# (sf::st_distance(intersection, nalicences_WGS84, by_element=F, tolerance=100)) # this is way too slow - what can we do to make this better? | |
# this still calculates over the full product of points so we need a way to just get the nearest neighbour features | |
# maybe just take a fraction of points to start with | |
(sf::st_distance(intersection %>% slice_sample(n=100), nalicences_WGS84 %>% slice_sample(n=100), by_element=F, tolerance=100)) # this is slow - what can we do to make this better? | |
# I just popped back to look at simplification again to see if I can save some memory on the polygon features | |
# two examples of simplification: one using st_simplify; the other ms_simplify | |
ggplot() + | |
geom_sf(data = st_transform(st_simplify(nz_full, dTolerance = 4000), 4326)) + # st_simplify needs to work on a project crs not lat lon where a unit of longitude changes with latitude | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
ggplot() + | |
geom_sf(data = st_transform(rmapshaper::ms_simplify(nz_full, keep = 0.001, keep_shapes = T), 4326)) + # keeps 0.1% of vertices | |
coord_sf( | |
crs = 4326, #"EPSG:4326", or equivalently "WGS84" | |
xlim = c(164, 179), | |
ylim = c(-48, -33), | |
expand = FALSE) | |
# let's try nngeo::st_nn nearest neighbour search for Simple Features - sounds like what we want | |
p2p <- st_nn(x = intersection, y = nalicences_WGS84, k = 1, maxdist = 1000, returnDist = T, progress = T) #, parallel = 16) | |
# using parallel option with all 16 cores destabilises the system and when I try the line 'ggplot() + geom_histogram(aes(distanceSample))' - it dies | |
# let's get the structure of the created list | |
str(p2p) | |
# List of two, $nn and $dist | |
# I'm guessing $nn identifies the neighbour somehow | |
# lets histogram the distribution of distances - sample first | |
distanceSample <- unlist(list.sample(p2p$dist, 100)) | |
ggplot() + geom_histogram(aes(distanceSample)) | |
# may as well do the lot | |
ggplot() + geom_histogram(aes(unlist(p2p$dist))) | |
# so we have vast majority of nearest neighbours matching within just a few metres | |
# do we have duplicated records? | |
nalicences %>% | |
group_by(ORG_ID) %>% | |
filter(n() > 1 ) | |
# answer = no | |
# filtering out the Not Applicable TYPE from nalicences | |
nalicencesApplicable <- nalicences %>% | |
filter(TYPE != "Not Applicable" ) %>% | |
length() | |
# next problem, we've matched by geocoordinate - now let's try and match by address using string similarity measures | |
# n-grams work well but we have others like cosine/jacquard etc | |
# https://cran.r-project.org/web/packages/ngram/vignettes/ngram-guide.pdf | |
# https://cran.r-project.org/web/packages/stringdist/stringdist.pdf | |
# small aside - might try and check how apportionment works in R for two data sets we can do the exact same | |
# thing as I've done in ArcGIS - Census Estimated Resident Population apportioned to Police Areas - police boundaries are available on koordinates | |
# https://koordinates.com/layer/3825-nz-police-area-boundaries/ | |
# looks like I can use sf::st_interpolate_aw or, more complete, the areal package | |
PoliceAreasShp <- readOGR(dsn ="h:\\Documents\\wd\\erp\\Police_Areas_Clipped__2021_04", layer="police_area_clipped_with_maori") | |
nrow(shp) # features in source | |
nrow(PoliceAreasShp) # features in target | |
nrow(suppressWarnings(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp)))) | |
(sf::st_intersection(st_as_sf(shp), st_as_sf(PoliceAreasShp) ) ) | |
sourceERP <- st_as_sf(shp) | |
targetPoliceAreas <- st_as_sf(PoliceAreasShp) | |
apportioned <- areal::aw_interpolate(targetPoliceAreas, | |
tid = "AREA_ID", | |
source = sourceERP, | |
sid = "SA22020_V1", | |
weight = "sum", | |
output = "sf", | |
extensive = c("erp18")) | |
head(apportioned) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment