# code source: GIS Stack Exchange: https://gis.stackexchange.com/questions/266897/how-to-get-around-the-1000-objectids-limit-on-arcgis-server?newreg=e085133f1a9d475b99c094e1c497a2a5

import numpy as np
import pandas as pd
import geopandas as gpd
import urllib.parse
import requests
def query_arcgis_feature_server(url_feature_server=''):
    '''
    This function downloads all of the features available on a given ArcGIS
    feature server. The function is written to bypass the limitations imposed
    by the online service, such as only returning up to 1,000 or 2,000
    features at a time.

    Parameters
    ----------
    url_feature_server : string
        String containing the URL of the service API you want to query. It
        should end in a forward slash and look something like this:
        'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
        The EV Justice40 URL is:
        'https://services2.arcgis.com/spDVOHcj11Fk80s4/ArcGIS/rest/services/DAC6/FeatureServer/4/'

    Returns
    -------
    geodata_final : gpd.GeoDataFrame
        This is a GeoDataFrame that contains all of the features from the
        feature server. After calling this function, the `geodata_final`
        object can be used to store the data on disk in several different
        formats including, but not limited to, Shapefile (.shp), GeoJSON
        (.geojson), GeoPackage (.gpkg), or PostGIS. See
        https://geopandas.org/en/stable/docs/user_guide/io.html#writing-spatial-data
        for more details.
    '''
    if url_feature_server == '':
        geodata_final = gpd.GeoDataFrame()
        return geodata_final

    # Fixing the last character in case the URL provided didn't end in a
    # forward slash
    if url_feature_server[-1] != '/':
        url_feature_server = url_feature_server + '/'

    # Getting the layer definition. This contains important info such as the
    # name of the column used as feature_ids/object_ids, among other things.
    layer_def = requests.get(url_feature_server + '?f=pjson').json()

    # The `objectIdField` is the column name used for the
    # feature_ids/object_ids
    fid_colname = layer_def['objectIdField']

    # The `maxRecordCount` tells us the maximum number of records this REST
    # API service can return at once. The code below is written such that we
    # perform multiple calls to the API, each one short enough never to go
    # beyond this limit.
    record_count_max = layer_def['maxRecordCount']

    # Part of the URL that specifically requests only the object IDs
    url_query_get_ids = (f'query?f=geojson&returnIdsOnly=true'
                         f'&where={fid_colname}+is+not+null')
    url_comb = url_feature_server + url_query_get_ids

    # Getting all the object IDs
    service_request = requests.get(url_comb)
    all_objectids = np.sort(service_request.json()['properties']['objectIds'])

    # This variable will store all the parts of the multiple queries. These
    # parts will, at the end, be concatenated into one large GeoDataFrame.
    geodata_parts = []

    # This part of the query is fixed and never actually changes
    url_query_fixed = 'query?f=geojson&outFields=*&where='
    # Identifying the largest query size allowed per request. This will
    # dictate how many queries will need to be made. We start the search at
    # the max record count, but that sometimes generates errors - the query
    # might time out because it's too big. If the test query times out, we
    # shrink the query size until the test query goes through without
    # generating a time-out error.
    block_size = min(record_count_max, len(all_objectids))
    worked = False
    while not worked:
        # Moving the "cursors" to their appropriate locations
        id_start = all_objectids[0]
        id_end = all_objectids[block_size - 1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')
        url_query_variable = urllib.parse.quote(readable_query_string)
        url_comb = url_feature_server + url_query_fixed + url_query_variable
        url_get = requests.get(url_comb)
        if 'error' in url_get.json():
            block_size = int(block_size / 2) + 1
        else:
            geodata_part = gpd.read_file(url_get.text)
            geodata_parts.append(geodata_part.copy())
            worked = True
    # Performing the actual query to the API multiple times. This skips the
    # first few rows/features in the data because those rows were already
    # captured in the query performed in the code chunk above.
    for i in range(block_size, len(all_objectids), block_size):
        # Moving the "cursors" to their appropriate locations and finding the
        # limits of each block
        sub_list = all_objectids[i:i + block_size]
        id_start = sub_list[0]
        id_end = sub_list[-1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')

        # Encoding from readable text to URL
        url_query_variable = urllib.parse.quote(readable_query_string)

        # Constructing the full request URL
        url_comb = url_feature_server + url_query_fixed + url_query_variable

        # Actually performing the query and storing its results in a
        # GeoDataFrame
        geodata_part = gpd.read_file(url_comb, driver='GeoJSON')

        # Appending the result to `geodata_parts`
        if geodata_part.shape[0] > 0:
            geodata_parts.append(geodata_part)

    # Concatenating all of the query parts into one large GeoDataFrame
    geodata_final = (pd.concat(geodata_parts, ignore_index=True)
                     .sort_values(by=fid_colname)
                     .reset_index(drop=True))
    # Checking if any object ID is missing
    ids_queried = set(geodata_final[fid_colname])
    for this_id in all_objectids:
        if this_id not in ids_queried:
            print('WARNING! The following ObjectID is missing from the final '
                  f'GeoDataFrame: ObjectID={this_id}')

    # Checking if any object ID is included twice
    geodata_temp = geodata_final[[fid_colname]].copy()
    geodata_temp['temp'] = 1
    geodata_temp = (geodata_temp
                    .groupby(fid_colname)
                    .agg({'temp': 'sum'})
                    .reset_index())
    geodata_temp = geodata_temp.loc[geodata_temp['temp'] > 1].copy()
    for i, this_id in enumerate(geodata_temp[fid_colname].values):
        n_times = geodata_temp['temp'].values[i]
        print('WARNING! The following ObjectID is included multiple times in '
              f'the final GeoDataFrame: ObjectID={this_id}\tOccurrences={n_times}')
    return geodata_final
# To download all of the data on a feature server, just call the query_arcgis_feature_server function defined above. It takes care of a couple of pitfalls, such as the maximum feature limit and requests that are so big they time out.
'''
Here's an example of how to download the data and save it as a shapefile on your disk:

url = 'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
usa_counties_gdf = query_arcgis_feature_server(url)
usa_counties_gdf.to_file('usa_counties.shp')
'''
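# A minimal sketch of the other output formats mentioned in the docstring
# (GeoJSON and GeoPackage), using the EV Justice40 URL listed there. The file
# names and the 'dac' layer name below are only illustrative choices:
'''
justice40_url = 'https://services2.arcgis.com/spDVOHcj11Fk80s4/ArcGIS/rest/services/DAC6/FeatureServer/4/'
justice40_gdf = query_arcgis_feature_server(justice40_url)

# GeoJSON: a single human-readable text file
justice40_gdf.to_file('justice40_dac.geojson', driver='GeoJSON')

# GeoPackage: a single .gpkg file that can hold multiple named layers
justice40_gdf.to_file('justice40_dac.gpkg', layer='dac', driver='GPKG')
'''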