NSIDC download
#!/usr/bin/env python
# ----------------------------------------------------------------------------
# NSIDC Data Download Script
#
# Copyright (c) 2021 Regents of the University of Colorado
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# Tested in Python 2.7 and Python 3.4, 3.6, 3.7
#
# To run the script at a Linux, macOS, or Cygwin command-line terminal:
#   $ python nsidc-data-download.py
#
# On Windows, open Start menu -> Run and type cmd. Then type:
#   python nsidc-data-download.py
#
# The script will first search Earthdata for all matching files.
# You will then be prompted for your Earthdata username/password
# and the script will download the matching files.
#
# If you wish, you may store your Earthdata username/password in a .netrc
# file in your $HOME directory and the script will automatically attempt to
# read this file. The .netrc file should have the following format:
#   machine urs.earthdata.nasa.gov login myusername password mypassword
# where 'myusername' and 'mypassword' are your Earthdata credentials.
#
from __future__ import print_function
import base64
import getopt
import itertools
import json
import math
import netrc
import os.path
import ssl
import sys
import time
from getpass import getpass

try:
    from urllib.parse import urlparse
    from urllib.request import urlopen, Request, build_opener, HTTPCookieProcessor
    from urllib.error import HTTPError, URLError
except ImportError:
    from urlparse import urlparse
    from urllib2 import urlopen, Request, HTTPError, URLError, build_opener, HTTPCookieProcessor
short_name = 'IDBMG4'
version = '4'
time_start = '1993-01-01T00:00:00Z'
time_end = '2020-10-01T23:59:59Z'
bounding_box = ''
polygon = ''
filename_filter = ''
url_list = []

CMR_URL = 'https://cmr.earthdata.nasa.gov'
URS_URL = 'https://urs.earthdata.nasa.gov'
CMR_PAGE_SIZE = 2000
CMR_FILE_URL = ('{0}/search/granules.json?provider=NSIDC_ECS'
                '&sort_key[]=start_date&sort_key[]=producer_granule_id'
                '&scroll=true&page_size={1}'.format(CMR_URL, CMR_PAGE_SIZE))
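# The URL above asks CMR for granule metadata as JSON, sorted by start date
# and producer granule id, using a scrolling session so that result sets
# larger than one page (CMR_PAGE_SIZE granules) can be fetched in
# successive requests; see cmr_search() below.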
def get_username():
    username = ''

    # For Python 2/3 compatibility:
    try:
        do_input = raw_input  # noqa
    except NameError:
        do_input = input

    while not username:
        username = do_input('Earthdata username: ')
    return username


def get_password():
    password = ''
    while not password:
        password = getpass('password: ')
    return password


def get_credentials(url):
    """Get user credentials from .netrc or prompt for input."""
    credentials = None
    errprefix = ''
    try:
        info = netrc.netrc()
        username, account, password = info.authenticators(urlparse(URS_URL).hostname)
        errprefix = 'netrc error: '
    except Exception as e:
        if 'No such file' not in str(e):
            print('netrc error: {0}'.format(str(e)))
        username = None
        password = None

    while not credentials:
        if not username:
            username = get_username()
            password = get_password()
        credentials = '{0}:{1}'.format(username, password)
        credentials = base64.b64encode(credentials.encode('ascii')).decode('ascii')

        if url:
            try:
                req = Request(url)
                req.add_header('Authorization', 'Basic {0}'.format(credentials))
                opener = build_opener(HTTPCookieProcessor())
                opener.open(req)
            except HTTPError:
                print(errprefix + 'Incorrect username or password')
                errprefix = ''
                credentials = None
                username = None
                password = None

    return credentials
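# CMR matches the version string literally, so build_version_query_params
# below requests every zero-padded form of the version. For example,
# version '4' expands to:
#   &version=004&version=04&version=4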
def build_version_query_params(version):
    desired_pad_length = 3
    if len(version) > desired_pad_length:
        print('Version string too long: "{0}"'.format(version))
        quit()

    version = str(int(version))  # Strip off any leading zeros
    query_params = ''

    while len(version) <= desired_pad_length:
        padded_version = version.zfill(desired_pad_length)
        query_params += '&version={0}'.format(padded_version)
        desired_pad_length -= 1
    return query_params
def filter_add_wildcards(filter):
    if not filter.startswith('*'):
        filter = '*' + filter
    if not filter.endswith('*'):
        filter = filter + '*'
    return filter


def build_filename_filter(filename_filter):
    filters = filename_filter.split(',')
    result = '&options[producer_granule_id][pattern]=true'
    for filter in filters:
        result += '&producer_granule_id[]=' + filter_add_wildcards(filter)
    return result
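# For example, build_filename_filter('foo,bar') returns (one string,
# split here for readability):
#   &options[producer_granule_id][pattern]=true
#   &producer_granule_id[]=*foo*&producer_granule_id[]=*bar*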
def build_cmr_query_url(short_name, version, time_start, time_end,
                        bounding_box=None, polygon=None,
                        filename_filter=None):
    params = '&short_name={0}'.format(short_name)
    params += build_version_query_params(version)
    params += '&temporal[]={0},{1}'.format(time_start, time_end)
    if polygon:
        params += '&polygon={0}'.format(polygon)
    elif bounding_box:
        params += '&bounding_box={0}'.format(bounding_box)
    if filename_filter:
        params += build_filename_filter(filename_filter)
    return CMR_FILE_URL + params
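# With the defaults set at the top of this script (IDBMG4 v4, no spatial or
# filename filter), the parameters appended to CMR_FILE_URL are:
#   &short_name=IDBMG4&version=004&version=04&version=4
#   &temporal[]=1993-01-01T00:00:00Z,2020-10-01T23:59:59Z
# Note that a polygon, if given, takes precedence over a bounding box.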
def get_speed(time_elapsed, chunk_size):
    """Return a human-readable transfer rate, e.g. '1.2MB/s' (1000-based units)."""
    if time_elapsed <= 0:
        return ''
    speed = chunk_size / time_elapsed
    if speed < 1:
        speed = 1  # avoid log(0) and negative unit indices for sub-1 B/s rates
    size_name = ('', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    i = int(math.floor(math.log(speed, 1000)))
    p = math.pow(1000, i)
    return '{0:.1f}{1}B/s'.format(speed / p, size_name[i])
def output_progress(count, total, status='', bar_len=60):
    """Print a simple in-place progress bar."""
    if total <= 0:
        return
    fraction = min(max(count / float(total), 0), 1)
    filled_len = int(round(bar_len * fraction))
    percents = int(round(100.0 * fraction))
    bar = '=' * filled_len + ' ' * (bar_len - filled_len)
    fmt = ' [{0}] {1:3d}% {2} '.format(bar, percents, status)
    print('\b' * (len(fmt) + 4), end='')  # backspace over the previous bar so this one overwrites it
    sys.stdout.write(fmt)
    sys.stdout.flush()
def cmr_read_in_chunks(file_object, chunk_size=1024 * 1024):
    """Read a file in chunks using a generator. Default chunk size: 1 MB."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data
def cmr_download(urls, force=False, quiet=False):
    """Download files from list of urls."""
    if not urls:
        return

    url_count = len(urls)
    if not quiet:
        print('Downloading {0} files...'.format(url_count))
    credentials = None

    for index, url in enumerate(urls, start=1):
        # Credentials are requested once and reused for every file
        if not credentials and urlparse(url).scheme == 'https':
            credentials = get_credentials(url)

        filename = url.split('/')[-1]
        if not quiet:
            print('{0}/{1}: {2}'.format(str(index).zfill(len(str(url_count))),
                                        url_count, filename))

        try:
            req = Request(url)
            if credentials:
                req.add_header('Authorization', 'Basic {0}'.format(credentials))
            opener = build_opener(HTTPCookieProcessor())
            response = opener.open(req)
            length = int(response.headers['content-length'])
            try:
                # Skip files already downloaded in full, unless --force is given
                if not force and length == os.path.getsize(filename):
                    if not quiet:
                        print(' File exists, skipping')
                    continue
            except OSError:
                pass
            count = 0
            chunk_size = min(max(length, 1), 1024 * 1024)
            max_chunks = int(math.ceil(length / float(chunk_size)))  # float division for Python 2
            time_initial = time.time()
            # Stream the response to disk in chunks, updating the progress bar
            with open(filename, 'wb') as out_file:
                for data in cmr_read_in_chunks(response, chunk_size=chunk_size):
                    out_file.write(data)
                    if not quiet:
                        count = count + 1
                        time_elapsed = time.time() - time_initial
                        download_speed = get_speed(time_elapsed, count * chunk_size)
                        output_progress(count, max_chunks, status=download_speed)
            if not quiet:
                print()
        except HTTPError as e:
            print('HTTP error {0}, {1}'.format(e.code, e.reason))
        except URLError as e:
            print('URL error: {0}'.format(e.reason))
        except IOError:
            raise
def cmr_filter_urls(search_results):
    """Select only the desired data files from CMR response."""
    if 'feed' not in search_results or 'entry' not in search_results['feed']:
        return []

    entries = [e['links']
               for e in search_results['feed']['entry']
               if 'links' in e]
    # Flatten "entries" to a simple list of links
    links = list(itertools.chain(*entries))

    urls = []
    unique_filenames = set()
    for link in links:
        if 'href' not in link:
            # Exclude links with nothing to download
            continue
        if 'inherited' in link and link['inherited'] is True:
            # Exclude links inherited from the collection-level metadata;
            # they are not specific to this granule
            continue
        if 'rel' in link and 'data#' not in link['rel']:
            # Exclude links which are not classified by CMR as "data" or "metadata"
            continue

        if 'title' in link and 'opendap' in link['title'].lower():
            # Exclude OPeNDAP links--they are responsible for many duplicates
            # This is a hack; when the metadata is updated to properly identify
            # non-datapool links, we should be able to do this in a non-hack way
            continue

        filename = link['href'].split('/')[-1]
        if filename in unique_filenames:
            # Exclude links with duplicate filenames (they would overwrite)
            continue
        unique_filenames.add(filename)

        urls.append(link['href'])

    return urls
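# Illustrative shape of the CMR JSON consumed above (fields as read by the
# code; the href value is hypothetical):
#   {"feed": {"entry": [{"links": [
#       {"href": "https://example.nsidc.org/path/granule.nc",
#        "rel": "http://esipfed.org/ns/fedsearch/1.1/data#",
#        "title": "Download granule.nc"}]}]}}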
def cmr_search(short_name, version, time_start, time_end,
               bounding_box='', polygon='', filename_filter='', quiet=False):
    """Perform a scrolling CMR query for files matching input criteria."""
    cmr_query_url = build_cmr_query_url(short_name=short_name, version=version,
                                        time_start=time_start, time_end=time_end,
                                        bounding_box=bounding_box,
                                        polygon=polygon, filename_filter=filename_filter)
    if not quiet:
        print('Querying for data:\n\t{0}\n'.format(cmr_query_url))

    cmr_scroll_id = None
    ctx = ssl.create_default_context()
    # NOTE: SSL certificate verification is disabled for the CMR query
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    urls = []
    hits = 0
    while True:
        req = Request(cmr_query_url)
        # Reuse the server-side scroll cursor on every request after the first
        if cmr_scroll_id:
            req.add_header('cmr-scroll-id', cmr_scroll_id)
        response = urlopen(req, context=ctx)
        if not cmr_scroll_id:
            # Python 2 and 3 have different case for the http headers
            headers = {k.lower(): v for k, v in dict(response.info()).items()}
            cmr_scroll_id = headers['cmr-scroll-id']
            hits = int(headers['cmr-hits'])
            if not quiet:
                if hits > 0:
                    print('Found {0} matches.'.format(hits))
                else:
                    print('Found no matches.')
        search_page = response.read()
        search_page = json.loads(search_page.decode('utf-8'))
        url_scroll_results = cmr_filter_urls(search_page)
        if not url_scroll_results:
            break
        if not quiet and hits > CMR_PAGE_SIZE:
            print('.', end='')
            sys.stdout.flush()
        urls += url_scroll_results
    if not quiet and hits > CMR_PAGE_SIZE:
        print()

    return urls
def main(argv=None):
    global short_name, version, time_start, time_end, bounding_box, \
        polygon, filename_filter, url_list

    if argv is None:
        argv = sys.argv[1:]

    force = False
    quiet = False
    usage = 'usage: nsidc-download_***.py [--help, -h] [--force, -f] [--quiet, -q]'

    try:
        opts, args = getopt.getopt(argv, 'hfq', ['help', 'force', 'quiet'])
        for opt, _arg in opts:
            if opt in ('-f', '--force'):
                force = True
            elif opt in ('-q', '--quiet'):
                quiet = True
            elif opt in ('-h', '--help'):
                print(usage)
                sys.exit(0)
    except getopt.GetoptError as e:
        print(e.args[0])
        print(usage)
        sys.exit(1)

    # Supply some default search parameters, just for testing purposes.
    # These are only used if the parameters aren't filled in up above,
    # i.e. if short_name still contains the literal placeholder text 'short_name'.
    if 'short_name' in short_name:
        short_name = 'ATL06'
        version = '003'
        time_start = '2018-10-14T00:00:00Z'
        time_end = '2021-01-08T21:48:13Z'
        bounding_box = ''
        polygon = ''
        filename_filter = '*ATL06_2020111121*'
        url_list = []

    try:
        if not url_list:
            url_list = cmr_search(short_name, version, time_start, time_end,
                                  bounding_box=bounding_box, polygon=polygon,
                                  filename_filter=filename_filter, quiet=quiet)
        cmr_download(url_list, force=force, quiet=quiet)
    except KeyboardInterrupt:
        quit()


if __name__ == '__main__':
    main()
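# Example invocation: re-download all matching IDBMG4 granules even if local
# copies already exist, with progress output suppressed:
#   $ python nsidc-data-download.py --force --quiet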