Skip to content

Instantly share code, notes, and snippets.

@samuelschumacher
Created January 21, 2015 03:27
Show Gist options
  • Save samuelschumacher/86a5bc6e40e611007dec to your computer and use it in GitHub Desktop.
ControlFinder
GPS1448
GPS1536
GPS1563
GPS1751
GPS1846
GPS1874
GPS1875
GPS1998
GPS2213
GPS2471
GPS2474
GPS2518
GPS2626
GPS2829
GPS2870
GPS3009
L26028
L26041
L26191
L26401
L26624
L26768
L26874
L26993
L27016
L27110
L27162
L27163
L27186
L27187
L27253
L27314
L27316
L27614
L27615
L27930
L27931
L28089
# This script automates the HMP download process. This script downloads all the HMP shapefiles according to a text file containing the relevant project IDs. The script then unzips the files and merges the shapefiles.
# The hmp_download.py script depends on a great many internal and external settings to run smoothly.
# First, the computer running it should have a Python environment as well as an editor that can display script content in a readable format, like IDLE or Notepad++.
# Second, the script must be able to reach our internal servers as well as the external websites (NOAA.gov).
# Third, the script must have valid packages to decompress, and following this decompression each of the resulting shapefiles must contain a valid geometry in order to be merged.
# What this means is that if an error is encountered:
# * Before anything even seems to happen, the computer may not have BeautifulSoup installed.
# Read the software requirements and try again.
# * During the download, look through the download directory to determine the final file downloaded.
# Attempt a manual download of this file.
# The site is given in the manual method, and will be listed in the python file in comments.
# If Python-savvy, you can then modify and rerun the script to resume downloading where you left off, otherwise continue manually.
# hmp_unzipmerge may be run separately from the download script after all files have been downloaded.
# * Following a successful download, check the integrity of the .zip files.
# If any cannot be unzipped manually, this is what is causing the error.
# Attempt a re-download and, in worst cases, merge only functional files.
# * Following successful unzipping, one shapefile may contain a corrupt or empty geometry.
# Although qGIS can automatically merge only valid geometries, do examine the files individually in qGIS to determine the problem dataset so that it may be flagged and reported to Brenda for removal from the list.
# At this point, if problems are being encountered, a manual merge is necessary.
# In qGIS, select 'Merge shapefiles to one' under the 'Data Management' submenu in the 'Vector' menu and respond to the resulting prompt.
# * If unzipping succeeds and all shapefiles are valid but the merge still fails, the computer performing the operations may not have the correct path specified to 'shapefile.py.'
# After ensuring that the correct version of the script is on the machine, move it to C:\PythonXX (where XX is the version, likely 27).
# Add this directory to the system path by visiting the 'System' control panel item, selecting 'Advanced system settings,' then clicking on 'Environment Variables...' under the 'Advanced' tab.
# Double-click on the 'Path' variable and without deleting anything, append a semicolon and the directory's path to the end of the variable definition.
import urllib2, urllib, zipfile, os, glob, shapefile, time
from bs4 import BeautifulSoup
from urlparse import urlsplit
from time import localtime, strftime, gmtime
# Define NOAA website url
noaaUrl = 'http://www.ngs.noaa.gov/cgi-bin/ds_proj_sf.prl'
# Define output directories based on the current month and year, e.g.
# Z:/PROJECTS/controlfinder/NGS/NGS_2015/NGS_Jan_2015/
# ('base_dir' instead of 'dir' so the builtin dir() is not shadowed.)
base_dir = 'Z:/PROJECTS/controlfinder/NGS/'
year_dir = base_dir + 'NGS_' + strftime("%Y/", localtime())
month_dir = year_dir + 'NGS_' + strftime("%b_%Y/", localtime())
# Create the year directory, the month directory, and the hmp zip/unzip
# work directories if they do not already exist. Order matters: each
# later path is nested inside an earlier one.
for needed_dir in (year_dir, month_dir,
                   month_dir + 'hmp_zip', month_dir + 'hmp_unzip'):
    if not os.path.isdir(needed_dir):
        os.mkdir(needed_dir)
# Create parameter dictionary to be encoded and sent to first page of NOAA url.
valueDict = {
'ProjBox': 'GPS1448',
'PREFIX': 'GPS1448',
'IncludeSelected': 'N',
'TypeSelected': 'MOD',
'StabilSelected': '0',
'MetaDataFormat': 'HTML',
'CompressType': 'Zipped'}
# Open text file with list of hmp project IDs.
projList = open('hmp_projIDs.txt').readlines()
# Print start time
print 'The START time is: ' + strftime("%A %c", localtime()) + '\n'
# Save the start time for calculation purpose
starttime = time.clock()
# Test with shortened project list
#projList = open('hmp_projIDs_short.txt').readlines()
# Create edited list to remove newline (\n) characters from each list item. Necessary to do so in this fashion because the last item does not have a trailing newline character.
projListEdit = []
a = 0
while a < len(projList)-1:
projListEdit.append(projList[a][:-1])
a += 1
projListEdit.append(projList[len(projList)-1])
count = 0
# Download every project's shapefile archive. For each project ID this is
# a two-step conversation with the NOAA CGI script: (1) POST the project
# ID to get the station-selection page, (2) POST the full station list
# back to receive the zipped shapefile.
for each in projListEdit:
    # Replace 'ProjBox' and 'PREFIX' value with new project ID and send post. Open html response and parse out station values using BeautifulSoup. Create list of station values.
    projstarttime = time.clock()
    valueDict['ProjBox'] = each
    valueDict['PREFIX'] = each
    projData = urllib.urlencode(valueDict)
    req = urllib2.Request(noaaUrl, projData)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response.read())
    # print 'Received response for project code: '+each+'\n'
    optionList = []
    # Each <option> value on the response page is a station identifier;
    # [:-1] drops the trailing character of each value.
    # NOTE(review): presumably that trailing character is a newline or
    # delimiter in the page markup -- confirm against a live response.
    for options in soup.find_all('option'):
        optionList.append(options.get('value')[:-1])
    # The first two <option> entries are not stations, so discard them.
    del optionList[0:2]
    # print 'Parsed out station values for project code: '+each+'\n'
    # Url-encoded key/value pairs for parameters other than station values.
    valueDict2 = {
        'QueryHidden': each,
        'MetaDataFormatHidden': 'HTML',
        'DisplayActive': 'IsDisplayed',
        'Get Shapefile': 'Get Shapefile',
        'CompressTypeHidden': 'Zipped',
        'SortSelected': 'Designation',
        'PrefixHidden': each,
        'ControlTypeHidden': 'MOD-0-0-0-0'}
    paramString = urllib.urlencode(valueDict2)
    # Append each encoded key (MarkSelected) - value (station) pair to the parameter string.
    # Built by hand because urlencode() cannot emit the same key
    # ('MarkSelected') many times from a plain dict.
    for station in optionList:
        i = urllib.urlencode({'MarkSelected': station})
        paramString = paramString + '&' + i
    # print 'Created parameter string including all station values for project code: '+each+'\n'
    # Send request with NOAA url and parameter values encoded in string, open response and send filename into variable, open file in output directory, write contents of received file and close.
    request = urllib2.Request(noaaUrl, paramString)
    response = urllib2.urlopen(request)
    # The server names the archive via the Content-Disposition header.
    filename = response.info()['Content-Disposition'].split('filename=')[1]
    f = open(month_dir+'/hmp_zip/'+filename, 'wb')
    f.write(response.read())
    f.close()
    # Update progress
    count += 1
    print 'Completed '+str(count)+' of '+str(len(projListEdit))+' projects. Download took ' + strftime("%M minutes and %S seconds", (gmtime(time.clock() - projstarttime)))
    # print 'Saved zip file for project code: '+each+'\n'
print '\nFinished downloading hmp shapefiles!!!\n'
print '\nTotal download time: ' + strftime("%H hours, %M minutes and %S seconds", (gmtime(time.clock() - starttime)))
# Create list of downloaded archives in the zip folder.
zipList = os.listdir(month_dir+'/hmp_zip')
# Unzip and output contents to the unzip directory for all files in the
# input folder. Each ZipFile is explicitly closed (the original leaked
# the handles and shadowed the builtin 'zip').
for zip_name in zipList:
    archive = zipfile.ZipFile(month_dir+'/hmp_zip/'+zip_name)
    try:
        archive.extractall(month_dir+'/hmp_unzip/')
    finally:
        archive.close()
# Gather every extracted shapefile for merging.
fileList = glob.glob(month_dir+'/hmp_unzip/'+'*.shp')
w = shapefile.Writer()
# Create shapefile using pyshp (shapefile) library...no idea how, but it works!
# Merges by appending each reader's geometries and attribute records
# directly onto the writer's lists. NOTE(review): w._shapes is a pyshp
# PRIVATE attribute -- this is fragile across pyshp versions; verify
# against the pinned shapefile.py. All inputs are assumed to share one
# field schema: w.fields is simply overwritten each pass, so the last
# file read wins.
for f in fileList:
    r = shapefile.Reader(f)
    w._shapes.extend(r.shapes())
    w.records.extend(r.records())
    w.fields = list(r.fields)
w.save(month_dir+'merged_shape')
# Write a companion .prj file so the merged shapefile carries its
# coordinate system: GCS_North_American_1983_HARN (well-known text).
epsg = 'GEOGCS["GCS_North_American_1983_HARN",DATUM["D_North_American_1983_HARN",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]'
with open(month_dir + 'merged_shape.prj', 'w') as prj:
    prj.write(epsg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment