Skip to content

Instantly share code, notes, and snippets.

@samuelschumacher
Created January 21, 2015 03:27
Show Gist options
  • Save samuelschumacher/86a5bc6e40e611007dec to your computer and use it in GitHub Desktop.
ControlFinder
GPS1448
GPS1536
GPS1563
GPS1751
GPS1846
GPS1874
GPS1875
GPS1998
GPS2213
GPS2471
GPS2474
GPS2518
GPS2626
GPS2829
GPS2870
GPS3009
L26028
L26041
L26191
L26401
L26624
L26768
L26874
L26993
L27016
L27110
L27162
L27163
L27186
L27187
L27253
L27314
L27316
L27614
L27615
L27930
L27931
L28089
# This script automates the HMP download process. This script downloads all the HMP shapefiles according to a text file containing the relevant project IDs. The script then unzips the files and merges the shapefiles.
# The hmp_download.py script depends on a great many internal and external settings to run smoothly.
# First, the computer running it should have a Python environment as well as an editor that can display script content in a readable format, like IDLE or Notepad++.
# Second, the script must be able to reach our internal servers as well as the external websites (NOAA.gov).
# Third, the script must have valid packages to decompress, and following this decompression each of the resulting shapefiles must contain a valid geometry in order to be merged.
# What this means is that if an error is encountered:
# * Before anything even seems to happen, the computer may not have BeautifulSoup installed.
# Read the software requirements and try again.
# * During the download, look through the download directory to determine the final file downloaded.
# Attempt a manual download of this file.
# The site is given in the manual method, and will be listed in the python file in comments.
# If Python-savvy, you can then modify and rerun the script to resume downloading where you left off, otherwise continue manually.
# hmp_unzipmerge may be run separately from the download script after all files have been downloaded.
# * Following a successful download, check the integrity of the .zip files.
# If any cannot be unzipped manually, this is what is causing the error.
# Attempt a re-download and, in worst cases, merge only functional files.
# * Following successful unzipping, one shapefile may contain a corrupt or empty geometry.
# Although qGIS can automatically merge only valid geometries, do examine the files individually in qGIS to determine the problem dataset so that it may be flagged and reported to Brenda for removal from the list.
# At this point, if problems are being encountered, a manual merge is necessary.
# In qGIS, select 'Merge shapefiles to one' under the 'Data Management' submenu in the 'Vector' menu and respond to the resulting prompt.
# * If unzipping succeeds and all shapefiles are valid but the merge still fails, the computer performing the operations may not have the correct path specified to 'shapefile.py.'
# After ensuring that the correct version of the script is on the machine, move it to C:\PythonXX (where XX is the version, likely 27).
# Add this directory to the system path by visiting the 'System' control panel item, selecting 'Advanced system settings,' then clicking on 'Environment Variables...' under the 'Advanced' tab.
# Double-click on the 'Path' variable and without deleting anything, append a semicolon and the directory's path to the end of the variable definition.
import urllib2, urllib, zipfile, os, glob, shapefile, time
from bs4 import BeautifulSoup
from urlparse import urlsplit
from time import localtime, strftime, gmtime
# Define NOAA website url
noaaUrl = 'http://www.ngs.noaa.gov/cgi-bin/ds_proj_sf.prl'
# Define output directories based on the current month and year, e.g.
# Z:/PROJECTS/controlfinder/NGS/NGS_2015/NGS_Jan_2015/
# ('base_dir' instead of 'dir' so the builtin dir() is not shadowed.)
base_dir = 'Z:/PROJECTS/controlfinder/NGS/'
year_dir = base_dir + 'NGS_' + strftime("%Y/", localtime())
month_dir = year_dir + 'NGS_' + strftime("%b_%Y/", localtime())
# Create the year directory, the month directory, and the hmp zip/unzip
# work directories if they do not already exist. Order matters: each
# later path is nested inside an earlier one.
for needed_dir in (year_dir, month_dir,
                   month_dir + 'hmp_zip', month_dir + 'hmp_unzip'):
    if not os.path.isdir(needed_dir):
        os.mkdir(needed_dir)
# Create parameter dictionary to be encoded and sent to first page of NOAA url.
valueDict = {
'ProjBox': 'GPS1448',
'PREFIX': 'GPS1448',
'IncludeSelected': 'N',
'TypeSelected': 'MOD',
'StabilSelected': '0',
'MetaDataFormat': 'HTML',
'CompressType': 'Zipped'}
# Open text file with list of hmp project IDs.
projList = open('hmp_projIDs.txt').readlines()
# Print start time
print 'The START time is: ' + strftime("%A %c", localtime()) + '\n'
# Save the start time for calculation purpose
starttime = time.clock()
# Test with shortened project list
#projList = open('hmp_projIDs_short.txt').readlines()
# Create edited list to remove newline (\n) characters from each list item. Necessary to do so in this fashion because the last item does not have a trailing newline character.
projListEdit = []
a = 0
while a < len(projList)-1:
projListEdit.append(projList[a][:-1])
a += 1
projListEdit.append(projList[len(projList)-1])
count = 0
# Download every project's shapefile archive. For each project ID this is
# a two-step conversation with the NOAA CGI script: (1) POST the project
# ID to get the station-selection page, (2) POST the full station list
# back to receive the zipped shapefile.
for each in projListEdit:
    # Replace 'ProjBox' and 'PREFIX' value with new project ID and send post. Open html response and parse out station values using BeautifulSoup. Create list of station values.
    projstarttime = time.clock()
    valueDict['ProjBox'] = each
    valueDict['PREFIX'] = each
    projData = urllib.urlencode(valueDict)
    req = urllib2.Request(noaaUrl, projData)
    response = urllib2.urlopen(req)
    soup = BeautifulSoup(response.read())
    # print 'Received response for project code: '+each+'\n'
    optionList = []
    # Each <option> value on the response page is a station identifier;
    # [:-1] drops the trailing character of each value.
    # NOTE(review): presumably that trailing character is a newline or
    # delimiter in the page markup -- confirm against a live response.
    for options in soup.find_all('option'):
        optionList.append(options.get('value')[:-1])
    # The first two <option> entries are not stations, so discard them.
    del optionList[0:2]
    # print 'Parsed out station values for project code: '+each+'\n'
    # Url-encoded key/value pairs for parameters other than station values.
    valueDict2 = {
        'QueryHidden': each,
        'MetaDataFormatHidden': 'HTML',
        'DisplayActive': 'IsDisplayed',
        'Get Shapefile': 'Get Shapefile',
        'CompressTypeHidden': 'Zipped',
        'SortSelected': 'Designation',
        'PrefixHidden': each,
        'ControlTypeHidden': 'MOD-0-0-0-0'}
    paramString = urllib.urlencode(valueDict2)
    # Append each encoded key (MarkSelected) - value (station) pair to the parameter string.
    # Built by hand because urlencode() cannot emit the same key
    # ('MarkSelected') many times from a plain dict.
    for station in optionList:
        i = urllib.urlencode({'MarkSelected': station})
        paramString = paramString + '&' + i
    # print 'Created parameter string including all station values for project code: '+each+'\n'
    # Send request with NOAA url and parameter values encoded in string, open response and send filename into variable, open file in output directory, write contents of received file and close.
    request = urllib2.Request(noaaUrl, paramString)
    response = urllib2.urlopen(request)
    # The server names the archive via the Content-Disposition header.
    filename = response.info()['Content-Disposition'].split('filename=')[1]
    f = open(month_dir+'/hmp_zip/'+filename, 'wb')
    f.write(response.read())
    f.close()
    # Update progress
    count += 1
    print 'Completed '+str(count)+' of '+str(len(projListEdit))+' projects. Download took ' + strftime("%M minutes and %S seconds", (gmtime(time.clock() - projstarttime)))
    # print 'Saved zip file for project code: '+each+'\n'
print '\nFinished downloading hmp shapefiles!!!\n'
print '\nTotal download time: ' + strftime("%H hours, %M minutes and %S seconds", (gmtime(time.clock() - starttime)))
# Create list of downloaded archives in the zip folder.
zipList = os.listdir(month_dir+'/hmp_zip')
# Unzip and output contents to the unzip directory for all files in the
# input folder. Each ZipFile is explicitly closed (the original leaked
# the handles and shadowed the builtin 'zip').
for zip_name in zipList:
    archive = zipfile.ZipFile(month_dir+'/hmp_zip/'+zip_name)
    try:
        archive.extractall(month_dir+'/hmp_unzip/')
    finally:
        archive.close()
# Gather every extracted shapefile for merging.
fileList = glob.glob(month_dir+'/hmp_unzip/'+'*.shp')
w = shapefile.Writer()
# Create shapefile using pyshp (shapefile) library...no idea how, but it works!
# Merges by appending each reader's geometries and attribute records
# directly onto the writer's lists. NOTE(review): w._shapes is a pyshp
# PRIVATE attribute -- this is fragile across pyshp versions; verify
# against the pinned shapefile.py. All inputs are assumed to share one
# field schema: w.fields is simply overwritten each pass, so the last
# file read wins.
for f in fileList:
    r = shapefile.Reader(f)
    w._shapes.extend(r.shapes())
    w.records.extend(r.records())
    w.fields = list(r.fields)
w.save(month_dir+'merged_shape')
# Write a companion .prj file so the merged shapefile carries its
# coordinate system: GCS_North_American_1983_HARN (well-known text).
epsg = 'GEOGCS["GCS_North_American_1983_HARN",DATUM["D_North_American_1983_HARN",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]'
with open(month_dir + 'merged_shape.prj', 'w') as prj:
    prj.write(epsg)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment