Download and process NOAA 5-min data
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 18:45:37 2019

@author: dan

Data source: https://www.ncdc.noaa.gov/crn/qcdatasets.html

Change `filebase` to your local storage location.

Run setup() to download the data and do all the necessary processing
(roughly 4 GB of downloads). Run plot() to compare a sampling method
against the 5-min mean; options are 'mid', 'time', and 'week'. A usage
sketch is at the end of this file.
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

plt.style.use('weather')  # custom local style sheet; substitute your own

filebase = '/Users/dan/Documents/Python/tmp_compare/data/'
# Stations to exclude: non-CONUS stations and stations with incomplete
# 2006 data
exclude = ['AK_Fairbanks_11_NE', 'AK_Sitka_1_NE', 'AK_St._Paul_4_NE',
           'AK_Utqiagvik_formerly_Barrow_4_ENE', 'HI_Hilo_5_S',
           'HI_Mauna_Loa_5_NNE', 'AR_Batesville_8_WNW',
           'TX_Bronte_11_NNE', 'SD_Pierre_24_S',
           'ND_Northgate_5_ESE', 'OR_Corvallis_10_SSW', 'WA_Quinault_4_NE',
           'AL_Russellville_4_SSE', 'AL_Scottsboro_2_NE', 'AL_Fairhope_3_NE',
           'PA_Avondale_2_N', 'AL_Cullman_3_ENE',
           'AL_Courtland_2_WSW', 'AL_Valley_Head_1_SSW']
def get_names():
    names = pd.read_csv(filebase+'names.txt', sep='\t', index_col=0)
    return names

def save_names(names):
    names.to_csv(filebase+'names.txt', sep='\t')

def get_data():
    '''Run once to download all the data. 'names.txt' starts as a simple
       list of station names to download; the metadata columns are filled
       in as files are retrieved (see the sketch below).
    '''
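    # A minimal sketch of the assumed 'names.txt' layout, inferred from how
    # this script reads and writes it (tab-separated, station name as the
    # index; metadata columns may start out blank). The station name and
    # values below are hypothetical:
    #
    #     name              WBANNO  Long     Lat    Weight
    #     XX_Example_1_N    12345   -105.54  40.03  0.587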
    names = get_names()
    base = 'https://www1.ncdc.noaa.gov/pub/data/uscrn/products/subhourly01/'
    pre = '/CRNS0101-05-'
    sfx = '.txt'
    for i in names.index:
        name = names.loc[i]  # view into names
        print(name.name+' start')
        # check for already-downloaded files
        files = glob.glob(filebase+f'{name.name}-*.txt')
        # files exist, but the station metadata still needs filling in
        if (len(files) > 0) and (np.isnan(name.Lat)):
            url = base + '2006' + pre + '2006-' + name.name + sfx
            df = pd.read_csv(url,
                             sep=r'\s+',
                             header=None,
                             index_col=False,
                             usecols=[0, 6, 7],
                             names=['WBANNO', 'Long', 'Lat'],
                             nrows=2,
                             )
            name.WBANNO = df.WBANNO[0]
            name.Long = df.Long[0]
            name.Lat = df.Lat[0]
            save_names(names)
            continue
        for year in range(2006, 2019):
            print(year)
            furl = filebase+f'{name.name}-{year}.txt'
            if furl in files: continue
            url = base+f'{year}'+pre+f'{year}-'+name.name+sfx
            try:
                df = pd.read_csv(url,
                                 sep=r'\s+',
                                 header=None,
                                 index_col=False,
                                 usecols=[0, 1, 2, 6, 7, 8],
                                 names=['WBANNO', 'date', 'time',
                                        'Long', 'Lat', 't'],
                                 # combine UTC date and time columns into
                                 # one 'dt' column (older pandas behaviour)
                                 parse_dates={'dt': [1, 2]},
                                 na_values=-9999)
            except Exception:
                print(f'{year} not found')
                continue
            if np.isnan(name.Lat):
                name.WBANNO = df.WBANNO[0]
                name.Long = df.Long[0]
                name.Lat = df.Lat[0]
                save_names(names)
            df.to_csv(filebase+f'{name.name}-{year}.txt', sep='\t',
                      index=False, columns=['dt', 't'])
        print(name.name+' end')

def list_data(names):
    '''Print stations with fewer than the full 13 years (2006-2018)
       of downloaded files.
    '''
    for i in names.index:
        n = names.loc[i]
        files = glob.glob(filebase+f'{n.name}-*.txt')
        number = len(files)
        if number < 13:
            print(f'{n.name} {number}')

def read_data(name):
    '''Load all yearly files for a station into one frame, interpolating
       over missing values.
    '''
    files = glob.glob(filebase+f'{name}-*.txt')
    files.sort()
    df = pd.concat([pd.read_csv(f,
                                sep='\t', index_col=0, parse_dates=True,
                                ) for f in files])
    df.interpolate(method='time', inplace=True)
    df = df.loc[df.index.year < 2019]
    return df

def process():
    '''Run once to process data for faster plotting
    '''
    names = get_names()
    for name in names.index:
        print(name)
        mf = read_data(name)
        mf['y'] = mf.index.year
        mf['d'] = mf.index.dayofyear
        mgb = mf.groupby(['y', 'd'])
        df = mgb.mean()
        df['mid'] = (mgb.max() + mgb.min())/2.
        df['diff'] = df.mid - df.t
        # find the UTC time corresponding to 9 am local (mean solar) time
        lon = names.Long[name]
        hh = (9. - lon * 24./360.) % 24
        h = np.floor(hh)
        mm = (hh - h) * 60
        m = (np.floor(mm/5 + .5) * 5)  # round to the nearest 5-min sample
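        # Worked example (hypothetical station): the file timestamps are
        # UTC, so for a station at longitude -105 (105 W) local solar time
        # lags UTC by 105 * 24/360 = 7 hours, giving hh = (9 + 7) % 24 = 16;
        # the 16:00 UTC sample stands in for the 9 am local reading. The
        # h and m computed above round a fractional hour to the nearest
        # 5-minute sample.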
        sf = mf.loc[mf.index.hour == h, ['y', 't']]  # daily 9 am sample
        sf = sf.loc[sf.index.minute == m]
        wf = sf.loc[sf.index.dayofweek == 1]  # weekly (Tuesday) sample
        yf = df.groupby('y').mean()
        yf['time'] = sf.groupby('y').mean()
        yf['week'] = wf.groupby('y').mean()
        yf.to_csv(filebase+f'{name}_annual.txt')

def read_annual(name):
    yf = pd.read_csv(filebase+f'{name}_annual.txt', index_col=0)
    return yf

def set_weights():
    '''Run once to calculate a weight for each station: the squared
       distance to its nearest neighbour on a flattened (equirectangular)
       projection, so isolated stations count for more in the average.
    '''
    names = get_names()
    for i in names.index:
        n = names.loc[i]
        y1 = n.Lat
        x1 = n.Long*np.cos(y1 * np.pi/180.)  # scale longitude by latitude
        weights = []
        for j in names.index:
            nn = names.loc[j]
            y2 = nn.Lat
            x2 = nn.Long*np.cos(y1 * np.pi/180.)
            weights.append((x1-x2)**2 + (y1-y2)**2)
        weights.sort()
        n.Weight = weights[1]  # weights[0] is the station's distance to itself
        if n.Weight < .05:  # nearly co-located pair: use half the next distance
            n.Weight = weights[2]/2.
    save_names(names)
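
# A quick sanity check of the weighting above, with hypothetical values: a
# station at (40.0 N, 105.0 W) whose nearest neighbour sits at (40.0 N,
# 104.0 W) projects to x1 = -105*cos(40 deg) ~ -80.43 and x2 ~ -79.67, so
# its weight is (x1-x2)**2 + 0**2 ~ 0.59 squared degrees. A station whose
# nearest neighbour is 5 degrees away would get a weight ~25x larger,
# offsetting the over-sampling of dense station clusters when the weighted
# average is taken in plot().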

def setup():
    get_data()
    set_weights()
    process()

def plot(sample='mid'):
    names = get_names()
    # exclude stations with incomplete data in 2006, plus non-CONUS
    names.drop(labels=exclude, inplace=True)
    fig = plt.figure(f'Average-{sample}', clear=True)
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    af = pd.DataFrame(index=list(range(2006, 2019)))
    df = af.copy()
    wtscale = 0
    # plot stations, collect weights for scaling later
    for name in names.index:
        wtscale += names.Weight[name]
        yf = read_annual(name)
        af[name] = yf.t - yf.t[2006]
        df[name] = yf[sample] - yf[sample][2006]
        ax1.plot(af[name], color='k', alpha=0.1, lw=1.0)
        ax2.plot(df[name], color='k', alpha=0.1, lw=1.0)
    # weight each station
    for name in af.columns:
        af[name] *= names.Weight[name]/wtscale
        df[name] *= names.Weight[name]/wtscale
    # calculate means of weighted stations (by adding)
    for fr, ax in zip([af, df], [ax1, ax2]):
        fr['Avg'] = fr.sum(axis=1)
        ax.plot(fr.Avg, color='r', lw=3.0)
        x = fr.index.values
        slope, intercept = np.polyfit(x, fr.Avg.values, 1)
        y = slope * x + intercept
        ax.plot(x, y, color='b', lw=2.0)
        decade = np.around(slope*10, decimals=3)
        text = f'Trend: {decade}°C/decade'
        ax.annotate(text, (x[7], y[7]),
                    xytext=(-20, 20), textcoords='offset points')
    ax1.set_title('Temperature Trend in CONUS Using Different Techniques')
    ax1.set_ylabel('Anomaly in °C')
    ax2.set_ylabel('Anomaly in °C')
    ax1.annotate('Daily Temperature from Mean of 5-min Values',
                 (.01, .9), xycoords='axes fraction')
    tdict = {'time': 'Daily Local 9 am Temperature',
             'week': 'Weekly Tuesday 9 am Temperature',
             'mid': 'Daily Temperature from Average of Hi/Lo Values'}
    ax2.annotate(tdict[sample], (.01, .9), xycoords='axes fraction')
    plt.show()
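
# A minimal usage sketch, assuming `filebase` above points at an existing
# directory that already contains a 'names.txt' station list:
if __name__ == '__main__':
    setup()  # download ~4 GB and pre-process; only needs to run once
    for sample in ('mid', 'time', 'week'):
        plot(sample)  # compare each sampling method against the 5-min mean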