Download and process NOAA 5-min data
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 18:45:37 2019

@author: dan

Data source: https://www.ncdc.noaa.gov/crn/qcdatasets.html

Change `filebase` to your local storage location.

Run setup() to download the data and do all the necessary processing
(roughly 4 GB of downloads). Run plot() to compare a sampling method
against the 5-min mean; options are 'mid', 'time', and 'week'. A usage
sketch is at the end of this file.
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

plt.style.use('weather')  # custom local style sheet; substitute your own

filebase = '/Users/dan/Documents/Python/tmp_compare/data/'
# Stations to exclude: non-CONUS stations and stations with incomplete
# 2006 data
exclude = ['AK_Fairbanks_11_NE', 'AK_Sitka_1_NE', 'AK_St._Paul_4_NE',
           'AK_Utqiagvik_formerly_Barrow_4_ENE', 'HI_Hilo_5_S',
           'HI_Mauna_Loa_5_NNE', 'AR_Batesville_8_WNW',
           'TX_Bronte_11_NNE', 'SD_Pierre_24_S',
           'ND_Northgate_5_ESE', 'OR_Corvallis_10_SSW', 'WA_Quinault_4_NE',
           'AL_Russellville_4_SSE', 'AL_Scottsboro_2_NE', 'AL_Fairhope_3_NE',
           'PA_Avondale_2_N', 'AL_Cullman_3_ENE',
           'AL_Courtland_2_WSW', 'AL_Valley_Head_1_SSW']
def get_names():
    names = pd.read_csv(filebase+'names.txt', sep='\t', index_col=0)
    return names

def save_names(names):
    names.to_csv(filebase+'names.txt', sep='\t')

def get_data():
    '''Run once to download all the data. 'names.txt' starts as a simple
       list of station names to download; the metadata columns are filled
       in as files are retrieved (see the sketch below).
    '''
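    # A minimal sketch of the assumed 'names.txt' layout, inferred from how
    # this script reads and writes it (tab-separated, station name as the
    # index; metadata columns may start out blank). The station name and
    # values below are hypothetical:
    #
    #     name              WBANNO  Long     Lat    Weight
    #     XX_Example_1_N    12345   -105.54  40.03  0.587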
    names = get_names()
    base = 'https://www1.ncdc.noaa.gov/pub/data/uscrn/products/subhourly01/'
    pre = '/CRNS0101-05-'
    sfx = '.txt'
    for i in names.index:
        name = names.loc[i]  # view into names
        print(name.name+' start')
        # check for already-downloaded files
        files = glob.glob(filebase+f'{name.name}-*.txt')
        # files exist, but the station metadata still needs filling in
        if (len(files) > 0) and (np.isnan(name.Lat)):
            url = base + '2006' + pre + '2006-' + name.name + sfx
            df = pd.read_csv(url,
                             sep=r'\s+',
                             header=None,
                             index_col=False,
                             usecols=[0, 6, 7],
                             names=['WBANNO', 'Long', 'Lat'],
                             nrows=2,
                             )
            name.WBANNO = df.WBANNO[0]
            name.Long = df.Long[0]
            name.Lat = df.Lat[0]
            save_names(names)
            continue
        for year in range(2006, 2019):
            print(year)
            furl = filebase+f'{name.name}-{year}.txt'
            if furl in files: continue
            url = base+f'{year}'+pre+f'{year}-'+name.name+sfx
            try:
                df = pd.read_csv(url,
                                 sep=r'\s+',
                                 header=None,
                                 index_col=False,
                                 usecols=[0, 1, 2, 6, 7, 8],
                                 names=['WBANNO', 'date', 'time',
                                        'Long', 'Lat', 't'],
                                 # combine UTC date and time columns into
                                 # one 'dt' column (older pandas behaviour)
                                 parse_dates={'dt': [1, 2]},
                                 na_values=-9999)
            except Exception:
                print(f'{year} not found')
                continue
            if np.isnan(name.Lat):
                name.WBANNO = df.WBANNO[0]
                name.Long = df.Long[0]
                name.Lat = df.Lat[0]
                save_names(names)
            df.to_csv(filebase+f'{name.name}-{year}.txt', sep='\t',
                      index=False, columns=['dt', 't'])
        print(name.name+' end')

def list_data(names):
    '''Print stations with fewer than the full 13 years (2006-2018)
       of downloaded files.
    '''
    for i in names.index:
        n = names.loc[i]
        files = glob.glob(filebase+f'{n.name}-*.txt')
        number = len(files)
        if number < 13:
            print(f'{n.name} {number}')

def read_data(name):
    '''Load all yearly files for a station into one frame, interpolating
       over missing values.
    '''
    files = glob.glob(filebase+f'{name}-*.txt')
    files.sort()
    df = pd.concat([pd.read_csv(f,
                                sep='\t', index_col=0, parse_dates=True,
                                ) for f in files])
    df.interpolate(method='time', inplace=True)
    df = df.loc[df.index.year < 2019]
    return df

def process():
    '''Run once to process data for faster plotting
    '''
    names = get_names()
    for name in names.index:
        print(name)
        mf = read_data(name)
        mf['y'] = mf.index.year
        mf['d'] = mf.index.dayofyear
        mgb = mf.groupby(['y', 'd'])
        df = mgb.mean()
        df['mid'] = (mgb.max() + mgb.min())/2.
        df['diff'] = df.mid - df.t
        # find the UTC time corresponding to 9 am local (mean solar) time
        lon = names.Long[name]
        hh = (9. - lon * 24./360.) % 24
        h = np.floor(hh)
        mm = (hh - h) * 60
        m = (np.floor(mm/5 + .5) * 5)  # round to the nearest 5-min sample
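        # Worked example (hypothetical station): the file timestamps are
        # UTC, so for a station at longitude -105 (105 W) local solar time
        # lags UTC by 105 * 24/360 = 7 hours, giving hh = (9 + 7) % 24 = 16;
        # the 16:00 UTC sample stands in for the 9 am local reading. The
        # h and m computed above round a fractional hour to the nearest
        # 5-minute sample.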
        sf = mf.loc[mf.index.hour == h, ['y', 't']]  # daily 9 am sample
        sf = sf.loc[sf.index.minute == m]
        wf = sf.loc[sf.index.dayofweek == 1]  # weekly (Tuesday) sample
        yf = df.groupby('y').mean()
        yf['time'] = sf.groupby('y').mean()
        yf['week'] = wf.groupby('y').mean()
        yf.to_csv(filebase+f'{name}_annual.txt')

def read_annual(name):
    yf = pd.read_csv(filebase+f'{name}_annual.txt', index_col=0)
    return yf

def set_weights():
    '''Run once to calculate a weight for each station: the squared
       distance to its nearest neighbour on a flattened (equirectangular)
       projection, so isolated stations count for more in the average.
    '''
    names = get_names()
    for i in names.index:
        n = names.loc[i]
        y1 = n.Lat
        x1 = n.Long*np.cos(y1 * np.pi/180.)  # scale longitude by latitude
        weights = []
        for j in names.index:
            nn = names.loc[j]
            y2 = nn.Lat
            x2 = nn.Long*np.cos(y1 * np.pi/180.)
            weights.append((x1-x2)**2 + (y1-y2)**2)
        weights.sort()
        n.Weight = weights[1]  # weights[0] is the station's distance to itself
        if n.Weight < .05:  # nearly co-located pair: use half the next distance
            n.Weight = weights[2]/2.
    save_names(names)
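
# A quick sanity check of the weighting above, with hypothetical values: a
# station at (40.0 N, 105.0 W) whose nearest neighbour sits at (40.0 N,
# 104.0 W) projects to x1 = -105*cos(40 deg) ~ -80.43 and x2 ~ -79.67, so
# its weight is (x1-x2)**2 + 0**2 ~ 0.59 squared degrees. A station whose
# nearest neighbour is 5 degrees away would get a weight ~25x larger,
# offsetting the over-sampling of dense station clusters when the weighted
# average is taken in plot().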

def setup():
    get_data()
    set_weights()
    process()

def plot(sample='mid'):
    names = get_names()
    # exclude stations with incomplete data in 2006, plus non-CONUS
    names.drop(labels=exclude, inplace=True)
    fig = plt.figure(f'Average-{sample}', clear=True)
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    af = pd.DataFrame(index=list(range(2006, 2019)))
    df = af.copy()
    wtscale = 0
    # plot stations, collect weights for scaling later
    for name in names.index:
        wtscale += names.Weight[name]
        yf = read_annual(name)
        af[name] = yf.t - yf.t[2006]
        df[name] = yf[sample] - yf[sample][2006]
        ax1.plot(af[name], color='k', alpha=0.1, lw=1.0)
        ax2.plot(df[name], color='k', alpha=0.1, lw=1.0)
    # weight each station
    for name in af.columns:
        af[name] *= names.Weight[name]/wtscale
        df[name] *= names.Weight[name]/wtscale
    # calculate means of weighted stations (by adding)
    for fr, ax in zip([af, df], [ax1, ax2]):
        fr['Avg'] = fr.sum(axis=1)
        ax.plot(fr.Avg, color='r', lw=3.0)
        x = fr.index.values
        slope, intercept = np.polyfit(x, fr.Avg.values, 1)
        y = slope * x + intercept
        ax.plot(x, y, color='b', lw=2.0)
        decade = np.around(slope*10, decimals=3)
        text = f'Trend: {decade}°C/decade'
        ax.annotate(text, (x[7], y[7]),
                    xytext=(-20, 20), textcoords='offset points')
    ax1.set_title('Temperature Trend in CONUS Using Different Techniques')
    ax1.set_ylabel('Anomaly in °C')
    ax2.set_ylabel('Anomaly in °C')
    ax1.annotate('Daily Temperature from Mean of 5-min Values',
                 (.01, .9), xycoords='axes fraction')
    tdict = {'time': 'Daily Local 9 am Temperature',
             'week': 'Weekly Tuesday 9 am Temperature',
             'mid': 'Daily Temperature from Average of Hi/Lo Values'}
    ax2.annotate(tdict[sample], (.01, .9), xycoords='axes fraction')
    plt.show()
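
# A minimal usage sketch, assuming `filebase` above points at an existing
# directory that already contains a 'names.txt' station list:
if __name__ == '__main__':
    setup()  # download ~4 GB and pre-process; only needs to run once
    for sample in ('mid', 'time', 'week'):
        plot(sample)  # compare each sampling method against the 5-min mean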