|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import matplotlib.dates as mdates |
|
import pandas as pd |
|
from datetime import datetime, timedelta |
|
import re |
|
import sys |
|
|
|
def read_jrnl_sport(jrnlfile, colormap='Pastel1'):
    """Load a sport journal file into a DataFrame and build a colour lookup.

    Every line of *jrnlfile* is split on ``@`` into the entry text and the
    sport tag.  The text inside the first ``[...]`` pair of the entry is
    parsed as the date and becomes the index; the entry text from character
    13 onwards is kept as the description.

    Parameters
    ----------
    jrnlfile : str or path-like
        File handed to ``pandas.read_csv``.
    colormap : str, optional
        Name of the Matplotlib colormap the sport colours are taken from.

    Returns
    -------
    df : pandas.DataFrame
        Indexed by ``date``, with columns ``sport`` (categorical), ``desc``
        and ``sport_id`` (the category codes of ``sport``).
    cdict : dict
        Maps every sport category to one colour of the colormap.
    """
    # convert datafile to Pandas DataFrame
    df = pd.read_csv(jrnlfile, sep='@', names=['tmp', 'sport'])

    # Raw string avoids invalid-escape warnings; the non-greedy group picks
    # the FIRST bracket pair, so brackets later in the text are not mistaken
    # for the date.
    date_text = df['tmp'].str.extract(r'\[(.*?)\]', expand=False)
    df['date'] = date_text.astype('datetime64[ns]')
    df.set_index('date', inplace=True)

    # NOTE(review): offset 13 presumably skips a "[YYYY-MM-DD] " prefix --
    # confirm against the journal's date format.
    df['desc'] = df['tmp'].str.slice(13)

    # categorical sport column plus its numerical ids
    df['sport'] = df['sport'].astype('category')
    df['sport_id'] = df['sport'].cat.codes

    # assign one colour to each sport activity
    allsports = df['sport'].cat.categories
    cmap = plt.get_cmap(colormap)  # plt.cm.get_cmap is gone in new Matplotlib
    palette = getattr(cmap, 'colors', None)
    if palette is None:
        # continuous colormaps expose no discrete colour list: sample one
        palette = cmap(np.linspace(0, 1, max(len(allsports), 1)))
    # cycle through the palette so every sport gets a colour even when there
    # are more sports than colours
    cdict = {sport: palette[pos % len(palette)]
             for pos, sport in enumerate(allsports)}

    # the raw text column is no longer needed
    return df.drop(columns=['tmp']), cdict
|
|
|
|
|
def select_period(df, year=None, date=None, days=30, forward=False):
    """Return the rows of *df* that fall inside a calendar year or a window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex``.
    year : int, optional
        When given, rows whose index year equals *year* are returned and all
        other arguments are ignored.
    date : str or datetime-like, optional
        Anchor of the window.  Strings must be formatted ``YYYY-MM-DD``;
        ``datetime``/``date``/``Timestamp`` objects are accepted as well.
        Defaults to the current date and time.
    days : int, optional
        Length of the window in days.
    forward : bool, optional
        ``False`` (default) selects the *days* before the anchor, ``True``
        the *days* after it.  Both window ends are inclusive.

    Returns
    -------
    pandas.DataFrame
        The selected rows.

    Examples
    --------
    Select a single year::

        df_sel = select_period(df, year=2021)

    Select the last 30 days counting back from now::

        df_sel = select_period(df, days=30)

    Select the 30 days leading up to a specific date::

        df_sel = select_period(df, date='2020-06-19', days=30)
    """
    if year is not None:
        return df.loc[df.index.year == year]

    if date is None:
        anchor = datetime.now()
    elif isinstance(date, str):
        anchor = datetime.strptime(date, '%Y-%m-%d')
    else:
        # datetime-like objects need no parsing
        anchor = pd.Timestamp(date)

    span = timedelta(days=days)
    if forward:
        start_date, end_date = anchor, anchor + span
    else:
        start_date, end_date = anchor - span, anchor

    mask = (df.index >= start_date) & (df.index <= end_date)
    return df.loc[mask]
|
|
|
|
|
def get_runs(df):
    """Extract trail runs and normal runs with distance, time and speed.

    Rows whose ``sport`` is ``'TRAIL'`` or ``'CORSA'`` are kept.  Numbers are
    parsed from the ``desc`` column: the first number directly followed by
    ``km`` is the distance, the first number directly followed by ``m`` is
    the time in minutes (normal runs only).

    For normal runs ``speed`` is ``time / distance`` (minutes per km).  Runs
    that recorded only one of the two quantities get the missing one
    estimated from the average speed of the fully recorded runs, and their
    speed is set to that average.  Trail runs only get a ``distance``; their
    ``time`` and ``speed`` are NaN in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``sport`` and ``desc`` columns.

    Returns
    -------
    pandas.DataFrame
        Normal and trail runs combined, sorted by index.
    """
    number = r'((?:[0-9]*[.])?[0-9]+)'

    def parse(frame, unit):
        # First number immediately followed by `unit`, NaN when absent.
        # Returned as a plain array so assignment is positional and not
        # affected by duplicate index labels.
        found = frame['desc'].str.extract(number + unit, expand=False)
        return found.astype(float).to_numpy()

    # ----------------------------------
    # FIRST, trail runs: only the distance is of interest
    trails = df[df['sport'] == 'TRAIL']
    trails = trails.assign(distance=parse(trails, 'km'))

    # ----------------------------------
    # SECOND, normal runs: distance and time
    runs = df[df['sport'] == 'CORSA']
    runs = runs.assign(distance=parse(runs, 'km'), time=parse(runs, 'm'))

    nodist = runs['distance'].isna()
    notime = runs['time'].isna()

    # speed is NaN where either time or distance is not registered
    runs['speed'] = runs['time'] / runs['distance']

    # average speed in minutes/km over the fully recorded runs
    avgspeed = runs['speed'].mean()

    # only distance recorded: estimate the time from the average speed
    onlydist = notime & ~nodist
    runs.loc[onlydist, 'time'] = (
        runs.loc[onlydist, 'distance'].to_numpy() * avgspeed)
    runs.loc[onlydist, 'speed'] = avgspeed

    # only time recorded: estimate the distance from the average speed
    onlytime = ~notime & nodist
    runs.loc[onlytime, 'distance'] = (
        runs.loc[onlytime, 'time'].to_numpy() / avgspeed)
    runs.loc[onlytime, 'speed'] = avgspeed

    # ----------------------------------
    # LAST, combine both kinds of runs, sorted on dates
    # (DataFrame.append no longer exists in pandas 2)
    return pd.concat([runs, trails]).sort_index()
|
|
|
|
|
def plot_sportpie(df, cdict, ax=None):
    """Draw a pie chart with the number of activities per sport.

    Each slice is labelled with its activity count and percentage; the title
    shows the first and last date of the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and a categorical ``sport`` column.
    cdict : dict
        Maps each sport to its slice colour.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; when omitted a new figure (number 1) is created.
    """
    total = len(df)

    def label_pie_slice(val):
        # `val` is the slice percentage handed over by the pie's autopct
        return f'{val/100*total:.0f} ({val:.0f}%)'

    # Unused categories are dropped first, so `observed=True` does not change
    # the result; it only states the intent explicitly.
    counts = df.groupby(df['sport'].cat.remove_unused_categories(),
                        sort=False, observed=True).size()
    # labels and colours come from the very same Series, so they always line
    # up with the slices
    sports = counts.index.tolist()

    date_min = df.index.min().strftime('%Y-%m-%d')
    date_max = df.index.max().strftime('%Y-%m-%d')
    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, num=1)
    counts.plot(kind='pie', autopct=label_pie_slice,
                colors=[cdict[key] for key in sports], ax=ax)
    ax.set_title('START: {}\nEND: {}'.format(date_min, date_max), fontsize='x-large')
    ax.set_ylabel('')
|
|
|
|
|
def plot_sportbar(df, cdict, ax=None):
    """Draw a bar chart with the number of activities per sport category.

    One bar is drawn per category of the ``sport`` column, coloured through
    *cdict*; the title shows the first and last date of the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and a categorical ``sport`` column.
    cdict : dict
        Maps each sport category to its bar colour.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; when omitted a new figure (number 2) is created.
    """
    first_day = df.index.min().strftime('%Y-%m-%d')
    last_day = df.index.max().strftime('%Y-%m-%d')
    category_names = df['sport'].cat.categories.to_list()

    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, num=2)

    # unsorted counts, coloured with one colour per category name
    activity_counts = df['sport'].value_counts(sort=False)
    bar_colors = [cdict[name] for name in category_names]
    activity_counts.plot(kind='bar', rot=45, ax=ax, color=bar_colors)

    title = 'START: {}\nEND: {}'.format(first_day, last_day)
    ax.set_title(title, fontsize='x-large')
    ax.set_ylabel('Number of activities')
|
|
|
|
|
def plot_runs_hist(df, ax=None):
    """Plot a histogram of run distances with mean and quartile markers.

    Vertical lines mark the mean, the 25th and the 75th percentile of the
    ``distance`` column.  Rows whose ``sport`` is ``'TRAIL'`` are additionally
    marked along the bottom of the plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``distance`` and ``sport`` columns.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; when omitted a new figure (number 3) is created.
    """
    distances = df.distance

    # summary statistics shown as vertical lines
    summary = distances.describe()
    markers = (
        (distances.mean(), '--', 'Mean: {:.1f} km'),
        (summary.loc['25%'], ':', 'P25: {:.1f} km'),
        (summary.loc['75%'], ':', 'P75: {:.1f} km'),
    )

    trail_runs = df[df.sport == 'TRAIL']

    line_style = dict(lw=2, alpha=0.5)
    trail_style = dict(marker=2, color='g', ls='none', ms=10, label='Trail Runs')

    if ax is None:
        _, ax = plt.subplots(constrained_layout=True, figsize=(8, 4), num=3)

    distances.plot.hist(bins=50, color='k', alpha=0.25, ax=ax, label='')
    for value, dash, template in markers:
        ax.axvline(value, color='r', ls=dash, label=template.format(value), **line_style)

    n_trails = trail_runs.shape[0]
    if n_trails > 0:
        # trail runs sit on the baseline (y = 0) at their distance
        ax.plot(trail_runs.distance, np.zeros(n_trails), **trail_style)

    ax.legend()
    ax.set_xlabel('Distance (km)')
|
|
|
|
|
def _format_pace(minutes_per_km):
    """Format a pace given in decimal minutes as ``M'S"`` ('n/a' if unknown)."""
    if not np.isfinite(minutes_per_km):
        return 'n/a'
    # Round the total number of seconds first so that e.g. 4.9999 min becomes
    # 5'0" instead of 4'60".
    whole_minutes, seconds = divmod(int(round(minutes_per_km * 60)), 60)
    return "{:.0f}'{:.0f}\"".format(whole_minutes, seconds)


def plot_runs_stats(df):
    """Plot time, distance and speed of every run plus a statistics box.

    Creates figure number 4 with three stacked subplots sharing the x axis:

    1. time per run as bars, weekly totals as markers on a twin axis;
    2. distance per run as bars, weekly totals as markers on a twin axis;
    3. speed per run as bars with a 7-day rolling mean drawn as a line.

    A text box next to the plots lists the number of runs, runs per week,
    average and best distance and speed, total distance, and the average and
    maximum weekly distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and ``distance``, ``time`` and
        ``speed`` columns.  The first and last index entries are taken as
        the start and end of the period, so the index is assumed to be
        sorted in ascending order.
    """
    # Weekly totals start at the first run that took place on a Sunday; when
    # no run falls on a Sunday the whole period is used instead of failing.
    on_sunday = df.index.day_name() == 'Sunday'
    if on_sunday.any():
        start_weekly_stats = df.index[on_sunday][0]
    else:
        start_weekly_stats = df.index.min()
    in_weekly_range = df.index >= start_weekly_stats
    # Only the numeric columns are summed: text/categorical columns cannot be
    # aggregated on current pandas.
    weekly_runs = (df.loc[in_weekly_range, ['distance', 'time']]
                   .resample('W', closed='left').sum())
    avg_weekd = weekly_runs.distance.mean()
    max_weekd = weekly_runs.distance.max()

    # calculate stats
    runs = df.shape[0]
    # at least one week, so periods shorter than 7 days do not divide by zero
    weeks = max((df.index[-1] - df.index[0]).days // 7, 1)
    freq = runs / weeks
    avgdst = df['distance'].mean()
    pb_dst = df['distance'].max()
    totdst = df['distance'].sum()
    avgspd = _format_pace(df['speed'].mean())
    pb_spd = _format_pace(df['speed'].min())

    textstr = '\n'.join([
        '',
        f'Runs: {runs}',
        f'Frequency: {freq:.1f} runs/week',
        f'Avg distance: {avgdst:.1f} km',
        f'Avg speed: {avgspd} min/km',
        f'PB distance: {pb_dst} km',
        f'PB speed: {pb_spd} mins/km',
        f'Total distance: {totdst:.0f} km',
        '',
        f'Avg weekly: {avg_weekd:.1f} km',
        f'Max weekly: {max_weekd:.1f} km',
        '',
    ])

    opt = dict(color='k', marker='_', ls='none')
    _, ax = plt.subplots(nrows=3, num=4,
                         sharex=True, figsize=(10, 6))

    # first subplot: TIME
    ax[0].bar(df.index, height=df.time, width=1.5, alpha=0.2, color='b')
    ax[0].set_ylabel('Time (min)', color='b')
    ax[0].tick_params(axis='y', colors='b')
    axwt = ax[0].twinx()
    axwt.plot(weekly_runs.index, weekly_runs.time, **opt)
    # the markers are weekly sums, so label them as totals
    axwt.set_ylabel('Weekly total', color='k')
    axwt.tick_params(axis='y', colors='k')

    # second subplot: DISTANCES
    ax[1].bar(df.index, height=df.distance, width=1.5, alpha=0.2, color='g')
    ax[1].set_ylabel('Distance (km)', color='g')
    ax[1].tick_params(axis='y', colors='g')
    axwd = ax[1].twinx()
    axwd.plot(weekly_runs.index, weekly_runs.distance, **opt)
    axwd.set_ylabel('Weekly total', color='k')
    axwd.tick_params(axis='y', colors='k')

    # third subplot: SPEED
    ax[2].bar(df.index, height=df.speed, width=1.5, alpha=0.2, color='r')
    # rolling mean of the speed column only (other columns are not numeric)
    ax[2].plot(df['speed'].rolling('7d').mean(), '-r')
    ax[2].set_ylabel('Speed (min/km)', color='r')
    ax[2].tick_params(axis='y', colors='r')
    ax[2].tick_params(axis='x', rotation=45)
    ax[2].set_ylim(np.floor(df.speed.min()), np.ceil(df.speed.max()))

    # turn on horizontal gridlines
    for aa in ax:
        aa.grid(axis='y')

    # add stats box in the margin freed up by subplots_adjust
    props = dict(boxstyle='round', facecolor='white', alpha=0.7)
    plt.figtext(0.78, 0.5, textstr, va='center', bbox=props)
    plt.subplots_adjust(right=0.7, left=0.1, top=0.95, bottom=0.1)
|
|
|
|
|
def make_plots(df, cdict):
    """Create all figures for *df* and show them.

    The pie and bar charts are drawn from the full activity frame with the
    colours in *cdict*; the histogram and the statistics figure are drawn
    from the runs extracted by ``get_runs``.
    """
    # overview charts over every activity
    for draw_overview in (plot_sportpie, plot_sportbar):
        draw_overview(df, cdict)

    # run-specific charts
    run_log = get_runs(df)
    plot_runs_hist(run_log)
    plot_runs_stats(run_log)

    plt.show()
|
|
|
|
|
JOURNAL_FILE = '~/GOOGLEDRIVE/Apps/journal_sport.txt'


def main(argv=None):
    """Command-line entry point: ``[year YYYY | days DD | all]``.

    ``year YYYY`` plots the given calendar year, ``days DD`` the last DD
    days counting back from now, and ``all`` the whole journal.  A missing
    command, an unknown command, or a missing or non-integer value prints
    the usage line and exits with status 1.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector including the program name; defaults to
        ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    prog = argv[0] if argv else ''

    def usage():
        print('Usage: python', prog, '[year YYYY | days DD | all]')
        sys.exit(1)

    if len(argv) < 2 or argv[1] not in ('year', 'days', 'all'):
        usage()
    mode = argv[1]

    # validate the numeric argument before the journal is read
    value = None
    if mode != 'all':
        try:
            value = int(argv[2])
        except (IndexError, ValueError):
            usage()

    df, cdict = read_jrnl_sport(JOURNAL_FILE)
    if mode == 'year':
        df = select_period(df, year=value)
    elif mode == 'days':
        df = select_period(df, days=value)
    make_plots(df, cdict)


if __name__ == '__main__':
    main()
Examples of output plots
Pie chart with all activities in selected period:
Histogram of running distances:
Duration, distance and speed of runs; the grey markers on the duration and distance plots mark weekly totals, while the continuous red line on the speed plot is a running average over a period of a week:
Distances are in kilometers and speeds are in minutes per kilometer because that's what I'm used to.