Last active
December 12, 2017 21:40
-
-
Save TheDataLeek/c5aff9ca5ee2a8db61fe8dbc5fca95c9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import os | |
import argparse | |
import re | |
from pprint import pprint as pp | |
import pytz | |
import dateutil | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import scipy.io as sc_io | |
import tqdm | |
# Folder (relative to the current working directory) that receives every
# generated image; see getpath().
IMG_FOLDER = './img'
# Default user name for the per-user plots.
ZOE = '@SpatulaFish#8544'

# Make sure the output folder exists before anything tries to save into it.
# exist_ok=True replaces the racy "check, then create" sequence.
os.makedirs(IMG_FOLDER, exist_ok=True)

# Weekday names in pandas' ``dayofweek`` order (Monday == 0 ... Sunday == 6).
days_of_week = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
def getpath(filename):
    """Return the path of ``filename`` placed inside ``IMG_FOLDER``."""
    target = os.path.join(IMG_FOLDER, filename)
    return target
def main():
    """Load the logs named on the command line and render every plot.

    The aggregate plots (message counts, activity over time) are produced
    first, then a time-of-day and a time-of-week plot for each distinct user.
    Returns ``None``, so ``sys.exit(main())`` exits with status 0.
    """
    args = get_args()
    df = get_data(args.directory)
    plot_message_counts(df)
    plot_over_time(df)
    for user in tqdm.tqdm(df['user'].unique()):
        try:
            plot_time_of_day(df, name=user)
            plot_time_of_week(df, name=user)
        except TypeError as e:
            # Best effort: a user whose data cannot be plotted is reported
            # and skipped instead of aborting the whole run.
            print(f'{user}: {e}')
        finally:
            # Two figures are opened per user; release them so they do not
            # accumulate in memory over the whole user list.
            plt.close('all')
def get_data(directory, tz_offset='07:00:00'):
    """Parse every log file below ``directory`` into a DataFrame of messages.

    A log line is expected to look like::

        2016-03-16 21:10:32 #general @SpatulaFish#8544: test

    Text that does not match this shape is ignored.

    Args:
        directory: Root folder; it is walked recursively and every file
            found is read as text.
        tz_offset: Amount subtracted from every logged timestamp; anything
            ``pd.Timedelta`` accepts. Defaults to seven hours.
            NOTE(review): presumably converts the log's clock to local
            time -- confirm against the log source.

    Returns:
        DataFrame with the columns ``timestamp``, ``channel``, ``user``,
        ``text``, ``time_of_day`` (offset since midnight) and
        ``day_of_week`` (Monday == 0).
    """
    num = '[0-9]'
    # Raw string: '\-' is not a valid escape in a normal string literal.
    channel = r'#[a-zA-Z\-]+'
    # Non-greedy so the user name stops at the FIRST '#dddd: ' on the line;
    # a greedy '.+' swallows message text that itself mentions '@X#1234: '.
    user = f'@.+?#{num}{{4}}'
    date = f'{num}{{4}}-{num}{{2}}-{num}{{2}}'
    clock = f'{num}{{2}}:{num}{{2}}:{num}{{2}}'
    message_re = re.compile(f'({date} {clock}) ({channel}) ({user}): (.*)')

    messages = []
    for root, _dirs, files in os.walk(directory):
        for filename in files:
            # Explicit encoding keeps parsing independent of the platform
            # default; undecodable bytes are replaced rather than fatal.
            with open(os.path.join(root, filename), 'r',
                      encoding='utf-8', errors='replace') as fileobj:
                messages.extend(message_re.findall(fileobj.read()))

    df = pd.DataFrame(messages,
                      columns=['timestamp', 'channel', 'user', 'text'])
    df['timestamp'] = pd.to_datetime(df['timestamp']) - pd.Timedelta(tz_offset)
    # Vectorised equivalents of the per-row "timestamp - midnight" and
    # "dayofweek" computations.
    df['time_of_day'] = df['timestamp'] - df['timestamp'].dt.normalize()
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    # Print the distinct user names that were found.
    pp(sorted(df.user.unique()))
    return df
def get_user_counts(df):
    """Return one row per user with its message count, smallest count first.

    The result has the columns ``username`` and ``messagecount``; the count
    is the number of non-null ``channel`` entries for that user.
    """
    per_user = df[['user', 'channel']].groupby('user').count().reset_index()
    per_user = per_user.rename(
        columns={'user': 'username', 'channel': 'messagecount'}
    )
    return per_user.sort_values('messagecount')
def plot_message_counts(df, min_messages=10):
    """Save horizontal bar charts of the message count per user.

    Two images are written through getpath(): ``messagecount.png`` with the
    raw counts and ``log(messagecount).png`` with their base-10 logarithm.

    Args:
        df: Message frame as returned by get_data().
        min_messages: Only users with strictly more messages than this are
            charted. Defaults to 10.
    """
    user_counts = get_user_counts(df)
    # .copy() so the new column is added to an independent frame rather
    # than to a slice of the unfiltered counts (SettingWithCopyWarning).
    user_counts = user_counts[user_counts.messagecount > min_messages].copy()
    if user_counts.empty:
        # DataFrame.plot raises on an empty frame; skip instead of crashing.
        print(f'no user has more than {min_messages} messages; '
              'skipping message count plots')
        return
    user_counts['log(messagecount)'] = np.log10(user_counts.messagecount.values)
    for column in ('messagecount', 'log(messagecount)'):
        ax = user_counts.plot(x='username',
                              y=column,
                              kind='barh',
                              figsize=(8, 8),
                              title=f'{column} per user')
        try:
            plt.tight_layout()
            plt.savefig(getpath(f'{column}.png'))
        finally:
            # Release the figure once it is on disk.
            plt.close(ax.figure)
def _weekly_counts(df):
    """Return a flat frame of message counts per (week, user)."""
    overtime = (
        df[['user', 'timestamp', 'channel']]
        .groupby([pd.Grouper(key='timestamp', freq='1W'), 'user'])
        .count()
        .reset_index()
    )
    overtime.columns = ['timestamp', 'username', 'count']
    return overtime


def _plot_timeline(overtime, users, fft_size):
    """Save 'overtime.png' and return each user's FFT magnitude spectrum."""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    freqs = []
    for user in users:
        user_df = overtime[overtime.username == user]
        user_df.plot(x='timestamp', y='count', kind='line', ax=ax)
        # np.fft.fft pads or truncates the weekly series to fft_size points.
        freqs.append(np.abs(np.fft.fft(user_df['count'].values, fft_size)))
    plt.tight_layout()
    plt.ylabel('Messages per Week')
    plt.title('Most Frequent Users Posting Timeline')
    plt.legend(users, loc=0)
    plt.savefig(getpath('overtime.png'))
    plt.close(fig)
    return freqs


def _weekly_matrix(overtime, users):
    """Return a week-by-user table of counts, 0 where a user was silent."""
    # One pivot replaces a scan of the whole frame per (week, user) pair.
    # The rows are all weeks present in ``overtime``, in ascending order.
    return (
        overtime.pivot(index='timestamp', columns='username', values='count')
        .reindex(columns=users)
        .fillna(0)
    )


def _plot_pairwise(weekly, users):
    """Save 'scatter.png': a users-by-users grid of pairwise comparisons."""
    size = len(users)
    # squeeze=False keeps ``axarr`` two-dimensional even for a single user,
    # so ``axarr[i, j]`` is always valid.
    fig, axarr = plt.subplots(size, size, figsize=(16, 16), squeeze=False)
    # Each user's weekly series divided by its standard deviation.
    scaled = {user: weekly[user].values / weekly[user].std() for user in users}
    for i, user1 in enumerate(users):
        for j, user2 in enumerate(users):
            ax = axarr[i, j]
            if i >= j:
                # Diagonal and lower triangle: full cross-correlation.
                corr = np.correlate(scaled[user1], scaled[user2], mode='full')
                corr /= len(corr)
                # NOTE(review): the intent of this constant shift is not
                # evident from the code; kept unchanged.
                corr -= 1
                ax.plot(np.arange(len(corr)), corr, label=f'{user1}\n{user2}')
                # Reference line at zero.
                ax.plot(np.linspace(0, len(corr), 10), np.zeros(10), 'k-', alpha=0.5)
                ax.set_ylim((-1, 1))
                ax.set_xlim((0, len(corr)))
                ax.tick_params(axis='both', which='both', direction='in')
            else:
                # Upper triangle: both raw weekly series overlaid.
                for user in (user1, user2):
                    series = weekly[user].values
                    ax.plot(np.arange(len(series)), series, label=user)
            ax.legend(loc=0)
    plt.tight_layout()
    plt.savefig(getpath('scatter.png'))
    plt.close(fig)


def _plot_frequency(users, freqs, fft_size):
    """Save 'frequency.png' with each user's scaled FFT magnitudes."""
    fig = plt.figure(figsize=(8, 8))
    for name, freq in zip(users, freqs):
        plt.plot(range(len(freq)), freq / fft_size, label=name)
    plt.xlabel('Normalized Frequency')
    plt.ylabel('FFT Values')
    plt.legend(loc=0)
    plt.savefig(getpath('frequency.png'))
    plt.close(fig)


def plot_over_time(df, min_messages=10000, fft_size=2048):
    """Plot weekly activity of the most active users.

    Writes three images through getpath():

    * ``overtime.png``  -- messages per week, one line per user;
    * ``scatter.png``   -- grid comparing every pair of users
      (cross-correlation on and below the diagonal, raw series above it);
    * ``frequency.png`` -- FFT magnitude of each user's weekly series.

    Args:
        df: Message frame as returned by get_data().
        min_messages: Only users with strictly more messages than this are
            included. Defaults to 10000.
        fft_size: Number of FFT points. Defaults to 2048.
    """
    # NOTE: an earlier, disabled variant of this function also exported
    # hourly per-user counts to a .mat file via scipy.io.savemat; that dead
    # code was removed from here.
    overtime = _weekly_counts(df)
    user_counts = get_user_counts(df)
    most_frequent = sorted(
        user_counts[user_counts.messagecount > min_messages]['username'].values
    )
    if not most_frequent:
        # plt.subplots(0, 0) raises; with nobody to plot there is nothing
        # meaningful to save.
        print(f'no user has more than {min_messages} messages; '
              'skipping over-time plots')
        return

    freqs = _plot_timeline(overtime, most_frequent, fft_size)
    weekly = _weekly_matrix(overtime, most_frequent)
    _plot_pairwise(weekly, most_frequent)
    _plot_frequency(most_frequent, freqs, fft_size)
def plot_time_of_day(df, name=ZOE):
    """Save a bar chart of ``name``'s message counts per 30-minute slot.

    The image is written through getpath() as ``<name>_tod.png``.

    Args:
        df: Message frame as returned by get_data().
        name: User to plot; defaults to ``ZOE``.
    """
    # Grouping happens on the full frame before filtering by user, exactly
    # as before, so the slot boundaries do not depend on the selected user.
    # '30min' is the non-deprecated spelling of the '30T' alias.
    tod = (
        df[['user', 'time_of_day', 'channel']]
        .groupby(['user', pd.Grouper(key='time_of_day', freq='30min')])
        .count()
        .reset_index()
    )
    tod.columns = ['user', 'tod', 'count']
    tod = tod[tod.user == name].sort_values('tod')
    ax = tod.plot(
        x='tod',
        y='count',
        kind='bar'
    )
    try:
        plt.title(f'{name}\'s Day')
        plt.tight_layout()
        plt.savefig(getpath(f'{name}_tod.png'))
    finally:
        # Release the figure; this function is called once per user.
        plt.close(ax.figure)
def plot_time_of_week(df, name=ZOE):
    """Save seven stacked bar charts, one per weekday, for ``name``.

    Each panel shows the message count per 30-minute slot on that weekday;
    all panels share the same y-range. The image is written through
    getpath() as ``<name>_tow.png``.

    Args:
        df: Message frame as returned by get_data().
        name: User to plot; defaults to ``ZOE``.

    Raises:
        ValueError: If ``df`` holds no messages for ``name``.
    """
    # '30min' is the non-deprecated spelling of the '30T' alias.
    tod = (
        df[['user', 'time_of_day', 'day_of_week', 'channel']]
        .groupby(['user', 'day_of_week',
                  pd.Grouper(key='time_of_day', freq='30min')])
        .count()
        .reset_index()
    )
    tod.columns = ['user', 'dow', 'tod', 'count']
    tod = tod[tod.user == name].sort_values(['dow', 'tod'])
    if tod.empty:
        # Same exception type the max() of an empty array would raise,
        # with a message that names the actual problem.
        raise ValueError(f'no messages found for {name}')
    maxval = tod['count'].values.max()
    fig, axarr = plt.subplots(7, 1, figsize=(8, 8))
    try:
        for i, day_name in enumerate(days_of_week):
            ax = axarr[i]
            day = tod[tod.dow == i]
            # DataFrame.plot raises TypeError on an empty frame, which used
            # to discard the whole week for anyone silent on one weekday.
            # Such a day now simply gets an empty panel.
            if not day.empty:
                day.plot(
                    x='tod',
                    y='count',
                    kind='bar',
                    ax=ax,
                    legend=False
                )
            ax.xaxis.set_visible(False)
            ax.set_title(f'{day_name}')
            ax.set_ylim([0, maxval])
        plt.suptitle(f'{name}\'s Week')
        plt.tight_layout()
        plt.savefig(getpath(f'{name}_tow.png'))
    finally:
        # Release the figure even when plotting fails part-way.
        plt.close(fig)
def get_args():
    """Parse the command line and return the resulting namespace.

    The only option is ``-d`` / ``--directory``, the folder holding the log
    files, which defaults to ``./cortex``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-d', '--directory',
        type=str,
        default='./cortex',
        help='Directory with logfiles',
    )
    return cli.parse_args()
# Script entry point: run main() and hand its return value to sys.exit().
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment