Created
March 6, 2019 21:23
-
-
Save not7cd/99ed913b419c461a9d05e20205ae73dc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding: utf-8
"""
Messenger history analyzer and plotter

Short script to analyze past messages and create stackplot from them over time

USAGE
1. Download facebook data in json format
2. `cd` to facebook-your-name/messages
3. Run script here
"""
import glob
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters

# Register pandas datetime converters so matplotlib can plot a DatetimeIndex.
register_matplotlib_converters()
def aggregate_msg_count(senders, participants):
    """Count "Generic" messages per participant per day.

    Parameters
    ----------
    senders : pd.DataFrame
        Message rows indexed by a DatetimeIndex, with at least
        "sender_name" and "type" columns.
    participants : list of str
        Sender names to aggregate; one output column per name.

    Returns
    -------
    pd.DataFrame
        Daily message counts, one column per participant (NaN where a
        participant's date range does not cover a given day).
    """
    per_person = []
    for name in participants:
        mask = (senders["sender_name"] == name) & (senders["type"] == "Generic")
        # .size() per daily bin counts the rows directly — the original
        # replaced each name with 1 and summed, via inplace ops on a slice
        # (SettingWithCopyWarning; a no-op under pandas copy-on-write).
        daily = (
            senders.loc[mask, "sender_name"]
            .groupby(pd.Grouper(freq="D"))
            .size()
            .rename(name)
        )
        per_person.append(daily)
    return pd.concat(per_person, axis=1)
def aggregate_word_count(senders, participants):
    """Sum words of "Generic" messages per participant per day.

    Parameters
    ----------
    senders : pd.DataFrame
        Message rows indexed by a DatetimeIndex, with "sender_name",
        "type" and "content" columns. Not modified (the original mutated
        a slice of the caller's frame).
    participants : list of str
        Sender names to aggregate; one output column per name.

    Returns
    -------
    pd.DataFrame
        Daily word counts, one column per participant.
    """
    per_person = []
    for name in participants:
        mask = (senders["sender_name"] == name) & (senders["type"] == "Generic")
        # Missing content (NaN/None) counts as 0 words — the original's
        # str(s).split(" ") turned NaN into the 1-word string "nan".
        # split() without an argument also ignores runs of whitespace.
        words = senders.loc[mask, "content"].apply(
            lambda text: 0 if pd.isna(text) else len(str(text).split())
        )
        daily = words.groupby(pd.Grouper(freq="D")).sum().rename(name)
        per_person.append(daily)
    return pd.concat(per_person, axis=1)
def extract_msg_count(msgs, ignore=("Your Name",)):
    """Turn one parsed message.json into a per-day word-count table.

    Parameters
    ----------
    msgs : dict
        Parsed Facebook export with "messages" and "participants" keys.
    ignore : collection of str
        Sender names to drop (e.g. yourself). Tuple default instead of
        the original mutable list default.

    Returns
    -------
    pd.DataFrame
        Daily word counts per remaining participant.

    Raises
    ------
    ValueError
        If more than 3 participants remain — group chats are skipped
        (caller catches and continues).
    """
    frame = pd.DataFrame(msgs["messages"])
    # .copy() so assigning the index does not warn about writing to a view.
    relevant = frame[["timestamp_ms", "sender_name", "content", "type"]].copy()
    relevant.index = pd.to_datetime(relevant["timestamp_ms"].to_numpy(), unit="ms")
    participants = [m["name"] for m in msgs["participants"] if m["name"] not in ignore]
    if len(participants) > 3:
        print(len(participants))
        raise ValueError("too big xd")
    return aggregate_word_count(relevant, participants)
def aggregate(df, top=12, freq="M"):
    """Resample to *freq* and keep the *top* busiest columns.

    Columns are ordered least-busy first so the busiest series sits on
    top of the stackplot.
    """
    by_period = df.groupby(pd.Grouper(freq=freq)).sum()
    # Rank columns by their grand total, descending, then keep the head.
    ranked = by_period.sum().sort_values(ascending=False).head(top)
    # Reversing puts the quietest of the survivors first.
    return by_period.reindex(columns=ranked.index[::-1])
def stackplot_messages(df, legend=False):
    """Draw a streamgraph-style stackplot of the per-day table.

    Parameters
    ----------
    df : pd.DataFrame
        DatetimeIndex rows, one column per person. Resampled to daily
        frequency and pchip-interpolated for smooth bands.
    legend : bool
        Show a column legend in the upper left.

    Shows the figure via plt.show(); returns nothing.
    """
    df = df.resample("D").interpolate(method="pchip")
    plt.style.use("default")
    fig, ax = plt.subplots(figsize=(20, 6))
    # One colour per plotted column — the original hard-coded 12 entries
    # (and built unused x/phase_shift arrays left over from an example).
    n_series = len(df.columns)
    ax.set_prop_cycle("color", [plt.cm.summer(i) for i in np.linspace(0, 1, n_series)])
    # baseline="wiggle" centres the stack, giving the streamgraph look.
    ax.stackplot(df.index.values, df.T, baseline="wiggle", labels=df.columns.values)
    for side in ax.spines:
        ax.spines[side].set_visible(False)
    ax.yaxis.set_visible(False)
    if legend:
        ax.legend(loc="upper left")
    plt.show()
def collect_messages(files):
    """Load every message.json and merge the per-day tables into one.

    Chats that fail to parse or aggregate are reported and skipped
    (best-effort: a single bad export must not abort the whole run).
    Returns the merged DataFrame, or None if no file produced data.
    """
    merged = None
    for path in files:
        with open(path) as handle:
            payload = json.load(handle)
        print(path)
        try:
            counts = extract_msg_count(payload)
        except Exception as err:
            print(err, path)
            continue
        if merged is None:
            merged = counts
            continue
        try:
            # Stack the two tables, zero-fill missing person/day cells,
            # then collapse duplicate dates by summing.
            stacked = pd.concat([merged, counts], axis=0, sort=True).fillna(0)
            merged = stacked.groupby(stacked.index).sum()
        except Exception as err:
            print(err, counts)
    return merged
def main():
    """Find all exported message.json files, aggregate, and plot."""
    message_files = glob.glob("**/message.json", recursive=True)
    history = collect_messages(message_files)
    top_people = aggregate(history)
    stackplot_messages(top_people)
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment