Last active
January 29, 2016 11:29
-
-
Save altaurog/ed3a9583c17ac737c758 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Performance benchmark: pandas DataFrame.pivot against crosstab | |
""" | |
from datetime import datetime, timedelta | |
import io | |
import random | |
import sys | |
import time | |
import pandas as pd | |
import psycopg2 | |
CLIENT_COUNT = 80000 | |
DATE_COUNT = 120 | |
def main(dbstr): | |
connection = psycopg2.connect(dbstr) | |
dates = generate_dates(datetime(2015, 1, 1), DATE_COUNT) | |
setup_database(connection, dates) | |
print('crosstab: %.1f' % with_crosstab(connection, dates)) | |
connection.rollback() | |
setup_database(connection, dates) | |
print('pandas: %0.1f / %0.1f' % with_pandas(connection)) | |
def with_crosstab(connection, dates): | |
values = ','.join("('%s'::date)" % d for d in dates) | |
columns = ','.join('"day%s" int' % d for d in dates) | |
query = """ | |
SELECT * from crosstab ( | |
'SELECT client, datestamp, count(*) | |
FROM mytable | |
GROUP BY client, datestamp | |
ORDER BY client, datestamp', | |
$$VALUES %s$$) AS ct ("Client" int, %s) | |
""" % (values, columns) | |
cur = connection.cursor() | |
start = time.time() | |
cur.execute(query) | |
data = cur.fetchall() | |
return time.time() - start | |
def with_pandas(connection): | |
query = """ | |
SELECT client, datestamp, count(*) | |
FROM mytable | |
GROUP BY client, datestamp | |
ORDER BY client, datestamp | |
""" | |
a = time.time() | |
cur = connection.cursor() | |
cur.execute(query) | |
data = cur.fetchall() | |
b = time.time() | |
df = pd.DataFrame(data, columns=['client', 'datestamp', 'count']) | |
data = df.pivot(index='client', columns='datestamp', values='count') | |
return b - a, time.time() - b | |
def setup_database(connection, dates): | |
cur = connection.cursor() | |
cur.execute(""" | |
CREATE TEMP TABLE mytable ( | |
client int, | |
datestamp date, | |
amount int | |
) ON COMMIT DROP | |
""") | |
buf = io.StringIO('\n'.join(generate_data(CLIENT_COUNT, dates))) | |
cur.copy_from(buf, 'mytable') | |
cur.execute("CREATE INDEX ON mytable (client, datestamp)") | |
cur.close() | |
def generate_data(num_clients, dates): | |
for i in range(num_clients): | |
for d in dates: | |
for x in range(random.randint(0,10)): | |
yield '%d\t%s\t1' % (i, d) | |
def generate_dates(startdate, count): | |
return [startdate + timedelta(d) for d in range(count)] | |
if __name__ == '__main__': | |
main(sys.argv[1]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment