Last active
December 18, 2015 05:59
-
-
Save kwarrick/5736376 to your computer and use it in GitHub Desktop.
Map-reduce a CSV file using the UNIX sort utility in just 24 lines of code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# kwarrick@uga.edu | |
import csv | |
import subprocess | |
from itertools import groupby | |
def identity(infile, outfile):
    """Pass the CSV rows of *infile* through map_reduce_csv unchanged.

    The mapper emits each input row as-is, rows are grouped on their first
    field, and the reducer emits every row of each group as-is. The rows
    therefore reach *outfile* in whatever order map_reduce_csv produces.

    Args:
        infile: iterable of CSV text lines (e.g. an open file).
        outfile: writable text file object receiving the CSV output.
    """

    def key(row):
        # csv.reader yields an empty list for a blank line; group such rows
        # under the empty string instead of failing with IndexError on row[0].
        return row[0] if row else ''

    def mapper(row):
        yield row

    def reducer(key, values):
        for value in values:
            yield value

    map_reduce_csv(mapper, reducer, key, infile, outfile)
def map_reduce_csv(mapper, reducer, key, infile, outfile):
    """Map-reduce a CSV file using the UNIX sort utility.

    Each row parsed from *infile* is handed to *mapper*; every row it yields
    is written as a CSV line to a ``/usr/bin/sort`` child process. The sorted
    lines are parsed back, grouped with ``itertools.groupby`` using *key*, and
    each group is handed to *reducer*; every row it yields is written to
    *outfile*.

    No ``-k`` option is passed to sort, so whole lines are compared (byte-wise
    under ``LC_ALL=C``). ``groupby`` only merges adjacent rows, so *key* must
    map rows that belong together to lines that sort next to each other
    (a leading-field key satisfies this).

    Args:
        mapper: callable ``mapper(row)`` returning an iterable of rows.
        reducer: callable ``reducer(group_key, rows)`` returning an iterable
            of rows; ``rows`` is a list of the rows in the group.
        key: callable ``key(row)`` returning the grouping key of a sorted row.
        infile: iterable of CSV text lines.
        outfile: writable text file object receiving the reduced rows.

    Raises:
        subprocess.CalledProcessError: if sort exits with a non-zero status.
    """
    command = ['/usr/bin/sort', '-t,']
    sort = subprocess.Popen(
        command,
        env={'LC_ALL': 'C'},
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        # Text-mode pipes: the csv module reads and writes str, while the
        # default binary pipes only accept bytes on Python 3.
        universal_newlines=True,
    )
    try:
        # map and sort
        writer = csv.writer(sort.stdin, quoting=csv.QUOTE_NONE)
        for row in csv.reader(infile):
            writer.writerows(mapper(row))
        # Closing stdin signals end-of-input; sort emits nothing before that.
        sort.stdin.close()

        # group and reduce
        writer = csv.writer(outfile, quoting=csv.QUOTE_NONE)
        for k, v in groupby(csv.reader(sort.stdout), key):
            writer.writerows(reducer(k, list(v)))
    finally:
        # Release both pipes and reap the child even when the mapper, the
        # reducer or the key function raised part-way through.
        try:
            if not sort.stdin.closed:
                sort.stdin.close()
        finally:
            sort.stdout.close()
            returncode = sort.wait()
    if returncode != 0:
        raise subprocess.CalledProcessError(returncode, command)
if __name__ == '__main__':
    # Script entry point: run the identity job over the files named on the
    # command line (or standard input when none are given, per fileinput)
    # and store the result in output.csv in the current directory.
    import fileinput

    source_lines = fileinput.input()
    with open('output.csv', 'w') as sink:
        identity(source_lines, sink)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment