Created
March 17, 2014 16:36
-
-
Save dperconti/9602981 to your computer and use it in GitHub Desktop.
A tag counting script with comments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ast
import codecs
import json
import sys
from collections import defaultdict
# A tag counting script: reads a file with one JSON object per line and prints
# how many times each tag occurs.
#
# Only constructs available from Python 2.6 onwards are used (the original
# targeted 2.6), and nothing here is Python-2-only, so it also runs on 3.x.


def parse_tags(line):
    """Return the list of tags found on one line of the input file.

    ``line`` is expected to hold a single JSON object, optionally surrounded
    by whitespace and followed by a comma.  The object's ``"tags"`` value is
    expected to be a string containing a Python literal such as
    ``"['a', 'b']"`` or ``"None"`` (per the original author's note, this is
    how the upstream "gimme" tool writes it -- TODO confirm).

    An empty list is returned whenever the line cannot be used: it is not
    valid JSON, it is not a JSON object, it has no tags, or the tags text is
    not a list/tuple/set literal.
    """
    # Stripping the whitespace and the trailing comma turns each row into a
    # standalone JSON document.
    try:
        device_info = json.loads(line.strip().strip(','))
    except ValueError:
        # Lines such as the opening/closing brackets are not "proper" JSON
        # on their own; skip them.  Catching only ValueError keeps the try
        # block from hiding unrelated bugs.
        return []

    # A bare number or string is valid JSON but carries no tags.
    if not isinstance(device_info, dict):
        return []

    raw_tags = device_info.get("tags")
    if not raw_tags:
        return []

    # SECURITY: this used to be eval(), which would execute arbitrary code
    # found in the input file.  ast.literal_eval only accepts literals, so a
    # hostile or corrupt value is rejected instead of being run.
    try:
        tags = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError, TypeError):
        return []

    # tags could be None (or some other non-collection literal).
    if not isinstance(tags, (list, tuple, set, frozenset)):
        return []
    return list(tags)


def count_tags(lines):
    """Count tag occurrences over an iterable of input lines.

    ``lines`` may be an open file: it is consumed one line at a time, so the
    data is never loaded into memory all at once.

    Returns a ``defaultdict(int)`` mapping each tag to its count.
    collections.Counter was introduced in 2.7; defaultdict(int) does the same
    job here and is 2.6 compatible.
    """
    tag_counts = defaultdict(int)
    for line in lines:
        for tag in parse_tags(line):
            tag_counts[tag] += 1
    return tag_counts


def write_counts(stream, tag_counts):
    """Write one ``tag -- count`` line per entry of ``tag_counts`` to ``stream``."""
    for tag, count in tag_counts.items():
        stream.write(u"{0} -- {1}\n".format(tag, count))


def save_counts(path, tag_counts):
    """Write the counts to the file at ``path``, encoded as UTF-8.

    This is the "if you really want to write to a file" example from the
    original script.  It is not called by main(); redirecting stdout
    (``python my_script.py input.txt > out.txt``) gives more or less the same
    result with more flexibility.
    """
    # codecs.open takes care of the encoding, so non-ASCII tags are safe.
    with codecs.open(path, "w", "utf8") as f:
        write_counts(f, tag_counts)


def main(argv=None):
    """Count the tags in the file named by ``argv[1]`` and print them.

    ``argv`` defaults to ``sys.argv``.  Taking the input path from the command
    line lets the script be used on other files without modification.

    Returns the process exit status: 0 on success, 2 when no input file was
    given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "script"
        sys.stderr.write("usage: {0} INPUT_FILE\n".format(prog))
        return 2

    # On Python 2 this fixes unicode printing woes by encoding everything
    # written to stdout as UTF-8.  Python 3's stdout is already a text stream
    # and would break if wrapped in a byte-producing writer.
    if sys.version_info[0] < 3:
        sys.stdout = codecs.getwriter("utf8")(sys.stdout)

    with open(argv[1]) as f:
        tag_counts = count_tags(f)

    write_counts(sys.stdout, tag_counts)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment