Last active
January 2, 2016 22:37
-
-
Save hufman/17b96b58cb78d416ee81 to your computer and use it in GitHub Desktop.
Animal Crossing Wikia Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
You'll need to `pip install mwparserfromhell` to run this code. Tested in Python 2.7 and 3.2.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import codecs | |
import json | |
import re | |
import sys | |
import mwparserfromhell | |
import logging | |
# Compatibility shim: bind urlquote, urlopen and basestring to the same
# names whichever major Python version is running.
if sys.version_info[0] != 3:
    # Any interpreter whose major version is not 3 takes this branch.
    from urllib import quote as urlquote
    from urllib import urlopen
    basestring = basestring
else:
    from urllib.parse import quote as urlquote
    from urllib.request import urlopen
    basestring = str
def urlquote_titles(titles):
    """Percent-encode one title, or several titles joined by '|'.

    A single string is quoted as a whole. Any other iterable is quoted
    item by item and the quoted items are joined with a literal '|'.
    """
    if not isinstance(titles, basestring):
        # several titles: quote each one, keep the separator unquoted
        return '|'.join(map(urlquote, titles))
    return urlquote(titles)
# Endpoint that every query URL below is built on.
API_BASE = "http://animalcrossing.wikia.com/api.php"


def CATEGORY_MEMBERS_URL(category):
    """Return the query URL listing the pages of ``Category:<category>``."""
    quoted = urlquote(category)
    return "%s?action=query&list=categorymembers&format=json&cmtype=page&cmlimit=max&cmtitle=Category:%s" % (API_BASE, quoted)


def INFOBOX_URL(titles):
    """Return the query URL for the section-0 revision content of *titles*."""
    quoted = urlquote_titles(titles)
    return "%s?action=query&prop=revisions&rvprop=content&rvsection=0&format=json&titles=%s" % (API_BASE, quoted,)


def ARTICLE_CATEGORIES_URL(titles):
    """Return the query URL for the categories of *titles*."""
    quoted = urlquote_titles(titles)
    return "%s?action=query&prop=categories&format=json&titles=%s" % (API_BASE, quoted,)
def category_members(category):
    """Return every page entry the API lists for ``Category:<category>``.

    Follows the API's continuation cookie until no further page of results
    is announced, so categories larger than one response are returned in
    full.

    :param category: category name without the ``Category:`` prefix
    :returns: list of the ``categorymembers`` entries from each response
    :raises KeyError: if a response lacks ``query``/``categorymembers``
    """
    # urllib.urlopen() on Python 2 returns an object without __exit__, so
    # wrap the response in closing() instead of using it directly in `with`.
    from contextlib import closing

    base_url = CATEGORY_MEMBERS_URL(category)
    utf8reader = codecs.getreader('utf-8')
    members = []
    url = base_url
    while url:
        with closing(urlopen(url)) as reader:
            data = json.load(utf8reader(reader))
        members.extend(data['query']['categorymembers'])
        if 'query-continue' in data:
            # legacy continuation envelope
            cookie = data['query-continue']['categorymembers']['cmcontinue']
        else:
            # newer envelope: a top-level "continue" object; absent when the
            # listing is complete
            cookie = data.get('continue', {}).get('cmcontinue')
        if cookie:
            # continue the *category* query (not the infobox query) and pass
            # the cookie as a regular key=value parameter
            url = "%s&cmcontinue=%s" % (base_url, urlquote(cookie))
        else:
            url = None
    return members
def ripout_infobox(firstsection):
    """Extract the raw infobox template text from a page's first section.

    Only top-level templates are inspected. The text of the first one whose
    name contains 'Infobox' is returned; '' is returned if none qualifies.
    """
    top_level = mwparserfromhell.parse(firstsection).ifilter_templates(recursive=False)
    matching = (str(candidate) for candidate in top_level if 'Infobox' in candidate.name)
    return next(matching, '')
def fetch_infobox(titles):
    """Return the raw infobox data for the given titles.

    :param titles: one title string, or an iterable of title strings
    :returns: dict keyed by page id; each value holds ``pageid``, ``ns``,
        ``title`` and ``data`` (the infobox wikitext, '' if the page has
        no infobox)

    Pages the API reports without any revision (for example missing pages)
    are skipped with a warning instead of aborting the whole fetch.
    """
    # urllib.urlopen() on Python 2 returns an object without __exit__, so
    # wrap the response in closing() instead of using it directly in `with`.
    from contextlib import closing

    # The MediaWiki API caps the number of titles per request (50 for
    # ordinary clients), so longer lists are split into several requests.
    batch_size = 50
    if isinstance(titles, basestring):
        batches = [titles]
    else:
        titles = list(titles)
        batches = [titles[i:i + batch_size]
                   for i in range(0, len(titles), batch_size)]

    utf8reader = codecs.getreader('utf-8')
    fetched = {}
    for batch in batches:
        with closing(urlopen(INFOBOX_URL(batch))) as reader:
            response = json.load(utf8reader(reader))
        for pageid, page in response['query']['pages'].items():
            revisions = page.get('revisions')
            if not revisions:
                logging.warning("No revision content for page %r",
                                page.get('title', pageid))
                continue
            revision = revisions[0]
            if '*' in revision:
                # legacy JSON format stores the wikitext under the '*' key
                content = revision['*']
            else:
                # fall back to the original behaviour: first value found
                content = list(revision.values())[0]
            fetched[pageid] = {
                'pageid': page['pageid'],
                'ns': page['ns'],
                'title': page['title'],
                'data': ripout_infobox(content)
            }
    return fetched
def parse_infobox(infobox_data):
    """Build an Infobox (or a more specific subclass) from fetched page data.

    :param infobox_data: dict with ``pageid``, ``title`` and ``data`` keys,
        where ``data`` is the infobox wikitext
    :returns: an instance of the Infobox subclass whose name equals the
        template name with spaces removed, if this module defines one;
        otherwise a plain Infobox
    :raises ValueError: if ``data`` does not start with an Infobox template
    """
    code = mwparserfromhell.parse(infobox_data['data'])
    templates = code.filter_templates()
    if not templates or 'Infobox' not in templates[0].name:
        raise ValueError("Could not find Infobox")
    # Convert the name to a plain string before editing it:
    # Wikicode.replace() is a node-replacement method that does not return
    # the modified text, so it cannot stand in for str.replace().
    infobox_classname = str(templates[0].name).strip().replace(' ', '')
    candidate = globals().get(infobox_classname)
    # Only accept real Infobox classes, not any global whose name happens
    # to match the template name.
    if isinstance(candidate, type) and issubclass(candidate, Infobox):
        infobox_class = candidate
    else:
        infobox_class = Infobox
    return infobox_class(infobox_data['pageid'], infobox_data['title'], infobox_data['data'])
class GameSpecificValue(object):
    """A value that may differ per game.

    ``values`` maps a game key to the value for that game; the key ''
    holds the value used when a game has no entry of its own.
    """

    # Game used for display by instances created from now on.
    DEFAULT_GAME = 'NL'

    def __init__(self, values):
        """Store *values* and snapshot the current DEFAULT_GAME."""
        self.values = values
        self.display_game = GameSpecificValue.DEFAULT_GAME

    def games(self):
        """Return the keys of ``values`` as a list."""
        return list(self.values.keys())

    def as_game(self, game=None):
        """Return the value for *game* (default: ``display_game``).

        Falls back to the '' entry, and to None when that is absent too.
        """
        if game is None:
            game = self.display_game
        return self.values.get(game, self.values.get(''))

    def __str__(self):
        """Return the display value as text ('' when there is none)."""
        value = self.as_game()
        # __str__ must return a string; returning None raises TypeError.
        if value is None:
            return ''
        return str(value)

    def __repr__(self):
        return "<GSV: %r>" % (self.values,)
class Infobox(object):
    """Parsed form of one infobox template.

    ``params_mw`` keeps the raw parsed value of every ``key=value``
    parameter; ``params`` keeps the converted value produced by the
    matching ``_parse_<name>`` method, or by ``_parse_default`` when no
    such method exists.
    """

    # Game codes used to expand {{since|x}} / {{until|x}} into ranges.
    _GAME_ORDER = ['af', 'af+', 'ac', 'afe+', 'ww', 'cf', 'nl']

    # Patterns used by _parse_categorized_value, compiled once.
    _TAG_RE = re.compile(r'<[^>]*?>')
    _BRACE_RE = re.compile(r'[{}]')
    _BRACKET_RE = re.compile(r'[\[\]]')
    _SEGMENT_RE = re.compile(r'\s*((?:[0-9,]+ [^(,]*)|[^(,]*)\s*(?:\(([^)]*)\))?(?:[\s,])*')

    def __init__(self, pageid, title, data):
        """Parse *data* (infobox wikitext) and convert its parameters.

        Raises IndexError if *data* contains no template.
        """
        self.pageid = pageid
        self.title = title
        self.data = data
        code = mwparserfromhell.parse(data)
        self.code = code.filter_templates()[0]
        self.params_mw = {}
        self.params = {}
        self._parse()

    def to_dict(self):
        """Return a shallow copy of the converted parameters."""
        return dict(self.params)

    def _parse(self):
        """Collect the keyed parameters, then convert each of them."""
        for param in self.code.params:
            if param.showkey:   # is a key=value param
                name = str(param.name).strip()
                self.params_mw[name] = param.value
        # NOTE(review): no _parse_appearance method exists (only
        # _parse_appearances), so this pre-pass uses _parse_default when an
        # 'appearance' parameter is present -- confirm the intended name.
        self._parse_param('appearance')
        self._parse_params()

    def _parse_params(self):
        """Convert every collected parameter."""
        for name in self.params_mw:
            self._parse_param(name)

    def _parse_param(self, name):
        """Convert parameter *name* and store the result in ``params``.

        Does nothing when the parameter is missing or empty.
        """
        value = self.params_mw.get(name)
        if not value:
            # somehow could not load up this name
            return
        parser = getattr(self, '_parse_%s' % (name,), None)
        if parser is None:
            parser = self._parse_default
        self.params[name] = parser(value)

    def _parse_default(self, value):
        """Return the value with wiki markup and outer whitespace removed."""
        return value.strip_code().strip()

    def _parse_appearances(self, value):
        """Return the list of game codes named by the templates in *value*.

        A lone {{since|x}} or {{until|x}} expands to the matching slice of
        the known game order; otherwise each template name is one game.
        Raises ValueError when the since/until argument is not a known game.
        """
        order = self._GAME_ORDER
        templates = value.filter_templates()
        if len(templates) == 1 and templates[0].name.strip() in ('since', 'until'):
            # The order list is lowercase, so normalise the argument before
            # looking it up.
            game = templates[0].params[0].value.strip().lower()
            index = order.index(game)
            if templates[0].name.strip() == 'since':
                return order[index:]
            return order[:index + 1]
        appearances = []
        for template in templates:
            if len(template.params) == 0:
                appearances.append(template.name.lower())
            elif len(template.params) == 1 and \
                    template.params[0].value == 'shortest':
                appearances.append(template.name.lower())
            else:
                logging.warning("Unknown Appearance: %s", template)
                appearances.append(str(value))
        return appearances

    def _parse_nameother(self, value):
        """Return a dict mapping each language template to the text after it."""
        names = {}
        lang = None
        just_saw_lang = False
        for node in value.nodes:
            if isinstance(node, mwparserfromhell.nodes.template.Template):
                lang = node.name.strip_code()
                just_saw_lang = True
                continue
            if just_saw_lang:
                # NOTE(review): assumes the node after a language template
                # exposes .value (a text node) -- confirm for other nodes.
                names[lang] = node.value.strip()
                just_saw_lang = False
        return names

    def _parse_gamespecific_value(self, value):
        """Return the categorized value wrapped in a GameSpecificValue."""
        values = self._parse_categorized_value(value)
        return GameSpecificValue(values)

    def _parse_categorized_value(self, value):
        """Split 'A (x, y), B' style text into ``{category: value}``.

        Tags, braces and square brackets are removed first. A segment
        without a parenthesised category list is stored under ''; when a
        category occurs more than once, the last segment wins.
        """
        categorized = {}
        text = self._TAG_RE.sub('', str(value))
        text = self._BRACE_RE.sub('', text)
        text = self._BRACKET_RE.sub('', text)
        for segment in self._SEGMENT_RE.finditer(text):
            curvalue, categories = segment.groups()
            if curvalue is None or curvalue.strip() == '':
                continue
            curvalue = curvalue.strip()
            if categories is not None:
                for category in categories.split(','):
                    categorized[category.strip()] = curvalue
            else:
                categorized[''] = curvalue
        return categorized

    # Parameters that share one of the generic converters above.
    _parse_location = _parse_gamespecific_value
    _parse_price = _parse_gamespecific_value
    _parse_shadow = _parse_gamespecific_value
    _parse_size = _parse_gamespecific_value
    _parse_timeday = _parse_categorized_value
def main():
    """Print the converted infobox of every page in the Fish category.

    Stops at the first page that fails to parse: the error is printed
    together with the page title and then re-raised.
    """
    allfish = category_members('Fish')
    # The category-wide result used to be overwritten by a single-page debug
    # fetch immediately afterwards; it is now what actually gets processed.
    data = fetch_infobox([f['title'] for f in allfish])
    for pagedata in data.values():
        try:
            infobox = parse_infobox(pagedata)
        except Exception as e:
            print("Error parsing %s: %s" % (pagedata['title'], e))
            raise
        print("%s" % (infobox.to_dict(), ))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import animalcrossing | |
class TestParsingParams(unittest.TestCase):
    """Checks for Infobox._parse_categorized_value on plain strings."""

    def setUp(self):
        self.instance = animalcrossing.Infobox('0', 'Test', '{{Test}}')

    def _check(self, raw, expected):
        # Parse *raw* and compare the resulting mapping with *expected*.
        parsed = self.instance._parse_categorized_value(raw)
        self.assertEqual(parsed, expected)

    def testSingleValue(self):
        self._check('Large', {'': 'Large'})

    def testDoubleValue(self):
        # two uncategorized values: the later one is kept
        self._check('Large, Small', {'': 'Small'})

    def testSingleParensValue(self):
        self._check('Large (One)', {'One': 'Large'})

    def testDoubleParensValue(self):
        self._check('Large (One, Two)', {'One': 'Large', 'Two': 'Large'})

    def testTwoSingleParensValue(self):
        self._check('Small (One), Large (Two)', {'One': 'Small', 'Two': 'Large'})

    def testTwoSingleExtraParensValue(self):
        self._check('Small (One), Large (Two, Three)',
                    {'One': 'Small', 'Two': 'Large', 'Three': 'Large'})

    def testSingleTwoSingleExtraParensValue(self):
        self._check('Bonus, Small (One), Large (Two, Three)',
                    {'': 'Bonus', 'One': 'Small', 'Two': 'Large', 'Three': 'Large'})

    def testBells(self):
        # thousands separators must not split the value
        self._check('10,000 Bells, 8,000 Bells (WW)',
                    {'': '10,000 Bells', 'WW': '8,000 Bells'})
class TestGSV(unittest.TestCase):
    """Checks for GameSpecificValue lookup and default-game handling."""

    def setUp(self):
        # Several tests overwrite the class-level default; remember it so
        # the change cannot leak into other tests.
        self._saved_default = animalcrossing.GameSpecificValue.DEFAULT_GAME

    def tearDown(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = self._saved_default

    def testEmptyValue(self):
        gsv = animalcrossing.GameSpecificValue({})
        self.assertEqual([], gsv.games())
        self.assertEqual(None, gsv.as_game())

    def testAllValue(self):
        gsv = animalcrossing.GameSpecificValue({'':'Yes'})
        self.assertEqual([''], gsv.games())
        self.assertEqual('Yes', gsv.as_game())

    def testTwoValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'NL'
        gsv = animalcrossing.GameSpecificValue({'':'Yes', 'WW':'No'})
        # games() follows dict order, which is not defined on every
        # supported interpreter, so compare the sorted keys
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        self.assertEqual('Yes', gsv.as_game())
        self.assertEqual('No', gsv.as_game('WW'))

    def testGameValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'NL'
        gsv = animalcrossing.GameSpecificValue({'':'Yes', 'WW':'No'})
        self.assertEqual('NL', gsv.display_game)
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        gsv.display_game = 'WW'
        self.assertEqual('No', gsv.as_game())

    def testDefaultValue(self):
        animalcrossing.GameSpecificValue.DEFAULT_GAME = 'WW'
        gsv = animalcrossing.GameSpecificValue({'':'Yes', 'WW':'No'})
        self.assertEqual('WW', gsv.display_game)
        self.assertEqual(['', 'WW'], sorted(gsv.games()))
        self.assertEqual('No', gsv.as_game())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment