Last active
February 12, 2018 14:15
-
-
Save hiropppe/2c2288c9118f44f7ef0c54680bdbd7c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf8 -*- | |
from __future__ import unicode_literals | |
import bz2 | |
import codecs | |
import mwparserfromhell | |
import multiprocessing | |
import re | |
import sys | |
import unicodedata | |
from functools import partial | |
from gensim.corpora import wikicorpus | |
from itertools import imap | |
from multiprocessing.pool import Pool | |
from tqdm import tqdm | |
# On Python 2 ``chr`` is byte-oriented, so rebind it to the Unicode-aware
# ``unichr``.  On Python 3 ``chr`` already produces Unicode code points, so
# the rebind is skipped (this also makes the module importable on Python 3).
try:
    chr = unichr  # noqa: F821 -- Python 2 only
except NameError:
    pass

# Wikipedia namespaces whose pages are skipped entirely by WikiDumpReader.
DEFAULT_IGNORED_NS = (
    'wikipedia:', 'category:', 'file:', 'portal:', 'template:', 'mediawiki:',
    'user:', 'help:', 'book:', 'draft:'
)
# Lower-cased title prefixes that mark maintenance/meta pages (logs, deletion
# requests, user pages, project pages, indexes, ...) useless as link targets.
IGNORE_PREFIX = ('アップロードログ', '削除記録/', '削除依頼/', '検証/', '進行中の荒らし行為/', '井戸端/', 'wp:',
                 '利用者:', 'user:', 'ウィキプロジェクト', 'pj:', 'メインページ/', '索引 ', '著作権/', 'location map/')
# Lower-cased title suffixes: media files, site scripts, list pages, deletions.
IGNORE_SUFFIX = ('.jpg', '.jpeg', '.png', '.gif', '.svg', '.js', '.css', 'の一覧', '/削除')
# Substrings anywhere in a lower-cased title that flag non-article pages.
IGNORE_SUBSTR = ('wikipedia:', 'template:', 'listes:', 'category:', '過去ログ:', 'ファイル:', '画像:',
                 'section:', '/過去ログ', '/history', '/log', '/sandbox')
# Whole-title patterns for noise entities: bare numbers, dates, dotted
# number strings (versions / IP-like titles), colon-joined words, and
# titles of at most two hiragana characters.
IGNORE_PATTERNS = (re.compile(r'^\d+$'),
                   re.compile(r'^\d+年(?:\d+月)?(?:\d+日)?$'),
                   re.compile(r'^\d+月(?:\d+日)?$'),
                   re.compile(r'^\d+日$'),
                   # BUG FIX: the 2nd and 3rd dots were unescaped (`.`) and
                   # therefore matched ANY character (e.g. '1.2a3b4'); escape
                   # them so only genuinely dotted number strings match.
                   re.compile(r'^\d+\.\d+\.\d+\.\d+$'),
                   re.compile(r'^\w+(?:\:\w+)+$'),
                   re.compile(r'^[\u3041-\u3097]{,2}$')
                   )
# Single hiragana / katakana code points; one-character kana titles are noise.
HIRAGANA = set(map(chr, range(12353, 12353 + 86)))
KATAKANA = set(map(chr, range(12449, 12449 + 90)))

# Python 2 std streams are byte streams and need an explicit UTF-8 writer;
# Python 3 streams are already text and must NOT be re-wrapped this way.
if sys.version_info[0] == 2:
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)
def normalize(anchor):
    """Return *anchor* in Unicode NFKC form (folds full/half-width variants)."""
    return unicodedata.normalize('NFKC', anchor)
def is_useless_entity(title):
    """Return True when *title* looks like a maintenance/meta page rather
    than a real entity, judged against the module-level IGNORE_* tables
    after lower-casing."""
    lowered = title.lower()
    if lowered.startswith(IGNORE_PREFIX):
        return True
    if lowered.endswith(IGNORE_SUFFIX):
        return True
    if lowered in HIRAGANA or lowered in KATAKANA:
        return True
    if any(marker in lowered for marker in IGNORE_SUBSTR):
        return True
    return any(pattern.match(lowered) is not None for pattern in IGNORE_PATTERNS)
class WikiDumpReader(object):
    """Stream (title, wiki_text, wiki_id) triples out of a bz2-compressed
    Wikipedia pages-articles dump, skipping ignored namespaces."""

    def __init__(self, dump_file, ignored_ns=DEFAULT_IGNORED_NS):
        self._dump_file = dump_file
        self._ignored_ns = ignored_ns
        # Sniff the dump language from the xml:lang attribute on the
        # first line of the XML header.
        with bz2.BZ2File(self._dump_file) as dump:
            header = dump.readline()
            self._language = re.search(r'xml:lang="(.*)"', header).group(1)

    @property
    def language(self):
        """Language code declared by the dump's XML header."""
        return self._language

    def __iter__(self):
        with bz2.BZ2File(self._dump_file) as dump:
            for title, wiki_text, wiki_id in wikicorpus.extract_pages(dump):
                lowered = title.lower()
                if any(lowered.startswith(ns) for ns in self._ignored_ns):
                    continue
                yield (title, wiki_text, wiki_id.decode('utf8'))
def _return_it(value): | |
return value | |
def anchor_generator(dump_reader,
                     pool_size=1,
                     chunk_size=100):
    """Parse pages from *dump_reader* and yield (anchor, entity) pairs.

    *anchor* is the NFKC-normalized visible link label, *entity* the raw
    link target.  Only wikilinks with an explicit, non-blank label whose
    target survives is_useless_entity() are emitted.  With pool_size > 1
    the pages are fanned out through a worker pool via _return_it.
    """
    if pool_size > 1:
        worker_pool = Pool(pool_size)
        stream = partial(worker_pool.imap_unordered, chunksize=chunk_size)
    else:
        stream = imap

    for title, wiki_text, wiki_id in stream(_return_it, dump_reader):
        parsed = mwparserfromhell.parse(wiki_text)
        for node in parsed.nodes:
            if not isinstance(node, mwparserfromhell.nodes.Wikilink):
                continue
            entity = node.title.strip_code()
            if is_useless_entity(entity):
                continue
            # Keep only links whose displayed text differs from the target:
            # node.text must be present and not purely (full-width) whitespace.
            if not node.text:
                continue
            label = node.text.strip_code()
            if re.match(r'^[\s\u3000]*$', label) is not None:
                continue
            yield normalize(label), entity
def main(wiki_dump, out, pool_size):
    """Extract (anchor, entity) pairs from *wiki_dump* and write them to
    *out* as tab-separated lines, with a tqdm progress bar."""
    reader = WikiDumpReader(wiki_dump)
    pairs = anchor_generator(reader, pool_size=pool_size)
    for anchor, entity in tqdm(pairs):
        out.write(u'{:s}\t{:s}\n'.format(anchor, entity))
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--pages_article_dump', '-a', type=str, default=None, required=True,
        help='Wikipedia pages-articles dump file (bz2)')
    parser.add_argument(
        '--pool_size', '-p', type=int, default=multiprocessing.cpu_count(), required=False,
        help='')
    parser.add_argument(
        '--out', '-o', type=str, required=False,
        help='Output file.'
    )
    # BUG FIX: the original called parser.parse_args() a second time after
    # opening the output file; parse exactly once.
    args = parser.parse_args()
    if args.out:
        out = codecs.open(args.out, mode='w', encoding='utf8')
    else:
        out = sys.stdout
    try:
        main(args.pages_article_dump, out, args.pool_size)
    finally:
        # BUG FIX: close the output file we opened (never close stdout).
        if args.out:
            out.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment