Created
January 31, 2019 01:42
-
-
Save raphaelmerx/72ad4c6bbcd485a1d15315cc9aec00f3 to your computer and use it in GitHub Desktop.
Extract vocabulary from lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
""" Example usage: python get_vocab.py --input lines_tet.txt --output vocab.tet. """ | |
from __future__ import print_function | |
import os | |
import sys | |
import inspect | |
import warnings | |
import argparse | |
import codecs | |
from collections import Counter | |
# hack for python2/3 compatibility | |
from io import open | |
argparse.open = open | |
def create_parser(subparsers=None):
    """Build the argument parser for the vocabulary extraction command.

    If *subparsers* is truthy, the parser is registered on it as the
    ``get-vocab`` sub-command; otherwise a standalone
    ``argparse.ArgumentParser`` is created.

    The parser accepts two options:

    * ``--input`` / ``-i``: file opened for reading, defaulting to
      ``sys.stdin``.
    * ``--output`` / ``-o``: file opened for writing, defaulting to
      ``sys.stdout``.

    Returns the configured parser.
    """
    # Both construction paths share the same formatter and description.
    common = {
        'formatter_class': argparse.RawDescriptionHelpFormatter,
        'description': "Generates vocabulary",
    }
    if not subparsers:
        parser = argparse.ArgumentParser(**common)
    else:
        parser = subparsers.add_parser('get-vocab', **common)

    reader = argparse.FileType('r')
    parser.add_argument(
        '--input', '-i',
        metavar='PATH',
        type=reader,
        default=sys.stdin,
        help="Input file (default: standard input).")

    writer = argparse.FileType('w')
    parser.add_argument(
        '--output', '-o',
        metavar='PATH',
        type=writer,
        default=sys.stdout,
        help="Output file (default: standard output)")

    return parser
def get_vocab(train_file, vocab_file):
    """Write the distinct words of *train_file* to *vocab_file* by frequency.

    Every line has leading/trailing carriage returns, newlines, spaces and
    0xA0 characters stripped, and is then split on single space characters.
    Empty pieces (e.g. from consecutive spaces) are ignored.

    The words are written one per line, most frequent first; the counts
    themselves are not written. Words with equal counts keep the order in
    which the counter yields them (the sort is stable).
    """
    frequencies = Counter()
    for raw_line in train_file:
        pieces = raw_line.strip('\r\n \xa0').split(' ')
        # Skip the empty strings produced by blank lines or doubled spaces.
        frequencies.update(piece for piece in pieces if piece)

    # most_common() with no argument sorts all items by count, descending.
    for word, _count in frequencies.most_common():
        vocab_file.write(word + "\n")
if __name__ == "__main__":
    # Script entry point: parse the command line, make all streams UTF-8,
    # and write the vocabulary of the input to the output.

    # If a 'subword_nmt' directory sits next to this script, warn that the
    # script has moved there.
    currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    newdir = os.path.join(currentdir, 'subword_nmt')
    if os.path.isdir(newdir):
        # DeprecationWarning is hidden by default; make it visible.
        warnings.simplefilter('default')
        warnings.warn(
            "this script's location has moved to {0}. This symbolic link will be removed in a future version. Please point to the new location, or install the package and use the command 'subword-nmt'".format(newdir),
            DeprecationWarning
        )

    # python 2/3 compatibility: wrap the standard streams so they
    # encode/decode UTF-8 (Python 3 exposes the byte streams via `.buffer`).
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    # The parser is created after the streams are wrapped, so its
    # stdin/stdout defaults are the UTF-8 wrappers.
    parser = create_parser()
    args = parser.parse_args()

    # read/write files as UTF-8
    # Handles reopened here are tracked so they can be closed afterwards;
    # the standard streams are never closed by this script.
    reopened = []
    try:
        if args.input.name != '<stdin>':
            input_path = args.input.name
            # Close the handle argparse opened before reopening the path,
            # otherwise it stays open for the rest of the run.
            args.input.close()
            args.input = codecs.open(input_path, encoding='utf-8')
            reopened.append(args.input)
        if args.output.name != '<stdout>':
            output_path = args.output.name
            # Same for the output: avoid two writers on one file.
            args.output.close()
            args.output = codecs.open(output_path, 'w', encoding='utf-8')
            reopened.append(args.output)

        get_vocab(args.input, args.output)
    finally:
        # Closing flushes the output file even if get_vocab raised.
        for handle in reopened:
            handle.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment