Last active
February 7, 2021 23:33
-
-
Save Gunni/7d2e14b49d3f0483666843e31b2b358f to your computer and use it in GitHub Desktop.
I hate number words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import nltk as nltk | |
from word2number import w2n | |
def replaceTextnumberWithNumber(text):
    """Replace spelled-out English numbers in *text* with digits.

    The text is lower-cased and tokenized, every token found in a fixed
    vocabulary ("zero" to "twenty", the tens up to "ninety", "hundred"
    and "thousand") is tagged ``CD``, and each run of consecutive ``CD``
    tokens is converted with ``w2n.word_to_num``.  The digits are then
    substituted back into the original text, so the casing and punctuation
    of everything else is preserved.

    Substitution is done in a single pass, only at word boundaries and
    preferring the longest phrase, so that:

    * a number word embedded in another word is left alone
      ("someone" is not turned into "some1");
    * a short phrase never clobbers part of a longer one
      ("twenty and twenty one" becomes "20 and 21").

    Args:
        text: The string to process.

    Returns:
        A copy of *text* with the detected number phrases replaced by
        their decimal representation.

    Raises:
        Whatever ``w2n.word_to_num`` raises for a word sequence it cannot
        interpret; such errors are not caught here.
    """
    number_words = (
        'ten', 'thousand', 'nine', 'hundred', 'ninety', 'eight', 'seven',
        'six', 'five', 'four', 'three', 'two', 'one', 'eighty', 'seventy',
        'sixty', 'fifty', 'forty', 'thirty', 'twenty', 'nineteen',
        'eighteen', 'seventeen', 'sixteen', 'fifteen', 'fourteen',
        'thirteen', 'twelve', 'eleven', 'zero',
    )
    # Vocabulary words get the tag CD; every other token falls back to IGNORE.
    tagger = nltk.UnigramTagger(
        [[(word, 'CD') for word in number_words]],
        backoff=nltk.DefaultTagger('IGNORE'),
    )
    # One or more consecutive CD tokens form a single NumberWord chunk.
    parser = nltk.RegexpParser('NumberWord: {<CD>+}')

    tokens = nltk.word_tokenize(text.lower())
    if not tokens:
        # Nothing to convert (empty or whitespace-only input).
        return text
    parsed = parser.parse(tagger.tag(tokens))

    # Map each distinct number phrase (lower-case, single-spaced) to digits.
    replacements = {}
    for subtree in parsed.subtrees():
        if subtree.label() != 'NumberWord':
            continue
        phrase = ' '.join(nltk.untag(subtree.leaves()))
        if phrase not in replacements:
            replacements[phrase] = str(w2n.word_to_num(phrase))
    if not replacements:
        return text

    # Longest phrase first so "twenty one" wins over "twenty" at the same
    # position; \s+ lets the words be separated by any run of whitespace.
    alternatives = (
        r'\s+'.join(re.escape(word) for word in phrase.split(' '))
        for phrase in sorted(replacements, key=len, reverse=True)
    )
    pattern = re.compile(
        r'\b(?:' + '|'.join(alternatives) + r')\b',
        re.IGNORECASE,
    )

    def substitute(match):
        # Normalise the matched text back to the dictionary key form.
        key = ' '.join(match.group(0).lower().split())
        # Fall back to the original text if case-folding produced no key.
        return replacements.get(key, match.group(0))

    return pattern.sub(substitute, text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
from .helpers import replaceTextnumberWithNumber | |
class TestReplaceTextnumberWithNumber(unittest.TestCase):
    """Checks that spelled-out numbers are rewritten as digits."""

    def test_number(self):
        # A lone number word is converted on its own.
        converted = replaceTextnumberWithNumber('four')
        self.assertEqual(converted, '4')

    def test_number_in_a_sentence(self):
        # A multi-word number inside a sentence collapses to one value.
        sentence = 'There were forty two of them'
        converted = replaceTextnumberWithNumber(sentence)
        self.assertEqual(converted, 'There were 42 of them')

    def test_multiple_numbers_in_a_sentence(self):
        # Separate, capitalised number words are each converted and the
        # surrounding text keeps its original casing and punctuation.
        title = 'Example Chapter Title: Chapter Twenty (End of Book One)'
        converted = replaceTextnumberWithNumber(title)
        self.assertEqual(
            converted,
            'Example Chapter Title: Chapter 20 (End of Book 1)',
        )


# Run the suite when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment