sadovnychyi · September 20, 2017 21:22 · honnibal · Sep 20, 2017
diff --git a/acora b/acora
 python3 spacy_vs_acora.py acora
 Total 10036 matches in 0.4687957763671875s
 Filename: spacy_vs_acora.py

 Line #    Mem usage    Increment   Line Contents
 ================================================
    14     80.6 MiB      0.0 MiB   @memory_profiler.profile()
    15                             def main(test):
    16     88.6 MiB      8.0 MiB     random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    17     87.7 MiB     -0.9 MiB     random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
    18     87.7 MiB      0.0 MiB     phrases = [random_phrase() for i in range(N_PHRASES)]
    19     88.6 MiB      0.9 MiB     random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
    20     88.6 MiB      0.0 MiB     texts = [random_text() for i in range(N_TEXTS)]
    21
    22     88.6 MiB      0.0 MiB     total = 0
    23     88.6 MiB      0.0 MiB     start = None
    24
    25     88.6 MiB      0.0 MiB     if test == 'spacy':
    26                                 nlp = spacy.load('en_dummy')
    27                                 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
    28                                 start = time.time()
    29                                 for text in nlp.pipe(texts):
    30                                   total += len(matcher(text))
    31                               else:
    32   3089.6 MiB   3001.0 MiB       matcher = acora.AcoraBuilder(*phrases).build()
    33   3089.6 MiB      0.0 MiB       start = time.time()
    34   3089.6 MiB      0.0 MiB       for text in texts:
    35   3089.6 MiB      0.0 MiB         total += len(matcher.findall(text))
    36   3089.6 MiB      0.0 MiB     print('Total %s matches in %ss' % (total, time.time() - start))

diff --git a/pyahocorasick b/pyahocorasick
 python3 spacy_vs_acora.py pyahocorasick
 Total 9934 matches in 0.1664886474609375s
 Filename: spacy_vs_acora.py

 Line #    Mem usage    Increment   Line Contents
 ================================================
    15     80.1 MiB      0.0 MiB   @memory_profiler.profile()
    16                             def main(test):
    17     88.1 MiB      7.9 MiB     random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    18     87.1 MiB     -1.0 MiB     random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
    19     87.1 MiB      0.0 MiB     phrases = [random_phrase() for i in range(N_PHRASES)]
    20     88.1 MiB      1.0 MiB     random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
    21     88.1 MiB      0.0 MiB     texts = [random_text() for i in range(N_TEXTS)]
    22
    23     88.1 MiB      0.0 MiB     total = 0
    24     88.1 MiB      0.0 MiB     start = None
    25
    26     88.1 MiB      0.0 MiB     if test == 'spacy':
    27                                 nlp = spacy.load('en_dummy')
    28                                 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
    29                                 start = time.time()
    30                                 for text in nlp.pipe(texts):
    31                                   total += len(matcher(text))
    32     88.1 MiB      0.0 MiB     elif test == 'pyahocorasick':
    33     88.1 MiB      0.0 MiB       matcher = ahocorasick.Automaton()
    34    249.4 MiB    161.3 MiB       for phrase in phrases:
    35    249.4 MiB      0.0 MiB         matcher.add_word(phrase, phrase)
    36    249.6 MiB      0.2 MiB       matcher.make_automaton()
    37    249.6 MiB      0.0 MiB       start = time.time()
    38    251.0 MiB      1.4 MiB       for text in texts:
    39    251.0 MiB      0.0 MiB         total += len(list(matcher.iter(text)))
    40                               else:
    41                                 matcher = acora.AcoraBuilder(*phrases).build()
    42                                 start = time.time()
    43                                 for text in texts:
    44                                   total += len(matcher.findall(text))
    45    251.0 MiB      0.0 MiB     print('Total %s matches in %ss' % (total, time.time() - start))
diff --git a/spacy b/spacy
 python3 spacy_vs_acora.py spacy

    Warning: no model found for 'en_dummy'

    Only loading the 'en' tokenizer.

 Total 10050 matches in 2.5495481491088867s
 Filename: spacy_vs_acora.py

 Line #    Mem usage    Increment   Line Contents
 ================================================
    14     80.1 MiB      0.0 MiB   @memory_profiler.profile()
    15                             def main(test):
    16     88.2 MiB      8.1 MiB     random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
    17     87.2 MiB     -1.0 MiB     random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
    18     87.2 MiB      0.0 MiB     phrases = [random_phrase() for i in range(N_PHRASES)]
    19     88.2 MiB      1.0 MiB     random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
    20     88.2 MiB      0.0 MiB     texts = [random_text() for i in range(N_TEXTS)]
    21
    22     88.2 MiB      0.0 MiB     total = 0
    23     88.2 MiB      0.0 MiB     start = None
    24
    25     88.2 MiB      0.0 MiB     if test == 'spacy':
    26     89.6 MiB      1.4 MiB       nlp = spacy.load('en_dummy')
    27    337.0 MiB    247.4 MiB       matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
    28    337.0 MiB      0.0 MiB       start = time.time()
    29    343.6 MiB      6.6 MiB       for text in nlp.pipe(texts):
    30    343.6 MiB      0.0 MiB         total += len(matcher(text))
    31                               else:
    32                                 matcher = acora.AcoraBuilder(*phrases).build()
    33                                 start = time.time()
    34                                 for text in texts:
    35                                   total += len(matcher.findall(text))
    36    343.6 MiB      0.0 MiB     print('Total %s matches in %ss' % (total, time.time() - start))
diff --git a/spacy_vs_acora.py b/spacy_vs_acora.py
 import random
 import string
 import acora
 import spacy
 import ahocorasick
 import sys
 import time
 import memory_profiler


 # Benchmark size knobs read by main():
 # how many random phrases are generated and handed to the matcher under test,
 N_PHRASES = 100_000
 # and how many random texts are scanned during the timed loop.
 N_TEXTS = 1_000


 @memory_profiler.profile()
 def main(test):
  """Benchmark one phrase-matching backend on random data and print the result.

  Builds N_PHRASES random phrases (2-5 space-joined words of 2-16 ASCII
  letters each) and N_TEXTS random texts (8-32 space-joined items, each
  either one of the phrases or a fresh random word), constructs the
  selected matcher from the phrases, then counts matches over all texts.

  Only the matching loop is timed; the matcher is built before the clock
  starts. For 'spacy' the timed loop consumes nlp.pipe(texts), so whatever
  that pipeline does to the texts is part of the measurement.

  Args:
    test: 'spacy' or 'pyahocorasick'; any other value runs acora.

  Prints:
    'Total <count> matches in <seconds>s'.
  """
  def random_word():
    return ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))

  def random_phrase():
    return ' '.join(random_word() for _ in range(random.randint(2, 5)))

  phrases = [random_phrase() for _ in range(N_PHRASES)]

  def random_text():
    # Both candidates are generated for every slot and one is picked at
    # random, so roughly half of the items are known phrases.
    return ' '.join([random.choice([random.choice(phrases), random_word()])
                     for _ in range(random.randint(8, 32))])

  texts = [random_text() for _ in range(N_TEXTS)]

  total = 0

  # perf_counter() is a monotonic, high-resolution clock meant for measuring
  # intervals; time.time() can jump if the system clock is adjusted.
  if test == 'spacy':
    nlp = spacy.load('en_dummy')
    matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
    start = time.perf_counter()
    for text in nlp.pipe(texts):
      total += len(matcher(text))
  elif test == 'pyahocorasick':
    matcher = ahocorasick.Automaton()
    for phrase in phrases:
      # The phrase is stored as its own payload.
      matcher.add_word(phrase, phrase)
    matcher.make_automaton()
    start = time.perf_counter()
    for text in texts:
      total += len(list(matcher.iter(text)))
  else:
    # Fallback: every name other than the two above runs acora.
    matcher = acora.AcoraBuilder(*phrases).build()
    start = time.perf_counter()
    for text in texts:
      total += len(matcher.findall(text))
  print('Total %s matches in %ss' % (total, time.perf_counter() - start))


 # Script entry point: the last command-line argument is passed to main() as
 # the name of the matcher to benchmark.
 if __name__ == '__main__':
  main(sys.argv[-1])
	python3 spacy_vs_acora.py acora
	Total 10036 matches in 0.4687957763671875s
	Filename: spacy_vs_acora.py

	Line # Mem usage Increment Line Contents
	================================================
	14 80.6 MiB 0.0 MiB @memory_profiler.profile()
	15 def main(test):
	16 88.6 MiB 8.0 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
	17 87.7 MiB -0.9 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
	18 87.7 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
	19 88.6 MiB 0.9 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
	20 88.6 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
	21
	22 88.6 MiB 0.0 MiB total = 0
	23 88.6 MiB 0.0 MiB start = None
	24
	25 88.6 MiB 0.0 MiB if test == 'spacy':
	26 nlp = spacy.load('en_dummy')
	27 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
	28 start = time.time()
	29 for text in nlp.pipe(texts):
	30 total += len(matcher(text))
	31 else:
	32 3089.6 MiB 3001.0 MiB matcher = acora.AcoraBuilder(*phrases).build()
	33 3089.6 MiB 0.0 MiB start = time.time()
	34 3089.6 MiB 0.0 MiB for text in texts:
	35 3089.6 MiB 0.0 MiB total += len(matcher.findall(text))
	36 3089.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
	python3 spacy_vs_acora.py pyahocorasick
	Total 9934 matches in 0.1664886474609375s
	Filename: spacy_vs_acora.py

	Line # Mem usage Increment Line Contents
	================================================
	15 80.1 MiB 0.0 MiB @memory_profiler.profile()
	16 def main(test):
	17 88.1 MiB 7.9 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
	18 87.1 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
	19 87.1 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
	20 88.1 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
	21 88.1 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
	22
	23 88.1 MiB 0.0 MiB total = 0
	24 88.1 MiB 0.0 MiB start = None
	25
	26 88.1 MiB 0.0 MiB if test == 'spacy':
	27 nlp = spacy.load('en_dummy')
	28 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
	29 start = time.time()
	30 for text in nlp.pipe(texts):
	31 total += len(matcher(text))
	32 88.1 MiB 0.0 MiB elif test == 'pyahocorasick':
	33 88.1 MiB 0.0 MiB matcher = ahocorasick.Automaton()
	34 249.4 MiB 161.3 MiB for phrase in phrases:
	35 249.4 MiB 0.0 MiB matcher.add_word(phrase, phrase)
	36 249.6 MiB 0.2 MiB matcher.make_automaton()
	37 249.6 MiB 0.0 MiB start = time.time()
	38 251.0 MiB 1.4 MiB for text in texts:
	39 251.0 MiB 0.0 MiB total += len(list(matcher.iter(text)))
	40 else:
	41 matcher = acora.AcoraBuilder(*phrases).build()
	42 start = time.time()
	43 for text in texts:
	44 total += len(matcher.findall(text))
	45 251.0 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
	python3 spacy_vs_acora.py spacy

	Warning: no model found for 'en_dummy'

	Only loading the 'en' tokenizer.

	Total 10050 matches in 2.5495481491088867s
	Filename: spacy_vs_acora.py

	Line # Mem usage Increment Line Contents
	================================================
	14 80.1 MiB 0.0 MiB @memory_profiler.profile()
	15 def main(test):
	16 88.2 MiB 8.1 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))
	17 87.2 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5)))
	18 87.2 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)]
	19 88.2 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))])
	20 88.2 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)]
	21
	22 88.2 MiB 0.0 MiB total = 0
	23 88.2 MiB 0.0 MiB start = None
	24
	25 88.2 MiB 0.0 MiB if test == 'spacy':
	26 89.6 MiB 1.4 MiB nlp = spacy.load('en_dummy')
	27 337.0 MiB 247.4 MiB matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
	28 337.0 MiB 0.0 MiB start = time.time()
	29 343.6 MiB 6.6 MiB for text in nlp.pipe(texts):
	30 343.6 MiB 0.0 MiB total += len(matcher(text))
	31 else:
	32 matcher = acora.AcoraBuilder(*phrases).build()
	33 start = time.time()
	34 for text in texts:
	35 total += len(matcher.findall(text))
	36 343.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start))
	import random
	import string
	import acora
	import spacy
	import ahocorasick
	import sys
	import time
	import memory_profiler


	# Benchmark size knobs read by main():
	# how many random phrases are generated and handed to the matcher under test,
	N_PHRASES = 100_000
	# and how many random texts are scanned during the timed loop.
	N_TEXTS = 1_000


	@memory_profiler.profile()
	def main(test):
	  """Benchmark one phrase-matching backend on random data and print the result.

	  Builds N_PHRASES random phrases (2-5 space-joined words of 2-16 ASCII
	  letters each) and N_TEXTS random texts (8-32 space-joined items, each
	  either one of the phrases or a fresh random word), constructs the
	  selected matcher from the phrases, then counts matches over all texts.

	  Only the matching loop is timed; the matcher is built before the clock
	  starts. For 'spacy' the timed loop consumes nlp.pipe(texts), so whatever
	  that pipeline does to the texts is part of the measurement.

	  Args:
	    test: 'spacy' or 'pyahocorasick'; any other value runs acora.

	  Prints:
	    'Total <count> matches in <seconds>s'.
	  """
	  def random_word():
	    return ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16)))

	  def random_phrase():
	    return ' '.join(random_word() for _ in range(random.randint(2, 5)))

	  phrases = [random_phrase() for _ in range(N_PHRASES)]

	  def random_text():
	    # Both candidates are generated for every slot and one is picked at
	    # random, so roughly half of the items are known phrases.
	    return ' '.join([random.choice([random.choice(phrases), random_word()])
	                     for _ in range(random.randint(8, 32))])

	  texts = [random_text() for _ in range(N_TEXTS)]

	  total = 0

	  # perf_counter() is a monotonic, high-resolution clock meant for measuring
	  # intervals; time.time() can jump if the system clock is adjusted.
	  if test == 'spacy':
	    nlp = spacy.load('en_dummy')
	    matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases))
	    start = time.perf_counter()
	    for text in nlp.pipe(texts):
	      total += len(matcher(text))
	  elif test == 'pyahocorasick':
	    matcher = ahocorasick.Automaton()
	    for phrase in phrases:
	      # The phrase is stored as its own payload.
	      matcher.add_word(phrase, phrase)
	    matcher.make_automaton()
	    start = time.perf_counter()
	    for text in texts:
	      total += len(list(matcher.iter(text)))
	  else:
	    # Fallback: every name other than the two above runs acora.
	    matcher = acora.AcoraBuilder(*phrases).build()
	    start = time.perf_counter()
	    for text in texts:
	      total += len(matcher.findall(text))
	  print('Total %s matches in %ss' % (total, time.perf_counter() - start))


	# Script entry point: the last command-line argument is passed to main() as
	# the name of the matcher to benchmark.
	if __name__ == '__main__':
	  main(sys.argv[-1])