Last active
September 20, 2017 21:22
-
-
Save sadovnychyi/90aa96a4dbaed71a466e82cc8ebe0a35 to your computer and use it in GitHub Desktop.
spaCy vs. Acora: multi-keyword search benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python3 spacy_vs_acora.py acora | |
Total 10036 matches in 0.4687957763671875s | |
Filename: spacy_vs_acora.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
14 80.6 MiB 0.0 MiB @memory_profiler.profile() | |
15 def main(test): | |
16 88.6 MiB 8.0 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16))) | |
17 87.7 MiB -0.9 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5))) | |
18 87.7 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)] | |
19 88.6 MiB 0.9 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))]) | |
20 88.6 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)] | |
21 | |
22 88.6 MiB 0.0 MiB total = 0 | |
23 88.6 MiB 0.0 MiB start = None | |
24 | |
25 88.6 MiB 0.0 MiB if test == 'spacy': | |
26 nlp = spacy.load('en_dummy') | |
27 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases)) | |
28 start = time.time() | |
29 for text in nlp.pipe(texts): | |
30 total += len(matcher(text)) | |
31 else: | |
32 3089.6 MiB 3001.0 MiB matcher = acora.AcoraBuilder(*phrases).build() | |
33 3089.6 MiB 0.0 MiB start = time.time() | |
34 3089.6 MiB 0.0 MiB for text in texts: | |
35 3089.6 MiB 0.0 MiB total += len(matcher.findall(text)) | |
36 3089.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start)) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python3 spacy_vs_acora.py pyahocorasick | |
Total 9934 matches in 0.1664886474609375s | |
Filename: spacy_vs_acora.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
15 80.1 MiB 0.0 MiB @memory_profiler.profile() | |
16 def main(test): | |
17 88.1 MiB 7.9 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16))) | |
18 87.1 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5))) | |
19 87.1 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)] | |
20 88.1 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))]) | |
21 88.1 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)] | |
22 | |
23 88.1 MiB 0.0 MiB total = 0 | |
24 88.1 MiB 0.0 MiB start = None | |
25 | |
26 88.1 MiB 0.0 MiB if test == 'spacy': | |
27 nlp = spacy.load('en_dummy') | |
28 matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases)) | |
29 start = time.time() | |
30 for text in nlp.pipe(texts): | |
31 total += len(matcher(text)) | |
32 88.1 MiB 0.0 MiB elif test == 'pyahocorasick': | |
33 88.1 MiB 0.0 MiB matcher = ahocorasick.Automaton() | |
34 249.4 MiB 161.3 MiB for phrase in phrases: | |
35 249.4 MiB 0.0 MiB matcher.add_word(phrase, phrase) | |
36 249.6 MiB 0.2 MiB matcher.make_automaton() | |
37 249.6 MiB 0.0 MiB start = time.time() | |
38 251.0 MiB 1.4 MiB for text in texts: | |
39 251.0 MiB 0.0 MiB total += len(list(matcher.iter(text))) | |
40 else: | |
41 matcher = acora.AcoraBuilder(*phrases).build() | |
42 start = time.time() | |
43 for text in texts: | |
44 total += len(matcher.findall(text)) | |
45 251.0 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python3 spacy_vs_acora.py spacy | |
Warning: no model found for 'en_dummy' | |
Only loading the 'en' tokenizer. | |
Total 10050 matches in 2.5495481491088867s | |
Filename: spacy_vs_acora.py | |
Line # Mem usage Increment Line Contents | |
================================================ | |
14 80.1 MiB 0.0 MiB @memory_profiler.profile() | |
15 def main(test): | |
16 88.2 MiB 8.1 MiB random_word = lambda: ''.join(random.choices(string.ascii_letters, k=random.randint(2, 16))) | |
17 87.2 MiB -1.0 MiB random_phrase = lambda: ' '.join(random_word() for i in range(random.randint(2, 5))) | |
18 87.2 MiB 0.0 MiB phrases = [random_phrase() for i in range(N_PHRASES)] | |
19 88.2 MiB 1.0 MiB random_text = lambda: ' '.join([random.choice([random.choice(phrases), random_word()]) for i in range(random.randint(8, 32))]) | |
20 88.2 MiB 0.0 MiB texts = [random_text() for i in range(N_TEXTS)] | |
21 | |
22 88.2 MiB 0.0 MiB total = 0 | |
23 88.2 MiB 0.0 MiB start = None | |
24 | |
25 88.2 MiB 0.0 MiB if test == 'spacy': | |
26 89.6 MiB 1.4 MiB nlp = spacy.load('en_dummy') | |
27 337.0 MiB 247.4 MiB matcher = spacy.matcher.PhraseMatcher(vocab=nlp.vocab, phrases=nlp.pipe(phrases)) | |
28 337.0 MiB 0.0 MiB start = time.time() | |
29 343.6 MiB 6.6 MiB for text in nlp.pipe(texts): | |
30 343.6 MiB 0.0 MiB total += len(matcher(text)) | |
31 else: | |
32 matcher = acora.AcoraBuilder(*phrases).build() | |
33 start = time.time() | |
34 for text in texts: | |
35 total += len(matcher.findall(text)) | |
36 343.6 MiB 0.0 MiB print('Total %s matches in %ss' % (total, time.time() - start)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import string | |
import acora | |
import spacy | |
import ahocorasick | |
import sys | |
import time | |
import memory_profiler | |
# Number of random phrases loaded into the matcher under test.
N_PHRASES = 100_000
# Number of random texts that are scanned for those phrases.
N_TEXTS = 1_000
@memory_profiler.profile()
def main(test):
    """Benchmark one multi-keyword matcher on randomly generated data.

    Generates ``N_PHRASES`` random phrases and ``N_TEXTS`` random texts,
    builds the matcher selected by ``test``, times the matching loop only
    (matcher construction is excluded) and prints the total number of
    matches together with the elapsed seconds.

    Args:
        test: Backend selector. ``'spacy'`` uses
            ``spacy.matcher.PhraseMatcher``, ``'pyahocorasick'`` uses
            ``ahocorasick.Automaton``; any other value falls back to
            ``acora.AcoraBuilder``.
    """

    def random_word():
        # A word is 2-16 random ASCII letters.
        return ''.join(
            random.choices(string.ascii_letters, k=random.randint(2, 16)))

    def random_phrase():
        # A phrase is 2-5 random words separated by single spaces.
        return ' '.join(random_word() for _ in range(random.randint(2, 5)))

    phrases = [random_phrase() for _ in range(N_PHRASES)]

    def random_text():
        # A text is 8-32 space-separated items; each item is either one of
        # the known phrases or a fresh random word, picked with equal odds.
        return ' '.join([
            random.choice([random.choice(phrases), random_word()])
            for _ in range(random.randint(8, 32))
        ])

    texts = [random_text() for _ in range(N_TEXTS)]

    total = 0

    # time.perf_counter() is used instead of time.time(): it is monotonic and
    # high-resolution, so the measured interval cannot be skewed by system
    # clock adjustments.
    if test == 'spacy':
        nlp = spacy.load('en_dummy')
        matcher = spacy.matcher.PhraseMatcher(
            vocab=nlp.vocab, phrases=nlp.pipe(phrases))
        start = time.perf_counter()
        # nlp.pipe(texts) is invoked after the timer starts, so processing
        # the texts through the pipeline is part of the measured time.
        for doc in nlp.pipe(texts):
            total += len(matcher(doc))
    elif test == 'pyahocorasick':
        matcher = ahocorasick.Automaton()
        for phrase in phrases:
            # The phrase is stored as its own associated value.
            matcher.add_word(phrase, phrase)
        matcher.make_automaton()
        start = time.perf_counter()
        for text in texts:
            total += len(list(matcher.iter(text)))
    else:
        matcher = acora.AcoraBuilder(*phrases).build()
        start = time.perf_counter()
        for text in texts:
            total += len(matcher.findall(text))

    elapsed = time.perf_counter() - start
    print('Total %s matches in %ss' % (total, elapsed))
if __name__ == '__main__':
    # The last command-line token selects the backend. When the script is run
    # without arguments this is the script path itself, which matches neither
    # 'spacy' nor 'pyahocorasick'.
    backend = sys.argv[-1]
    main(backend)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice benchmark!
I made another version that moves the tokenization outside the timer, to check how the matcher itself performs. I might've stuffed up the benchmark, but I get 10112 matches in 0.01 seconds. Code below.