Skip to content

Instantly share code, notes, and snippets.

Forked from zapalote/
Created March 6, 2021 05:41
Show Gist options
  • Save evelaguti/3092fa16b4a00411416e519309de6123 to your computer and use it in GitHub Desktop.
Save evelaguti/3092fa16b4a00411416e519309de6123 to your computer and use it in GitHub Desktop.
Example of multi-threaded, memory-mapped file processing.
# extraction pattern: ngram TAB year TAB match_count TAB volume_count NEWLINE
# out: unique_ngram TAB sum(match_count) NEWLINE
import re
import os, sys, mmap
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
# Old-style dotted abbreviations at the start of an ngram (e.g. "A.B.C."),
# which must be followed by "_" or a non-word character.
abv = re.compile(r'^(([A-Z]\.){1,})(_|[^\w])') # A.B.C.
# Purely ASCII-alphabetic tokens at the start of an ngram, likewise followed
# by "_" or a non-word character.
word = re.compile(r'^(([a-zA-Z]){1,})(_|[^\w])') # ABC | Abc | abc
# Common file-name prefix; each entry of `files` is appended to it.
base = "googlebooks-eng-all-1gram-20120701-"
# One input file per entry.
# NOTE(review): the tail of this list was cut off in the source after 'j';
# it is assumed to continue through 'z' -- confirm against the data set.
files = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
    'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
    'u', 'v', 'w', 'x', 'y', 'z',
]
def _normalize_term(ngram):
    """Return the vocabulary key for *ngram*, or '' if no pattern matches."""
    # catch patterns such as A.B.C. (old-style abbreviations)
    m_abv = abv.match(ngram)
    if m_abv:
        # remove punctuation, e.g. "A.B.C." -> "ABC"
        return re.sub(r'[^\w]', '', m_abv.group(1))
    m_word = word.match(ngram)
    return m_word.group(1) if m_word else ''


def process_file(file):
    """Sum the match counts per normalized term for one input file.

    Reads ``base + file`` line by line through a memory map. Each line is
    expected to be ``ngram TAB year TAB match_count TAB volume_count``.
    The per-term totals are written to ``gbooks-en-<file>.csv`` as
    ``term TAB total`` lines.

    Args:
        file: Suffix appended to the module-level ``base`` to form the
            input path; also used in the output name and as the progress
            bar label.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            written.
        IndexError, ValueError: If a line has fewer than three fields or a
            non-integer match count.
    """
    path = base + file
    vocab = {}
    fsize = Path(path).stat().st_size
    print(f"processing {path}")
    # The input is only read, so open it read-only instead of "r+b".
    with open(path, "rb") as fp, tqdm(total=fsize, desc=file) as pbar:
        # mmap cannot map an empty file; an empty input yields an empty output.
        if fsize:
            # map the entire file into memory, normally much faster than
            # buffered i/o; the context manager releases the mapping.
            with mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                # iterate line by line until readline returns b"" at EOF
                for line in iter(mm.readline, b""):
                    # convert the bytes to a utf-8 string and split the fields
                    term = line.decode("utf-8").split("\t")
                    # NOTE(review): ngrams matching neither pattern are all
                    # accumulated under the empty key ''; confirm whether
                    # they should be skipped instead.
                    t = _normalize_term(term[0])
                    # add the match_count, creating the entry if needed
                    vocab[t] = vocab.get(t, 0) + int(term[2])
                    # advance the progress bar by the bytes just consumed
                    pbar.update(len(line))
    # output vocabulary and counts to csv file
    outf = "gbooks-en-" + file + ".csv"
    with open(outf, "w") as fp:
        fp.writelines(f"{key}\t{count}\n" for key, count in vocab.items())
# use as many threads as possible, default: min(32, os.cpu_count()+4)
with ThreadPoolExecutor() as executor:
    # Executor.map is lazy about results: consuming it with list() re-raises
    # any exception from a worker instead of silently discarding it.
    result = list(executor.map(process_file, files))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment