Skip to content

Instantly share code, notes, and snippets.

@tammoippen
Created August 10, 2017 12:02
Show Gist options
  • Save tammoippen/3a3148649950720157b9655b2868cb8b to your computer and use it in GitHub Desktop.
Some algorithms from the paper "Bed-Tree: An All-Purpose Index Structure for String Similarity Search Based on Edit Distance", Zhang et al. (doi 10.1145/1807167.1807266)
from collections import defaultdict
from functools import lru_cache
from typing import Dict, Iterable, List, Tuple
import mmh3
import numpy as np
from slugify import slugify
def verify_edit_distance(s1: str, s2: str, th: int) -> bool:
    '''Test whether the edit distance between `s1` and `s2` is at most `th`.

    Uses a banded dynamic program (only cells within `th` of the diagonal
    are computed), i.e. O(th * max(len(s1), len(s2))) work.

    Parameter:
        s1: str First query string.
        s2: str Second query string.
        th: int Edit distance threshold (>= 0).

    Returns:
        bool: True if edit distance <= th, False otherwise.
    '''
    ls1 = len(s1)
    ls2 = len(s2)
    # The length difference alone is a lower bound on the edit distance.
    if abs(ls1 - ls2) > th:
        return False
    t = np.zeros((2, ls2 + 1))
    # First DP row: distance of s2-prefixes from the empty string
    # (only the in-band cells are needed).
    for j in range(min(ls2 + 1, 1 + th)):
        t[0][j] = j
    for i in range(1, ls1 + 1):
        m = th + 1  # minimum value seen in the current band row
        for j in range(max(0, i - th), min(ls2, i + th) + 1):
            d1, d2, d3 = th + 1, th + 1, th + 1
            if j < i + th:            # cell above lies inside the previous band
                d1 = t[0][j] + 1      # deletion from s1
            if j > 0:
                d2 = t[1][j - 1] + 1  # insertion into s1
                d3 = t[0][j - 1] + int(s1[i - 1] != s2[j - 1])  # (mis)match
            t[1][j] = min(d1, d2, d3)
            m = min(m, t[1][j])
        # If every in-band cell already exceeds th, the distance must too.
        if m > th:
            return False
        t[0] = t[1]
    # BUGFIX: a small row minimum is necessary but NOT sufficient — the
    # actual distance is the final DP cell. The original returned True for
    # e.g. ('ab', 'ba', th=1) although the edit distance is 2.
    # (For ls1 == 0 the init row leaves t[0][ls2] == ls2 <= th, still correct.)
    return bool(t[0][ls2] <= th)
def lower_bound_edit_distance(v_i: List[int], v_j: List[int], n: int = 2) -> float:
    '''Computes the lower bound edit distance on bucket vectors.

    Computes the lower bound edit distance on n-grams in `bucket space`
    between bucket vectors `v_i` and `v_j`: one edit operation changes at
    most `n` n-grams, hence max(surplus, deficit) / n bounds the distance.

    Parameter:
        v_i: List[int] Bucket n-gram vector.
        v_j: List[int] Bucket n-gram vector.
        n: int n-gram size.

    Returns:
        float: lower bound (absolute) edit distance.
    '''
    assert len(v_i) == len(v_j)
    a, b = 0.0, 0.0
    for x, y in zip(v_i, v_j):
        if x > y:
            a += (x - y) / n  # grams present in v_i but missing in v_j
        elif x < y:
            # BUGFIX: accumulate the positive magnitude of the deficit.
            # The original added the (negative) difference, so b <= 0 and
            # max(a, b) degenerated to a, ignoring one direction entirely.
            b += (y - x) / n  # grams present in v_j but missing in v_i
    return max(a, b)
@lru_cache(maxsize=4096)  # caches the most recent 4096 (t, seed) results
def str_hash(t: str, seed: int = 42) -> int:
    '''Hash `t` with Murmur3, chosen after this blog:
    https://research.neustar.biz/2012/02/02/choosing-a-good-hash-function-part-3/

    Parameter:
        t: str String to hash.
        seed: int Murmur3 seed.

    Returns:
        int: 32-bit signed Murmur3 hash of `t`.
    '''
    # BUGFIX: forward the caller's seed. The original hard-coded seed=42,
    # silently ignoring the `seed` parameter.
    return mmh3.hash(t, seed=seed)
def ngrams(t: str, n: int = 2) -> Dict[str, int]:
    '''Compute the frequency of each n-gram in text `t`.

    The string is implicitly padded with `^` on the left and `$` on the
    right, so boundary grams such as `'^a'` and `'b$'` are counted too.

    Parameter:
        t: str Input string.
        n: int Size of the n-grams.

    Returns:
        Dict[str, int]: The n-gram frequency: n-gram -> frequency.
    '''
    freq: Dict[str, int] = defaultdict(int)
    if not t:
        return freq
    # Padded boundary grams: k characters of text, n-k padding characters.
    for k in range(1, n):
        freq['^' * (n - k) + t[:k]] += 1       # left-padded prefix grams
        freq[t[-k:] + '$' * (n - k)] += 1      # right-padded suffix grams
    # Interior grams: every window of n consecutive characters.
    for i in range(len(t) - n + 1):
        freq[t[i:i + n]] += 1
    return freq
def ngrams_pos(t: str, n: int = 2) -> List[Tuple[str, int]]:
    '''Compute the positional n-grams of text `t`.

    Grams appear in reading order — left-padded prefix grams, interior
    grams, right-padded suffix grams — each paired with its running
    position index.

    Parameter:
        t: str Input string.
        n: int Size of the n-grams.

    Returns:
        List[Tuple[str, int]]: The positional n-grams as (gram, position).
    '''
    if not t:
        return []
    grams = ['^' * j + t[:n - j] for j in range(n - 1, 0, -1)]
    grams += [t[j:j + n] for j in range(len(t) - n + 1)]
    grams += [t[-(n - j):] + '$' * j for j in range(1, n)]
    return [(gram, pos) for pos, gram in enumerate(grams)]
def ngram2bucket(ngrams: Dict[str, int], n_buckets: int = 10) -> Iterable[int]:
    '''Fold n-gram frequencies into `n_buckets` hash buckets.

    Each n-gram is hashed (via `str_hash`) and its frequency is added to
    the bucket at `hash % n_buckets`.

    Parameter:
        ngrams: Dict[str, int] The n-gram frequency: n-gram -> frequency.
        n_buckets: int Number of target buckets.

    Returns:
        List[int]: Frequency of bucket occurrences, one entry per bucket.
    '''
    buckets = [0 for _ in range(n_buckets)]
    for gram, freq in ngrams.items():
        idx = str_hash(gram) % n_buckets
        buckets[idx] += freq
    return buckets
def zorder(xs: Iterable[int], sig_bits: int = None) -> int:
    '''Compute the z-order of a list of ints by bit interleaving.

    The most significant bits of all ints come first and the least
    significant bits come last. For example [3, 2, 1, 3] has bits
    [0b11, 0b10, 0b01, 0b11]; their z-order is 0b11011011:

        bin(zorder([3, 2, 1, 3])) -> '0b11011011' = 219

    Parameter:
        xs: Iterable[int] Some list / narray / ... of (positive) integer values.
        sig_bits: int The number of significant bits per value. Supplying
                  an appropriate value speeds up computation; if any value
                  does not fit, the width is re-derived from the data.

    Returns:
        int: z-order of the given list.
    '''
    if sig_bits is None:
        sig_bits = max(xs).bit_length()
    limit = 1 << sig_bits
    result = 0
    out_pos = len(xs) * sig_bits - 1  # next output bit, from the top down
    for bit_idx in reversed(range(sig_bits)):
        for value in xs:
            if value >= limit:
                # sig_bits was too small for this value: restart with the
                # width derived from the input itself.
                return zorder(xs, sig_bits=None)
            result |= ((value >> bit_idx) & 1) << out_pos
            out_pos -= 1
    return result
def gram_counting_order(t: str, n: int = 2, n_buckets: int = 10, sig_bits: int = None):
    '''Compute the Gram Counting Order from paper (doi: 10.1145/1807167.1807266).

    Equivalent (on the slugified input) to:

        zorder(ngram2bucket(ngrams(t, n=n), n_buckets=n_buckets), sig_bits=sig_bits)

    but hashes each n-gram straight into its bucket, skipping the
    intermediate frequency dict.

    Parameter:
        t: str The input string (slugified before hashing).
        n: int The size of the n-grams.
        n_buckets: int The number of buckets to count the n-grams.
        sig_bits: int The num. of significant bits for counts in bucket vectors.

    Returns:
        int: Gram Counting Order of t.
    '''
    def _bucketize(s: str) -> List[int]:
        # Count padded boundary grams and interior grams per hash bucket.
        buckets = [0] * n_buckets
        if s:
            for k in range(1, n):
                buckets[str_hash('^' * (n - k) + s[:k]) % n_buckets] += 1
                buckets[str_hash(s[-k:] + '$' * (n - k)) % n_buckets] += 1
            for i in range(len(s) - n + 1):
                buckets[str_hash(s[i:i + n]) % n_buckets] += 1
        return buckets

    return zorder(_bucketize(slugify(t)), sig_bits=sig_bits)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment