Created
November 23, 2018 21:31
-
-
Save rsirres/7ce41ff2aa926935f2844279b759cbf6 to your computer and use it in GitHub Desktop.
Implementation of levensthein automata in nimlang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://julesjacobs.github.io/2015/06/17/disqus-levenshtein-simple-and-fast.html | |
import sugar | |
import sequtils | |
import tables | |
import strutils | |
import hashes | |
type | |
SparseLevenshteinAutomaton = object | |
max_edits: int | |
s: string | |
State = tuple[indices: seq[int], values: seq[int]] | |
proc hash(x: State): Hash = | |
## Piggyback on the already available string hash proc. | |
## | |
## Without this proc nothing works! | |
result = x.indices.hash !& x.values.hash | |
result = !$result | |
proc start(self: SparseLevenshteinAutomaton):State = | |
return (newSeq[int](self.max_edits), newSeq[int](self.max_edits)) | |
proc step(self: SparseLevenshteinAutomaton, n: State, c:char): State = | |
var new_indices: seq[int] | |
var new_values: seq[int] | |
if len(n.indices) != 0 and n.indices[0] == 0 and n.values[0] < self.max_edits: | |
new_indices = @[0] | |
new_values = @[n.values[0] + 1] | |
else: | |
new_indices = @[] | |
new_values = @[] | |
for j, i in n.indices: | |
if i == len(self.s): break | |
var cost = if self.s[i] == c: 0 else: 1 | |
var val = n.values[j] + cost | |
if len(new_indices) != 0 and new_indices[^1] == i: | |
val = min(val, new_values[^1] + 1) | |
if j+1 < len(n.indices) and n.indices[j+1] == i+1: | |
val = min(val, n.values[j+1] + 1) | |
if val <= self.max_edits: | |
new_indices.add(i+1) | |
new_values.add(val) | |
return (new_indices, new_values) | |
proc is_match(self: SparseLevenshteinAutomaton, n: State): bool = | |
return len(n.indices) != 0 and n.indices[^1] == len(self.s) | |
proc can_match(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]):bool = | |
return len(n.indices) != 0 | |
proc transitions(self: SparseLevenshteinAutomaton, n: tuple[indices: seq[int], values: seq[int]]): seq[char] = | |
return deduplicate( lc[self.s[i] | (i <- n.indices, i < len(self.s)), char] ) | |
var counter = @[0] # list is a hack for mutable lexical scoping | |
var states = initTable[State, int]() | |
var trans:seq[(int, int, char)] = @[] | |
var matching: seq[int] = @[] | |
let lev = SparseLevenshteinAutomaton(s:"woof", max_edits:2) | |
# proc explore(state: State) : int = | |
# let key = state # lists can't be hashed in Python so convert to a tuple | |
# if key in states: return states[key] | |
# var i = counter[0] | |
# counter[0] += 1 | |
# states[key] = i | |
# if lev.is_match(state): matching.add(i) | |
# for c in concat(lev.transitions(state), @['*']): | |
# var newstate = lev.step(state, c) | |
# var j = explore(newstate) | |
# trans.add((i, j, c)) | |
# return i | |
# var i = explore(lev.start()) | |
var state = lev.start() | |
let s = "woofies" | |
for c in s: | |
state = lev.step(state, c) | |
echo lev.can_match(state) # True, "w" can match "bannana" with distance 1 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment