Created
November 22, 2022 23:01
-
-
Save pchiusano/fb8c33b09d1773cbdbc42538e9a887ca to your computer and use it in GitHub Desktop.
Suffix tree prototyping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- id is the doc id, a is the text type, v is the value type | |
unique type Trie id txt | |
= Remainder txt (Trie.Pos id) | |
| Branch Nat (Dataset Value (Trie.Pos id)) (Map txt (Value (Trie id txt))) | |
--Remainder id txt Nat | |
-- there can be multiple docs at this suffix | |
-- Branch size | |
--| Branch Nat (Dataset Value id) (Map txt (Value (Trie id txt))) | |
unique type Trie.Pos id = Pos id Nat | |
Trie.Pos.position : Pos id -> Nat | |
Trie.Pos.position = cases Pos _ i -> i | |
Trie.Pos.id : Pos id -> id | |
Trie.Pos.id = cases Pos id _ -> id | |
Trie.size : Trie id txt -> Nat | |
Trie.size = cases | |
Remainder _ _ -> 1 | |
Branch n _ _ -> n | |
Trie.debug.show : Trie Text Text ->{Remote} Text | |
Trie.debug.show = | |
use Text ++ | |
go indent = cases | |
Remainder txt (Pos doc pos) -> indent ++ "Remainder " ++ doc ++ " \"" ++ txt ++ "\" " ++ Nat.toText pos | |
Branch n ds subs -> | |
subsL = Map.toList subs | |
showPos = cases Pos id position -> "pos " ++ id ++ " " ++ (Nat.toText position) | |
showDs ds = match Storage.ram.value '(Dataset.toList ds) with | |
[] -> "[]" | |
ds -> "[" ++ (List.map showPos ds |> Text.join ", ") ++ "]" | |
showSub = cases (subChar, sub) -> | |
indent ++ " \"" ++ subChar ++ "\"\n" ++ go (indent ++ " ") (Value.get sub) | |
showSubs = Text.join "\n" (List.map showSub subsL) | |
indent ++ "Branch (size=" ++ Nat.toText n ++ ") (here=" ++ (showDs ds) ++ ")\n" | |
++ showSubs | |
-- Branch 14 7 | |
-- a | |
-- Branch | |
-- b | |
go "" | |
-- Todo: merge could be parallel, much like fromSuffixes, with the same thresholding | |
Dataset.thresholdSave : Nat -> Dataset d a ->{Storage d} Dataset d a | |
Dataset.thresholdSave maxUnsaved d = | |
if Dataset.unsavedDepth d >= maxUnsaved then Dataset.save d | |
else d | |
Trie.merge : Rope.Chunk txt -> (forall a . a ->{Remote} Value a) -> Trie id txt -> Trie id txt ->{Remote} Trie id txt | |
Trie.merge T save = cases | |
Remainder txt pos, t -> Trie.insert T save txt pos t | |
t, Remainder txt pos -> Trie.insert T save txt pos t | |
Branch sz1 here1 subs1, Branch sz2 here2 subs2 -> | |
sz = sz1 + sz2 | |
here = Storage.saveWith save '(Dataset.thresholdSave 5 (Dataset.union here1 here2)) | |
combine vt1 vt2 = save (Trie.merge T save (Value.get vt1) (Value.get vt2)) | |
Branch sz here (Map.unionWith combine subs1 subs2) | |
Trie.insert : Rope.Chunk txt -> (forall a . a ->{Remote} Value a) -> txt -> Pos id -> Trie id txt ->{Remote} Trie id txt | |
Trie.insert T save txt pos = cases | |
Remainder txt2 pos2 -> match (Chunk.uncons T txt, Chunk.uncons T txt2) with | |
(Some (hd1,tl1), Some (hd2,tl2)) -> | |
rem1 = Remainder tl1 pos | |
rem2 = Remainder tl2 pos2 | |
if hd1 === hd2 then | |
Branch 2 Dataset.empty (Map.fromList [(hd1, save (Trie.merge T save rem1 rem2 ))]) | |
else | |
Branch 2 Dataset.empty (Map.fromList [(hd1, save rem1), (hd2, save rem2)]) | |
(None, Some (hd2, tl2)) -> | |
Branch 2 (saveWith save '(Dataset.fromList [pos])) (Map.fromList [(hd2, save (Remainder tl2 pos2))]) | |
(Some (hd, t1), None) -> | |
Branch 2 (saveWith save '(Dataset.fromList [pos2])) (Map.fromList [(hd, save (Remainder t1 pos))]) | |
(None, None) -> Branch 2 (saveWith save '(Dataset.fromList [pos, pos2])) Map.empty | |
Branch sz here subs -> match Chunk.uncons T txt with | |
None -> Branch (sz + 1) (saveWith save '(Dataset.insert pos here)) subs | |
Some (hd, tl) -> | |
f = cases | |
None -> Some (save (Remainder tl pos)) | |
Some t -> Some (save (Trie.insert T save tl pos (Value.get t))) | |
Branch (sz + 1) here (Map.alter f hd subs) | |
-- Remainder | |
-- doc1 - "zabra." | |
-- doc2 - "zabracadabra." | |
-- "." is a unique suffix in doc1 and in doc2, so it will become a Remainder | |
-- | |
> Remote.pure.run do | |
t1 = Trie.fromSuffixes Chunk.Text Value.pure 100 "doc1" "ab" | |
t2 = Trie.fromSuffixes Chunk.Text Value.pure 100 "doc2" "b" | |
msg = Trie.merge Chunk.Text Value.pure t1 t2 |> Trie.debug.show | |
trace ("\n" ++ msg) () | |
-- todo: ensure that Trie.fromSuffixes never produces a Remainder with an empty remainder | |
up.List.filterNot pred = List.filter (x -> not (pred x)) | |
Trie.empty = Branch 0 Dataset.empty Map.empty | |
Trie.fromSuffixes : Rope.Chunk txt | |
-> (forall a . a ->{Remote} Value a) | |
-> Nat | |
-> id | |
-> txt | |
->{Remote} Trie id txt | |
Trie.fromSuffixes T save parThreshold id txt = | |
txtSize = Rope.Chunk.size T txt | |
go : List.Nonempty (txt,Nat) ->{Remote} Value (Trie id txt) | |
go = cases | |
Nonempty (txt,i) [] -> save (Remainder txt (Pos id i)) | |
txts -> | |
subs = List.groupBy (at1 >> Chunk.take T 1) (toList txts) |> Map.toList | |
rec = cases (ch, cs0) -> | |
cs : Nonempty (txt, Nat) | |
cs = Nonempty.map (cases (suffix, i) -> (Chunk.drop T 1 suffix, i)) cs0 | |
if Nonempty.size cs > parThreshold then | |
(ch, Left (Remote.forkAt here! '(go cs))) | |
else | |
(ch, Right (go cs)) | |
ts = List.map rec subs |> List.map cases | |
(ch, Left t) -> (ch, Remote.await t) | |
(ch, Right t) -> (ch, t) | |
save (Branch (List.Nonempty.size txts) Dataset.empty (Map.fromList ts)) | |
nonempty = List.filter (not << Rope.Chunk.isEmpty T) | |
match Rope.Chunk.suffixes T txt |> nonempty |> List.indexed with | |
[] -> Trie.empty | |
h +: t -> Value.get (go (Nonempty h t)) | |
Trie.subtrie : Rope.Chunk txt -> txt -> Trie id txt ->{Remote} Trie id txt | |
Trie.subtrie T txt t = | |
match Chunk.uncons T txt with | |
None -> t | |
Some (hd, txt) -> match t with | |
Branch _ _ children -> | |
match Map.lookup hd children with | |
None -> Trie.empty | |
Some t -> Trie.subtrie T txt (Value.get t) | |
_ -> Trie.empty |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment