Last active
October 17, 2022 08:25
-
-
Save Heimdell/a06df902a0e0028ea5a25dfbfdb95af7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} | |
{-# HLINT ignore "Redundant multi-way if" #-} | |
{-# LANGUAGE ApplicativeDo #-} | |
{-# LANGUAGE BlockArguments #-} | |
{-# LANGUAGE DerivingStrategies #-} | |
{-# LANGUAGE FlexibleInstances #-} | |
{-# LANGUAGE ImportQualifiedPost #-} | |
{-# LANGUAGE InstanceSigs #-} | |
{-# LANGUAGE LambdaCase #-} | |
{-# LANGUAGE MultiWayIf #-} | |
{-# LANGUAGE NamedFieldPuns #-} | |
{-# LANGUAGE ScopedTypeVariables #-} | |
{-# LANGUAGE DeriveTraversable #-} | |
{- | Implementation of Thompson scanner (regexp). | |
Mostly taken verbatim from | |
https://www.cs.kent.ac.uk/people/staff/sjt/craft2e/regExp.pdf. | |
Higly unefficient in generation of DFA, slightly inefficient in matching | |
(we're calling `<=` on `Set Int` here multiple times per character). | |
-} | |
module Lexer where | |
import Control.Monad.Except (throwError) | |
import Control.Monad.State ( runState, evalState, get, put, State ) | |
import Data.Map (Map) | |
import Data.Map qualified as Map | |
import Data.Maybe (listToMaybe) | |
import Data.Set (Set) | |
import Data.Set qualified as Set | |
import Data.String (IsString) | |
import GHC.Exts (IsString(fromString)) | |
import Debug.Trace (traceShowM) | |
-- | Regular expression. | |
-- | |
data Reg a | |
= Eps -- ^ Empty string | |
| Lit a -- ^ One char | |
| And (Reg a) (Reg a) -- ^ Sequence | |
| Or (Reg a) (Reg a) -- ^ Selection | |
| Star (Reg a) -- ^ Kleene star (0 or more repeats) | |
deriving stock (Eq, Ord, Functor, Foldable) | |
-- | Alphabet - set of all used characters ("literals"). | |
-- | |
literals :: Ord a => Reg a -> Set a | |
literals = foldMap Set.singleton | |
-- | ToString(). | |
-- | |
instance Show (Reg Char) where | |
show :: Reg Char -> String | |
show = \case | |
Eps -> "" | |
Lit c -> [c] | |
And reg reg' -> show reg <> show reg' | |
Or reg reg' -> "(" <> show reg <> "|" <> show reg' <> ")" | |
Star reg -> "(" <> show reg <> ")*" | |
-- | FromString(). | |
-- | |
instance IsString (Reg Char) where | |
fromString :: String -> Reg Char | |
fromString = foldr (And . Lit) Eps | |
-- | Indetermistic finite automaton. | |
-- | |
data NFA c s = NFA | |
{ nfaStates :: Set s -- ^ All automaton states | |
, nfaMoves :: Set (Move c s) -- ^ All automaton moves | |
, nfaStart :: s -- ^ Starting (current) state | |
, nfaEnds :: Set s -- ^ All ending states | |
} | |
deriving stock (Eq, Ord, Show, Foldable) | |
-- | Automaton moves. | |
-- | |
data Move c s | |
= Move s c s -- ^ Move on given literal (character) | |
| EMove s s -- ^ Non-labeled transition (thus nondeterministic) | |
deriving stock (Eq, Ord, Show, Functor, Foldable, Traversable) | |
-- | Closure of the first state for given alphabet. | |
-- | |
trans :: (Ord s, Eq c) => NFA c s -> Set c -> Set s | |
trans nfa = foldr (oneTrans nfa) startset | |
where | |
startset = closure nfa (Set.singleton (nfaStart nfa)) | |
-- | Closure on the state that happens if we accept this char in this state set. | |
-- | |
oneTrans :: (Ord s, Eq c) => NFA c s -> c -> Set s -> Set s | |
oneTrans nfa c s = closure nfa (oneMove nfa c s) | |
-- | Closure. | |
-- | |
-- NFA closure is a set of states, that is built from smaller /kernel/ set of | |
-- states. It is a set of states that is reachable with `EMove` transitions. | |
-- | |
-- Mechanically, we contract all the states, `EMove`-connected to the /kernel/ | |
-- into one set. | |
-- | |
closure :: Ord s => NFA c s -> Set s -> Set s | |
closure NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} = | |
limit \set -> Set.union set $ Set.fromList | |
[ s | |
| x <- Set.toList set -- for each kernel state, | |
, EMove y s <- Set.toList nfaMoves -- select `EMove`-s, | |
, y == x -- connected to that kernel state | |
] | |
-- | Continiously apply function to the argument until it stops changing it. | |
-- | |
limit :: Eq a => (a -> a) -> a -> a | |
limit f a = | |
let b = f a | |
in | |
if a == b | |
then a | |
else limit f b | |
-- | Get a set of states that we reach from closure if we accept given literal. | |
-- | |
oneMove :: (Ord s, Eq c) => NFA c s -> c -> Set s -> Set s | |
oneMove NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} c x = | |
Set.fromList | |
[ s | |
| t <- Set.toList x -- each closure state | |
, Move z d s <- Set.toList nfaMoves -- find all moves | |
, z == t -- originating from that state | |
, d == c -- and accepting given symbol | |
] | |
-- | Convert regular expresion into `NFA`. We use `Int`-s as state names. | |
-- | |
build :: Ord c => Reg c -> NFA c Int | |
build = \case | |
Eps -> NFA | |
{ nfaStates = Set.fromList [0, 1] -- begin, end | |
, nfaMoves = Set.singleton (EMove 0 1) | |
, nfaStart = 0 | |
, nfaEnds = Set.singleton 1 | |
} | |
Lit c -> NFA | |
{ nfaStates = Set.fromList [0, 1] | |
, nfaStart = 0 | |
, nfaMoves = Set.singleton (Move 0 c 1) -- one move on given literal | |
, nfaEnds = Set.singleton 1 | |
} | |
And reg reg' -> nfaAnd (build reg) (build reg') -- join graphs sequentally | |
Or reg reg' -> nfaOr (build reg) (build reg') -- join graphs alongside | |
Star reg -> nfaStar (build reg) -- make a loop | |
where | |
-- We add 2 states: starting and finishing, and connect both subgraphs to | |
-- them. | |
-- | |
-- States are renamed such that 0 would not belong to any of them, and their | |
-- nodes do not intersect. The number line looks like that: | |
-- | |
-- 0 [nfa1] [nfa2] (m1 + m2 + 1) | |
-- | |
-- where m1, m2 are the sizes of nfa1 and nfa2. | |
-- | |
nfaOr :: Ord c => NFA c Int -> NFA c Int -> NFA c Int | |
nfaOr nfa1 nfa2 = NFA | |
{ nfaStart = 0 | |
, nfaStates = Set.unions [states1, states2, newStates] | |
, nfaMoves = Set.unions [moves1, moves2, newMoves] | |
, nfaEnds = Set.singleton (m1 + m2 + 1) | |
} | |
where | |
m1 = Set.size (nfaStates nfa1) | |
m2 = Set.size (nfaStates nfa2) | |
states1 = Set.map (+ 1) (nfaStates nfa1) -- shift right 1 point | |
states2 = Set.map (+ (m1 + 1)) (nfaStates nfa2) -- shr (1 + nfa1) pts | |
newStates = Set.fromList [0, m1 + m2 + 1] | |
moves1 = Set.map (fmap (+ 1)) (nfaMoves nfa1) | |
moves2 = Set.map (fmap (+ (m1 + 1))) (nfaMoves nfa2) | |
newMoves = Set.fromList | |
[ | |
EMove 0 1 -- start -> nfa1 | |
, EMove 0 (m1 + 1) -- start -> nfa2 | |
, EMove m1 (m1 + m2 + 1) -- nfa1 -> finish | |
, EMove (m1 + m2) (m1 + m2 + 1) -- nfa2 -> finish | |
] | |
-- No states are added, we connect graphs in sequence. | |
-- | |
-- States are renamed such that the starting node of @nfa2@ is the finishing | |
-- node of @nfa1@. | |
-- | |
-- Number line: | |
-- | |
-- [nfa1... joint] | |
-- [joint ...nfa2] | |
-- | |
-- We also add no new moves, because graphs become correctly connected | |
-- naturally. | |
-- | |
nfaAnd :: Ord c => NFA c Int -> NFA c Int -> NFA c Int | |
nfaAnd nfa1 nfa2 = NFA | |
{ nfaStart = 0 | |
, nfaStates = states1 `Set.union` states2 | |
, nfaMoves = moves1 `Set.union` moves2 | |
, nfaEnds = Set.singleton (m1 + m2 - 2) | |
} | |
where | |
m1 = Set.size (nfaStates nfa1) | |
m2 = Set.size (nfaStates nfa2) | |
states1 = nfaStates nfa1 | |
states2 = Set.map (+ (m1 - 1)) (nfaStates nfa2) -- shr (1 - nfa1) pts | |
-- ... so they can join | |
moves1 = nfaMoves nfa1 | |
moves2 = Set.map (fmap (+ (m1 - 1))) (nfaMoves nfa2) | |
-- We add startin anf final point, and wire moves such that graph becomes | |
-- a loop around nfa1. | |
-- | |
-- Number line | |
-- | |
-- 0 [nfa1] (m1 + 1) | |
-- | |
nfaStar :: Ord c => NFA c Int -> NFA c Int | |
nfaStar nfa1 = NFA | |
{ nfaStart = 0 | |
, nfaStates = states1 `Set.union` newStates | |
, nfaMoves = moves1 `Set.union` newMoves | |
, nfaEnds = Set.singleton (m1 + 1) | |
} | |
where | |
m1 = Set.size (nfaStates nfa1) | |
newStates = Set.fromList [0, m1 + 1] | |
states1 = Set.map (+ 1) (nfaStates nfa1) | |
moves1 = Set.map (fmap (+ 1)) (nfaMoves nfa1) | |
newMoves = Set.fromList | |
[ EMove 0 1 -- start -> nfa1 | |
, EMove 0 (m1 + 1) -- start -> finish (remove to get Kleene plus) | |
, EMove m1 1 -- nfa1 -> more nfa1 | |
, EMove m1 (m1 + 1) -- nfa1 -> finish | |
] | |
-- | Remove (retract) all empty moves (EMoves). | |
-- | |
-- Naturally, we use state sets instead of states, because all EMove-connected | |
-- states are indistinguishable, as nothing prevents you from using them | |
-- at any time. | |
-- | |
-- So we pull all the states, connected to the kernels into one set, which | |
-- counts as state for DFA. | |
-- | |
-- NFA (Set State) == DFA, basically. | |
-- | |
-- We can view DFA states as EMove-connected groups of NFA states. | |
-- | |
-- Is done by running `addStep` until no new info is gained. | |
-- | |
determine | |
:: Ord c | |
=> NFA c Int -- ^ Automata | |
-> Set c -- ^ Alphabet | |
-> NFA c (Set Int) | |
determine nfa@NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} alph = | |
limit (addStep nfa alph) start | |
where | |
start = NFA | |
{ nfaStart = startState | |
, nfaStates = Set.singleton startState | |
, nfaMoves = Set.empty | |
, nfaEnds = if | |
| nfaEnds `Set.disjoint` startState -> Set.empty | |
| otherwise -> Set.singleton startState | |
} | |
startState = closure nfa (Set.singleton nfaStart) | |
-- | Add all possible transitions of NFA into DFA. | |
-- | |
addStep | |
:: forall c | |
. Ord c | |
=> NFA c Int -- ^ NFA | |
-> Set c -- ^ Alphabet | |
-> NFA c (Set Int) -- ^ DFA | |
-> NFA c (Set Int) | |
addStep nfa alph dfa = foldr add dfa (Set.toList (nfaStates dfa)) | |
where | |
add :: Set Int -> NFA c (Set Int) -> NFA c (Set Int) | |
add s dfa = Set.foldr (addMove nfa s) dfa alph | |
-- | Add all NFA transitions on given literal (character) to the DFA. | |
-- | |
addMove | |
:: Ord c | |
=> NFA c Int -- ^ NFA | |
-> Set Int -- ^ Current set of states == Current DFA state | |
-> c -- ^ Literal (character) | |
-> NFA c (Set Int) -- ^ DFA | |
-> NFA c (Set Int) | |
addMove | |
nfa@NFA { nfaEnds = term } | |
x | |
c | |
dfa@NFA { nfaStart, nfaMoves, nfaEnds, nfaStates} | |
= | |
NFA | |
{ nfaStart | |
, nfaMoves = moves' | |
, nfaStates = states' | |
, nfaEnds = ends' | |
} | |
where | |
states' = nfaStates `Set.union` Set.singleton new | |
moves' = nfaMoves `Set.union` Set.singleton (Move x c new) | |
ends' -- andd `new` to the end states, if intersects with NFA's end states | |
| term `Set.disjoint` new = nfaEnds | |
| otherwise = nfaEnds `Set.union` Set.singleton new | |
new = oneTrans nfa c x -- calculate state on `Move ... c ...` transition | |
-- | For some arcane reasons, my automatas are polluted by additional DFA state. | |
-- | |
-- This DFA state is an empty set of NFA states and has all possible | |
-- connections with all other graph nodes. | |
-- | |
-- For time being, instead of wrapping `closure`/`oneTrans` with `Maybe` | |
-- to cut off creation of null-state, I decided to filter it out afterwards. | |
-- | |
-- TODO: prevent that from happening instead. | |
-- | |
-- We remove null-set from states, and purge all moves from and to it. | |
-- | |
filterNFA :: Ord s => NFA c (Set s) -> NFA c (Set s) | |
filterNFA NFA { nfaStart, nfaEnds, nfaMoves, nfaStates } = | |
NFA | |
{ nfaStart | |
, nfaEnds | |
, nfaMoves = Set.filter (not . any Set.null) nfaMoves | |
, nfaStates = Set.filter (not . Set.null) nfaStates | |
} | |
rename :: Ord c => NFA c (Set Int) -> NFA c Int | |
rename nfa = evalState (traverseNFA renumber nfa) (Map.empty, 0) | |
where | |
renumber :: Set Int -> State (Map (Set Int) Int, Int) Int | |
renumber set = do | |
(m, ptr) <- get | |
case Map.lookup set m of | |
Nothing -> do | |
put (Map.insert set ptr m, ptr + 1) | |
return ptr | |
Just any -> do | |
return any | |
traverseNFA :: (Applicative f, Ord t, Ord c) => (s -> f t) -> NFA c s -> f (NFA c t) | |
traverseNFA f NFA {nfaStart, nfaStates, nfaMoves, nfaEnds} = do | |
nfaStart <- f nfaStart | |
nfaEnds <- Set.fromList <$> traverse f (Set.toList nfaEnds) | |
nfaMoves <- Set.fromList <$> (traverse.traverse) f (Set.toList nfaMoves) | |
nfaStates <- Set.fromList <$> traverse f (Set.toList nfaStates) | |
return NFA {nfaStart, nfaStates, nfaMoves, nfaEnds} | |
-- | Run DFA on a string to check if string fits the rules. | |
-- | |
match | |
:: (Ord s, Ord c) => NFA c s -- ^ DFA | |
-> [c] -- ^ Input | |
-> Either (Set c, [c]) () | |
match dfa = \case | |
[] -- end of string should councide with automata bein in final state | |
| inEndState dfa -> return () | |
| otherwise -> throwError (allCharsExpectedHere dfa, []) | |
(c : s) -> do | |
dfa' <- move dfa c s | |
match dfa' s | |
-- | Consume character, move to new state. | |
-- | |
move | |
:: (Eq s, Ord c) | |
=> NFA c s -- ^ DFA | |
-> c -- ^ Next character | |
-> [c] -- ^ Rest of the string (for error reports) | |
-> Either (Set c, [c]) (NFA c s) -- ^ Error or new automata | |
move dfa c s = do | |
case | |
[ dfa {nfaStart = d} -- I reuse starting state as current | |
| Move s c' d <- Set.toList (nfaMoves dfa) -- all moves | |
, s == nfaStart dfa -- from current one | |
, c == c' -- on that char | |
] | |
of | |
[] -> throwError (allCharsExpectedHere dfa, c : s) -- no moves, die | |
nfa : _ -> return nfa -- no more than one move should exist | |
-- | Find all characters that are exprected in current state. | |
-- | |
allCharsExpectedHere :: (Eq s, Ord c) => NFA c s -> Set c | |
allCharsExpectedHere dfa = Set.fromList | |
[ c | |
| Move s c _ <- Set.toList (nfaMoves dfa) | |
, s == nfaStart dfa | |
] | |
-- | Check if DFA is in its final state. | |
-- | |
inEndState :: Ord s => NFA c s -> Bool | |
inEndState dfa = Set.member (nfaStart dfa) (nfaEnds dfa) | |
-- | Convert regiular expression to DFA. | |
-- | |
dfa :: Reg Char -> NFA Char (Set Int) | |
dfa reg = filterNFA $ determine (build reg) (literals reg) | |
-------------------------------------------------------------------------------- | |
auto :: NFA Char (Set Int) | |
auto = dfa $ Star (fromString "hello " `Or` fromString "hi ") | |
nfa :: NFA Char Int | |
nfa = build $ Star (fromString "hello " `Or` fromString "hi ") | |
testOk :: Either (Set Char, String) () | |
testOk = match auto "hi hi hello hello hi " | |
testErr :: Either (Set Char, String) () | |
testErr = match auto "hi hi hell hi hello hi" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment