-
-
Save AmadorMunozBerzosa/3288771aaa199cc73acc7781bff4d49e to your computer and use it in GitHub Desktop.
module Krow.Regex.ActivePatterns | |
open System.Text | |
let (|Regex|_|) (pattern:IRegex) input = | |
if input = null then | |
None | |
else | |
try | |
let match' = RegularExpressions.Regex.Match(input, pattern |> Regex.evaluate) | |
if match'.Success then | |
Some( List.tail [ for groups in match'.Groups -> groups.Value ]) | |
else | |
None | |
with _ -> None | |
let (|Regexs|) (pattern:IRegex) input = | |
if input = null then [] else | |
try | |
let matches = RegularExpressions.Regex.Matches(input, pattern |> Regex.evaluate) | |
[ for match' in matches do (List.tail [ for group in match'.Groups -> group.Value ]) ] | |
with e -> [] |
[<AutoOpen>] | |
module Krow.Regex.Evaluation | |
open Krow.Regex.Types | |
open System.Text | |
module Regex = | |
let escape s = (RegularExpressions.Regex.Escape s).Replace("]", "\]") | |
let unescape (s:string) = (RegularExpressions.Regex.Unescape (s.Replace("\]", "]"))) | |
[<AutoOpen>] | |
module private Helpers = | |
let groupable (regex:IRegex) = | |
match regex with | |
| :? Regex.Sequence | :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| _ -> regex | |
let listGroupable (regex:IRegex) = | |
match regex with | |
| :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| _ -> regex | |
let rec charsetContent charSet = | |
match charSet with | |
| CharSet.OneOf chars -> | |
let string = (new string(chars |> List.toArray)).Replace("/",@"\/") | |
$@"{string}" | |
| CharSet.Range (first,last) -> | |
$@"{first}-{last}" | |
| CharSet.Multiple charSets -> | |
charSets |> List.map charsetContent |> List.reduce (+) | |
let referenceString reference = | |
match reference with | |
| Group.Positional number -> number |> string | |
| Group.Named string -> string | |
|> escape | |
let rec evaluate (regex:IRegex) = | |
match regex with | |
| :? Regex.Literal as literal -> | |
let (Regex.Literal literal) = literal | |
escape literal | |
| :? Regex.Raw as literal -> | |
let (Regex.Raw literal) = literal | |
literal | |
| :? Regex.OneOf as oneOf -> | |
let (Regex.OneOf regexs) = oneOf | |
regexs | |
|> List.map evaluate |> String.concat "|" | |
| :? Regex.Sequence as sequence -> | |
let (Regex.Sequence regexs) = sequence | |
regexs |> List.map (listGroupable >> evaluate) |> String.concat "" | |
| :? Regex.NonCapturing as grouping -> | |
let (Regex.NonCapturing regex) = grouping | |
$@"(?:{evaluate regex})" | |
| :? Mode.WithModes as withModes -> | |
let (Mode.WithModes (modes, regex)) = withModes | |
let modeChar = function | |
| Mode.CaseInsensitive -> "i" | |
| Mode.Multiline -> "m" | |
| Mode.ExplicitCapture -> "n" | |
| Mode.IgnoreUnescapedWhiteSpace -> "x" | |
let modeList = modes |> List.map modeChar |> String.concat "" | |
$@"(?{modeList}:{regex})" | |
| :? Group.Reference as reference -> | |
let string = reference |> referenceString | |
match reference with | |
| Group.Positional _ -> $@"\{string}" | |
| Group.Named _ -> $@"\k<{string}>" | |
| :? Look.Look as look -> | |
match look with | |
| Look.Ahead regex -> $@"(?={regex |> evaluate})" | |
| Look.Behind regex -> $@"(?<={regex |> evaluate})" | |
| :? Look.Negated as look -> | |
let (Look.Negated look) = look | |
match look with | |
| Look.Ahead regex -> $@"(?!{regex |> evaluate})" | |
| Look.Behind regex -> $@"(?<!{regex |> evaluate})" | |
| :? Group.Group as group -> | |
let (Group.Group(group,regex)) = group | |
let regex = regex |> evaluate | |
match group with | |
// Capturing | |
| Group.Capturing -> $@"({regex})" | |
| Group.CapturingWithName name -> $@"(?<{name}>{regex})" | |
// Non capturing | |
| Group.NonBacktrackingGrouping -> $@"(?>{regex})" | |
// Balancing | |
| Group.UnCapturing reference -> | |
$@"(?<-{reference |> referenceString}>{regex})" | |
| Group.Balancing (newName, reference) -> | |
$@"(?<{newName |> escape}-{reference |> referenceString}>{regex})" | |
| :? SpecialChar.SpecialChar as special -> | |
match special with | |
| SpecialChar.WildCard -> @"." | |
| SpecialChar.Bell -> @"\a" | |
| SpecialChar.Backspace -> @"\b" | |
| SpecialChar.Tab -> @"\t" | |
| SpecialChar.VerticalTab -> @"\v" | |
| SpecialChar.CarriageReturn -> @"\r" | |
| SpecialChar.NewLine -> @"\n" | |
| SpecialChar.Escaped -> @"\e" | |
| SpecialChar.Octal oct -> $@"\{oct}" | |
| SpecialChar.Hexadecimal hex -> $@"\x{hex}" | |
| SpecialChar.ASCII ascii -> $@"\u{ascii}" | |
| :? Anchor.Anchor as anchor -> | |
match anchor with | |
| Anchor.Start -> @"\A" | |
| Anchor.StartOfLine -> @"^" | |
| Anchor.End -> @"\z" | |
| Anchor.EndOfLine -> @"$" | |
| Anchor.Boundary -> @"\b" | |
| Anchor.NotBoundary -> @"\B" | |
| Anchor.AfterMatch -> @"\G" | |
| :? CharSet.CharSet as charSet -> | |
$"[{charsetContent charSet}]" | |
| :? CharSet.Negated as negated -> | |
let (CharSet.Negated charSet) = negated | |
$"[^{charsetContent charSet}]" | |
| :? CharClass.CharClass as charClass -> | |
match charClass with | |
| CharClass.InUnicodeBlock block -> $@"\p{{{block}}}" | |
| CharClass.LetterOrDigit -> @"\w" | |
| CharClass.WhitespaceChar -> @"\s" | |
| CharClass.Digit -> @"\d" | |
| :? CharClass.Negated as negated -> | |
let (CharClass.Negated charClass) = negated | |
match charClass with | |
| CharClass.InUnicodeBlock block -> $@"\P{{{block}}}" | |
| CharClass.LetterOrDigit -> @"\W" | |
| CharClass.WhitespaceChar -> @"\S" | |
| CharClass.Digit -> @"\D" | |
| :? Quantity.Quantified as quantified -> | |
match quantified with | |
| Quantity.Greedy (regex,quantity) -> | |
let regex = regex |> groupable |> evaluate | |
match quantity with | |
| Quantity.Exactly amount -> | |
$@"{regex}{{{amount}}}" | |
| Quantity.AtLeast amount -> | |
if amount = 0 then | |
$@"{regex}*" | |
else if amount = 1 then | |
$@"{regex}+" | |
else | |
$@"{regex}{{{amount},}}" | |
| Quantity.Between (min,max) -> | |
if min = 0 && max = 1 then | |
$@"{regex}?" | |
else | |
$@"{regex}{{{min},{max}}}" | |
| Quantity.Lazy (regex,quantity) -> | |
let greedQuantified = Quantity.Greedy(regex,quantity) |> evaluate | |
greedQuantified + "?" | |
| :? Condition.Conditional as conditional -> | |
let evaluateCondition = function | |
| Condition.Regex regex -> regex |> evaluate | |
| Condition.Reference reference -> reference |> referenceString | |
$@"(?({conditional.If |> evaluateCondition}){conditional.Then |> evaluate}|{conditional.Else |> evaluate})" | |
| _ -> failwith "Not supported" |
module Examples | |
open Krow.Regex | |
let bounded (regex:IRegex) = | |
Anchor.Start + regex + Anchor.End | |
let lineBounded (regex:IRegex) = | |
Anchor.StartOfLine + regex + Anchor.EndOfLine | |
let separatedList separator (regex:IRegex) = | |
regex + (separator + regex) * (0,()) | |
module Guid = | |
let hexDigit = CharSet.Range('0', '9') / CharSet.Range('a', 'f') | |
let guid = | |
Regex.Sequence [ | |
hexDigit * 8 + "-" | |
hexDigit * 4 + "-" | |
CharSet.Range('1', '5') + "-" | |
CharSet.OneOf ['8';'9';'a';'b'] | |
hexDigit * 3 + "-" | |
hexDigit * 12 | |
] | |
module Email = | |
let allowedSpecialChars = CharSet.OneOf [ | |
'!';'#';'$';'%';'&';''';'*';'+';'/';'=';'?';'^';'_';'`';'{';'|';'}';'~';'-' | |
] | |
let alphaNumeric = CharSet.Range('a','z') / CharSet.Range('0','9') | |
let alphaNumericOrHyphen = alphaNumeric / "-" | |
module Hex = | |
let group1 = | |
["01";"08";"0B";"0C";"0E";"1F";"21";"23";"5B";"5D";"7F"] | |
|> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
|> Regex.OneOf | |
let group2 = | |
["01";"09";"0B";"0C";"0E";"7F"] | |
|> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
|> Regex.OneOf | |
let part = group1 / ( @"\" + group2) | |
let name = part * (0,()) | |
let quotedName = "\"" + name + "\"" | |
module User = | |
let stringPart = (alphaNumeric / allowedSpecialChars) * (1,()) | |
let stringName = separatedList "." stringPart | |
let name = stringName / Hex.quotedName | |
module Ip = | |
let ipPart = | |
Regex.OneOf [ | |
"25" + CharSet.Range('0','5') | |
"2" + CharSet.Range('0','4') + CharSet.Range('0','9') | |
CharSet.OneOf ['0';'1'] + CharSet.Range('0','9') + CharSet.Range('0','9') | |
] | |
let hexPart = | |
alphaNumericOrHyphen * (0,()) + alphaNumeric + ":" + Hex.name | |
let lastPart = ipPart / hexPart | |
let address = "[" + (ipPart + ".") * 3 + lastPart + "]" | |
module Domain = | |
let part = (alphaNumeric * (1,())) |> separatedList "-" | |
let name = part |> separatedList "." | |
let email = bounded (User.name + "@" + (Ip.address / Domain.name)) |
[<AutoOpen>] | |
module Krow.Regex.Operators | |
open Regex | |
type RegexSequence = RegexSequence with | |
static member (?<-) (RegexSequence, first:Sequence, second:Sequence) = | |
let (Sequence list1) = first | |
let (Sequence list2) = second | |
Sequence(list1 @ list2) | |
static member (?<-) (RegexSequence, first:IRegex, second:Sequence) = | |
(?<-) RegexSequence (Sequence [first]) second | |
static member (?<-) (RegexSequence, first:Sequence, second:IRegex) = | |
(?<-) RegexSequence first (Sequence [second]) | |
static member (?<-) (RegexSequence, first:IRegex, second:IRegex) = | |
(?<-) RegexSequence (Sequence [first]) (Sequence [second]) | |
static member (?<-) (RegexSequence, first:string, second:IRegex) = | |
(?<-) RegexSequence (Sequence [Literal first]) (Sequence [second]) | |
static member (?<-) (RegexSequence, first:IRegex, second:string) = | |
(?<-) RegexSequence (Sequence [first]) (Sequence [Literal second]) | |
static member inline (?<-) (RegexSequence, first, second) = | |
first + second | |
let inline (+) first second : 'R = ( (?<-) RegexSequence first second) | |
type RegexOneOf = RegexOneOf with | |
static member (?<-) (RegexOneOf, first:OneOf, second:OneOf) = | |
let (OneOf list1) = first | |
let (OneOf list2) = second | |
OneOf(list1 @ list2) | |
static member (?<-) (RegexOneOf, first:IRegex, second:OneOf) = | |
(?<-) RegexOneOf (OneOf [first]) second | |
static member (?<-) (RegexOneOf, first:OneOf, second:IRegex) = | |
(?<-) RegexOneOf first (OneOf [second]) | |
static member (?<-) (RegexOneOf, first:IRegex, second:IRegex) = | |
OneOf [first;second] | |
static member (?<-) (RegexOneOf, first:CharSet.CharSet, second:CharSet.CharSet) = | |
match first,second with | |
| CharSet.Multiple charsets1, CharSet.Multiple charsets2 -> | |
CharSet.Multiple (charsets1 @ charsets2) | |
| CharSet.Multiple charsets1, charset2 -> | |
CharSet.Multiple (charsets1 @ [charset2]) | |
| charset1, CharSet.Multiple charsets2 -> | |
CharSet.Multiple (charset1 :: charsets2) | |
| charset1, charset2 -> | |
CharSet.Multiple [charset1;charset2] | |
static member (?<-) (RegexOneOf, CharSet.Negated first, CharSet.Negated second) = | |
CharSet.Negated ((?<-) RegexOneOf first second) | |
static member (?<-) (RegexOneOf, first:string, second:IRegex) = | |
(?<-) RegexOneOf (OneOf [Literal first]) (OneOf [second]) | |
static member (?<-) (RegexOneOf, first:IRegex, second:string) = | |
(?<-) RegexOneOf (OneOf [first]) (OneOf [Literal second]) | |
static member inline (?<-) (RegexOneOf, first, second) = | |
first / second | |
let inline (/) first second : 'R = ( (?<-) RegexOneOf first second) | |
type RegexQuantification = RegexQuantification with | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
Quantity.Greedy(regex, Quantity.Exactly quantity) | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
Quantity.Greedy(regex, Quantity.Between quantity) | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
let quantity, () = quantity | |
Quantity.Greedy(regex, Quantity.AtLeast quantity) | |
static member inline (?<-) (RegexQuantification, first, second) = | |
first * second | |
let inline ( * ) first second : 'R = ( (?<-) RegexQuantification first second) | |
type RegexLazyQuantification = RegexLazyQuantification with | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
Quantity.Lazy(regex, Quantity.Exactly quantity) | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
Quantity.Lazy(regex, Quantity.Between quantity) | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
let quantity, () = quantity | |
Quantity.Lazy(regex, Quantity.AtLeast quantity) | |
static member inline (?<-) (RegexLazyQuantification, first, second) = | |
first *? second | |
let inline ( *? ) first second : 'R = ( (?<-) RegexLazyQuantification first second) | |
type RegexNegation = RegexNegation with | |
static member (?<-) (RegexNegation, charClass:CharClass.CharClass, _) = | |
CharClass.Negated charClass | |
static member (?<-) (RegexNegation, charClass:CharClass.Negated, _) = | |
let (CharClass.Negated charClass) = charClass | |
charClass | |
static member (?<-) (RegexNegation, charClass:CharSet.CharSet, _) = | |
CharSet.Negated charClass | |
static member (?<-) (RegexNegation, charClass:CharSet.Negated, _) = | |
let (CharSet.Negated charClass) = charClass | |
charClass | |
static member (?<-) (RegexNegation, look:Look.Look, _) = | |
Look.Negated look | |
static member (?<-) (RegexNegation, look:Look.Negated, _) = | |
let (Look.Negated look) = look | |
look | |
static member inline (?<-) (RegexNegation, first, _) = | |
!first | |
let inline (!) first : 'R = ( (?<-) RegexNegation first ()) | |
let aaa = !(Look.Ahead (Literal "aa")) |
[<AutoOpen>] | |
module Krow.Regex.Types | |
type IRegex = interface end | |
module Regex = | |
type Literal = | |
| Literal of string | |
interface IRegex | |
type Raw = | |
| Raw of string | |
interface IRegex | |
type OneOf = | |
| OneOf of IRegex list | |
interface IRegex | |
type Sequence = | |
| Sequence of IRegex list interface IRegex | |
type internal NonCapturing = | |
| NonCapturing of IRegex | |
interface IRegex | |
module Mode = | |
type Mode = | |
| CaseInsensitive | |
| Multiline | |
| ExplicitCapture | |
| IgnoreUnescapedWhiteSpace | |
type WithModes = | |
| WithModes of Mode list * IRegex | |
interface IRegex | |
module Look = | |
type Look = | |
| Ahead of IRegex | |
| Behind of IRegex | |
interface IRegex | |
type Negated = | |
|Negated of Look interface IRegex | |
module Group = | |
type Reference = | |
| Positional of int | |
| Named of string | |
interface IRegex | |
type Kind = | |
// Capturing | |
| Capturing | |
| CapturingWithName of string | |
// Non capturing | |
| NonBacktrackingGrouping | |
// Balancing | |
| UnCapturing of Reference // Balancing while omitting first arg | |
| Balancing of string * Reference | |
type Group = | |
| Group of Kind * IRegex | |
interface IRegex | |
module SpecialChar = | |
type SpecialChar = | |
| WildCard | |
| Bell | |
| Backspace | |
| Tab | |
| VerticalTab | |
| CarriageReturn | |
| NewLine | |
| Escaped | |
| Octal of string | |
| Hexadecimal of string | |
| ASCII of string | |
interface IRegex | |
module Anchor = | |
type Anchor = | |
| Start | |
| StartOfLine | |
| End | |
| EndOfLine | |
| Boundary | |
| NotBoundary | |
| AfterMatch | |
interface IRegex | |
module CharSet = | |
type CharSet = | |
| OneOf of char list | |
| Range of char * char | |
| Multiple of CharSet list | |
interface IRegex | |
type Negated = | |
| Negated of CharSet | |
interface IRegex | |
module CharClass = | |
type CharClass = | |
| InUnicodeBlock of string | |
| LetterOrDigit | |
| WhitespaceChar | |
| Digit | |
interface IRegex | |
type Negated = | |
| Negated of CharClass | |
interface IRegex | |
module Quantity = | |
type Quantity = | |
| Exactly of int | |
| AtLeast of int | |
| Between of int * int | |
type Quantified = | |
| Greedy of IRegex * Quantity | |
| Lazy of IRegex * Quantity | |
interface IRegex | |
module Condition = | |
type Condition = | |
| Regex of IRegex | |
| Reference of Group.Reference | |
type Conditional = | |
{ If: Condition; Then: IRegex; Else: IRegex } | |
interface IRegex |
It is working very well.
Except for |||
that doesn't seem to overload like +
does.
Literal "this" ||| Literal "that"
worksLiteral "this" ||| "that"
doesn't work
I'm not sure why
So I changed to '/'. I know is not symmetrical and it doesn't seem commutative. OTOH it confers the meaning well.
For instance: Literal "Err" / "Warn" / "Info"
is very readable.
Also RegularExpressions.Regex.Escape
doesn't escape the character ]
which is a problem when doing OneOf, so I changed it to:
let escape s = (RegularExpressions.Regex.Escape s).Replace("]", "\]")
Here is an example:
let parseErrWarnInfo = """
Err (1, 7) - (1, 12): "This shows over there as an error".
Warn (2, 7) - (2, 12): "This shows over there as a warning".
Info (3, 7) - (3, 12): "This shows over there as information".
"""
let digitsCap = MoreThanOnce Digit |> Capturing
let spaces = ManyTimesOrNone WhitespaceChar
let coords = "(" + digitsCap + "," + spaces + digitsCap + ")"
let notOneOf v = Seq.toList v |> NotOneOf
let errWarnInfo =
Sequence [
Capturing(Literal "Err" / "Warn" / "Info") + " "
coords + " - "
coords + ": "
"\"" + Capturing (notOneOf "\"" |> MoreThanOnceLazily) + "\"."
]
|> evaluate
printfn "Regex: %s" errWarnInfo
// (Err|Warn|Info)\ \((\d+),\s*(\d+)\)\ -\ \((\d+),\s*(\d+)\):\ "([^"]+?)"\.
I also changed the operator for Either
so it preserves the original order. I doesn't make a difference to the regex, but the rearranging was a little disconcerting.
With a couple of active patterns:
let (|Regex|_|) pattern input =
if input = null then None else
try
let m = RegularExpressions.Regex.Match(input, pattern)
if m.Success then Some(List.tail [ for g in m.Groups -> g.Value ])
else None
with e -> None
let (|Regexs|) pattern input =
if input = null then [| |] else
try
let ms = RegularExpressions.Regex.Matches(input, pattern)
[| for m in ms do yield (List.tail [ for g in m.Groups -> g.Value ]) |]
with e -> [| |]
then it can be used very nicely like this:
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
// Matches found:
// [|["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"];
// ["Warn"; "2"; "7"; "2"; "12"; "This shows over there as a warning"];
// ["Info"; "3"; "7"; "3"; "12"; "This shows over there as information"]|]
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
// First Match: ["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"]
Guid:
let hexDigit = InRange('0', '9') / InRange('a', 'f')
let hexDigits n = Exactly(uint32 n, hexDigit)
Sequence [
hexDigits 8 + "-"
hexDigits 4 + "-"
InRange('1', '5')
hexDigits 3 + "-"
oneOf "89ab"
hexDigits 3 + "-"
hexDigits 12
]
|> evaluate
|> printfn "%s"
// (?:[0-9]|[a-f]){8}-(?:[0-9]|[a-f]){4}-[1-5](?:[0-9]|[a-f]){3}-[89ab](?:[0-9]|[a-f]){3}-(?:[0-9]|[a-f]){12}
email:
let allowed = NotOneOfEscaped (escape "<>()[].,;:@" + evaluate WhitespaceChar) |> MoreThanOnce
let listSep sep elems = elems + ManyTimesOrNone (Literal sep + elems)
listSep "." allowed + "@" + listSep "." allowed
|> evaluate
|> printfn "%s"
in this case allowed
needed to include \s
without further escaping. For this case I added:
...
| LiteralRegex of string
| OneOfEscaped of string
| NotOneOfEscaped of string
...
| LiteralRegex rx -> rx
| OneOfEscaped string -> sprintf @"[%s]" string
| NotOneOfEscaped string -> sprintf @"[^%s]" string
...
to allow for cases not contemplated or for composing with regex from other sources.
Here is a version of the evaluator the uses
sprintf
for those in prior versions of F#: