Created
July 5, 2022 02:15
-
-
Save RomanHargrave/5328db7bdcd7ff0f78fe9402f44c0831 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use crate::lang::Atom; | |
use nom::character::complete::{ | |
char, | |
multispace1, | |
}; | |
use nom::sequence::{delimited, preceded, terminated}; | |
use nom::combinator::{ | |
map, | |
map_res, | |
map_opt, | |
verify, | |
value | |
}; | |
use nom::branch::alt; | |
use nom::bytes::complete::{ | |
is_not, | |
take_while_m_n | |
}; | |
use nom::error::{ | |
FromExternalError, | |
ParseError | |
}; | |
use nom::IResult; | |
use nom::multi::fold_many0; | |
/// Build a parser that consumes one specific character and emits an arbitrary value.
///
/// This is the [nom::combinator::value] combinator applied to
/// [nom::character::complete::char]. The expansion names both through the `nom::` path, so
/// `nom` must be resolvable at the call site.
///
/// The snippets below are illustrative fragments rather than complete programs, hence `ignore`.
///
/// The application
/// ```ignore
/// char_val!('a' => '\u{07}');
/// ```
///
/// is directly equivalent to
/// ```ignore
/// nom::combinator::value('\u{07}', nom::character::complete::char('a'));
/// ```
#[macro_export]
macro_rules! char_val {
    // `$from` is the character literal to match; `$to` is the value produced on a match.
    ($from:literal => $to:expr) => {
        nom::combinator::value(
            $to,
            nom::character::complete::char($from)
        )
    }
}
// Much of this is derived from the escaped string example packaged with Nom, in part because my
// brain was not working at the time. The basic idea here is to break the body of a string into
// fragments based on whether the parser that consumes a given span of the string body produces
// one, many, or no characters (Char, String, and Void respectively) and then recombine them
// appropriately. Effectively, this will take an input string with escapes and produce an owned
// string representing the logical value of the input string body.
/// R7RS §6.7. hexadecimal escape sequence parser, invoked following the escape character. The | |
/// parser will consume text matching the expression `x[0-9A-F]{1,8}`. | |
pub fn hex_scalar_seq<'s, E>(i: &'s str) -> IResult<&'s str, char, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
// summary: extract the hexadecimal sequence between 'x' and ';', convert it to a 32-bit | |
// integer, and convert that integer to a character. | |
map_opt( | |
map_res( | |
// consume x..; and return text between delimiters (x, ;) | |
preceded( | |
char('x'), | |
take_while_m_n(1, 8, |c: char| c.is_ascii_hexdigit()), | |
), | |
// convert hexadecimal sequence to 32-bit integer | |
|r| u32::from_str_radix(r, 16) | |
), | |
// convert integer to character | |
char::from_u32 | |
)(i) | |
} | |
/// The character that introduces every escape sequence inside a string or identifier body.
const ESCAPE_LEADER: char = '\\';
/// Parse an escape sequence other than whitespace | |
fn escaped_char<'s, E>(i: &'s str) -> IResult<&'s str, char, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
// Expect a sequence starting with \ | |
preceded( | |
char(ESCAPE_LEADER), | |
// Immediately following \, test for acceptable values. Branches are tested in order, so it | |
// may pay to shuffle this around later such that branches are explored in order of most | |
// common first to least common last. | |
alt(( | |
// Escape sequences defined by spec. Do not remove or alter. | |
terminated(hex_scalar_seq, char(';')), // x…; scalar | |
char_val!('a' => '\u{07}'), // Alarm | |
char_val!('b' => '\u{08}'), // Backspace | |
char_val!('t' => '\u{09}'), // Tab | |
char_val!('n' => '\u{0A}'), // Linefeed | |
char_val!('r' => '\u{0D}'), // Return | |
char_val!('"' => '"'), | |
char_val!('\\' => '\\'), | |
char_val!('|' => '|'), | |
)) | |
)(i) | |
} | |
/// Parse escaped whitespace, specifically a backslash followed by any amount of whitespace. | |
fn escaped_whitespace<'s, E>(i: &'s str) -> IResult<&'s str, &'s str, E> | |
where | |
E: ParseError<&'s str> | |
{ | |
// Look for a \ followed by >1 whitespace characters | |
preceded( | |
char(ESCAPE_LEADER), | |
multispace1 | |
)(i) | |
} | |
/// Parse normal string components, given the string delimiter. | |
fn normal_text<'s, E>( | |
delim: char | |
) -> impl FnMut(&'s str) -> IResult<&'s str, &'s str, E> | |
where | |
E: ParseError<&'s str> | |
{ | |
move |i| | |
verify( | |
// Take as many characters as possible until " or \ are reached. | |
is_not([ESCAPE_LEADER, delim].as_slice()), | |
// Convert the result to an error if the above parser consumed 0 characters. | |
|s: &str| !s.is_empty() | |
)(i) | |
} | |
/// Part of a string collected during parsing.
///
/// Each variant records how much output a parsed span of the string body contributes: a run of
/// text, exactly one character, or nothing at all.
#[derive(Debug, Clone)]
enum Fragment<'s> {
    /// A length of text not containing any special sequences, borrowed from the input.
    String(&'s str),
    /// A single character, corresponding to escape sequences that produce a single character.
    Char(char),
    /// Nothing, having no length. Produced for input that is consumed but contributes no output.
    Void
}
/// Produces a function that, with a specified delimiter, consumes part of a string, producing the | |
/// appropriate [Fragment] variant (String, Character, Void) for the first applicable span of text, | |
/// returning the remainder and variant. | |
fn string_fragment<'s, E>( | |
delim: char | |
) -> impl FnMut(&'s str) -> IResult<&'s str, Fragment<'s>, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
move |i| | |
// alt() tests in order. normal_text should occur more frequently than other branches so should | |
// be tested first (preferred), followed by regular character escapes, finally whitespace escapes. | |
alt(( | |
// Parsers producing a string of characters have their result placed into the String variant | |
map(normal_text(delim), Fragment::String), | |
// Likewise, any single character parser results are placed into the Char variant | |
map(escaped_char, Fragment::Char), | |
// Anything parser that "skips" portions of the input string produces the Void variant | |
value(Fragment::Void, escaped_whitespace), | |
))(i) | |
} | |
/// Consume a block of text containing escape sequences. The parser transforms escape sequences at | |
/// parse time, producing an owned string representing the transformed input. | |
fn string_body<'s, E>( | |
delim: char | |
) -> impl FnMut(&'s str) -> IResult<&'s str, String, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
move |i| | |
fold_many0( | |
// Apply the string_fragment parser until it fails | |
string_fragment(delim), | |
String::new, | |
// Every time the string_fragment parser succeeds, the below function is called with its output | |
// and another value. The initial state of the other value is computed by calling the parameter | |
// prior to this function, which in this case is String::new. Notably, we are going to take the | |
// initial value as mutable, which is not /terribly/ common in other applications of this | |
// pattern, but allows for construction of the string without reallocating a new string on each | |
// application (though the string will still have two grow :shrug:). | |
|mut str, frag| | |
match frag { | |
// When a string fragment is encountered, append the entire string to this string. This | |
// is the case applicable to the normal_text parser. | |
Fragment::String(s) => { | |
str.push_str(s); | |
str | |
} | |
// When a character fragment is encountered, append the character. Applicable to | |
// character escapes. | |
Fragment::Char(c) => { | |
str.push(c); | |
str | |
} | |
// The void fragment will not result in any changes to the "accumulator" string | |
Fragment::Void => str | |
} | |
)(i) | |
} | |
/// Parse a complete string residing between two delimiters. | |
pub fn delimited_string<'s, E>( | |
delimiter: char | |
) -> impl FnMut(&'s str) -> IResult<&'s str, String, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
move |i| | |
// Parse the string and yield an owned String | |
delimited( | |
// Expect to see " | |
char(delimiter), | |
// Followed by the string body and any escaped chars, etc... | |
string_body(delimiter), | |
// Expect the closing " | |
char(delimiter) | |
)(i) | |
} | |
/// Consume a "-delimited span of text and emit an [Atom::String] owning its computed value, or | |
/// an error if the string is not valid. | |
pub fn string<'s, E>(i: &'s str) -> IResult<&'s str, Atom<'s>, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
map( | |
delimited_string('"'), | |
// Move the computed string body to an Atom::String variant | |
Atom::String | |
)(i) | |
} | |
/// Consume a |-delimited string, which is used for complex identifiers | |
pub fn long_identifier<'s, E>(i: &'s str) -> IResult<&'s str, Atom<'s>, E> | |
where | |
E: ParseError<&'s str> + FromExternalError<&'s str, std::num::ParseIntError> | |
{ | |
map( | |
delimited_string('|'), | |
Atom::String | |
)(i) | |
} | |
/// Unit tests for the string and identifier parsers.
#[cfg(test)]
mod test {
    use super::*;
    use nom::error::ErrorKind;

    /// Minimal nom error type used throughout the tests: the failing input and the error kind.
    type WantError<'s> = (&'s str, ErrorKind);

    /// Test that the hex scalar parser accepts 1 to 8 hex digits in either case, leaves the `;`
    /// terminator alone, and rejects values that are not Unicode scalar values.
    #[test]
    fn parses_scalar_seq() {
        let parse =
            hex_scalar_seq::<WantError>;

        assert_eq!(parse("xAE"), Ok(("", '\u{AE}')),
                   "Parses one-byte scalar");
        assert_eq!(parse("xae"), Ok(("", '\u{AE}')),
                   "Parser is not case-sensitive");
        assert_eq!(parse("xAe"), Ok(("", '\u{AE}')),
                   "Parser is not case-sensitive");

        // we do not want the parser to consume ; as it is not applicable to its use in the
        // character literal parser
        assert_eq!(parse("xAE;"), Ok((";", '\u{AE}')),
                   "Parser should not parse string/ident hex scalar terminator");

        parse("x")
            .expect_err("Parser should not parse hex scalar seq with 0 bits");

        assert_eq!(parse("x000000AE"), Ok(("", '\u{AE}')),
                   "Parser should parse hex scalar seq up to 32 bits");
        assert_eq!(parse("x00000000AE"), Ok(("AE", '\0')),
                   "Parser should not consume more than 32-bits worth of hex chars");

        // surrogate code points are not Unicode scalar values and cannot become a char
        parse("xD800")
            .expect_err("Parser should reject values that are not Unicode scalar values");
    }

    /// Test that the whitespace escape parser consumes any quantity of whitespace and halts at the
    /// first non-whitespace character.
    #[test]
    fn parses_escaped_whitespace() {
        let parse =
            escaped_whitespace::<WantError>;

        assert_eq!(parse(r"\ "), Ok(("", " ")),
                   "Parser should consume whitespace");
        assert_eq!(parse(r"\   "), Ok(("", "   ")),
                   "Parser should consume any amount of contiguous whitespace");
        assert_eq!(parse("\\ \n "), Ok(("", " \n ")),
                   "Parser should consume any whitespace, including linefeed");
        assert_eq!(parse(r"\ a "), Ok(("a ", " ")),
                   "Parser should halt where contiguous whitespace breaks");
        assert_eq!(parse(r"\   a"), Ok(("a", "   ")),
                   "Parser should halt where contiguous whitespace breaks");

        parse(r"\a")
            .expect_err("Parser should require at least one whitespace character");
    }

    /// Test that the normal text parser consumes any quantity of normal text and stops at the
    /// first backslash or quote. Also test that the parser produces an error when asked to parse
    /// text beginning with an unexpected special character.
    #[test]
    fn parses_normal_text() {
        let mut parse =
            normal_text::<WantError>('"');

        // the parser should entirely consume an input containing no escape leader nor delimiter
        assert_eq!(parse("one two 3."), Ok(("", "one two 3.")),
                   "Parser consumes normal text");

        // the parser should stop at its configured delimiter
        assert_eq!(parse("literally.\""), Ok(("\"", "literally.")),
                   "Parser does not consume past delimiter");
        assert_eq!(normal_text::<WantError>('|')("pop|tarts"), Ok(("|tarts", "pop")),
                   "Parser does not consume past delimiter");

        // the parser should always stop at \
        assert_eq!(parse(r"literally.\"), Ok((r"\", "literally.")),
                   "Parser does not consume escape leader");
        assert_eq!(parse(r"pop\tarts"), Ok((r"\tarts", "pop")),
                   "Parser does not consume escape leader");

        // e.g. unless some other character that might otherwise be a delimiter (|) is this
        // parser's configured delimiter, it should consume it.
        assert_eq!(parse("pop|tarts"), Ok(("", "pop|tarts")),
                   "Parser consumes other delimiters unless told otherwise");

        // the following two cases will not return Ok() because the input begins with a character
        // the parser refuses to consume, which leaves it with an empty match
        parse("\\Strawberry Pop Tarts may be a cheap and inexpensive source of incendiary devices.")
            .expect_err("Consumed text beginning with backslash");
        parse("\"Strawberry Pop Tarts may be a cheap and inexpensive source of incendiary devices.")
            .expect_err("Consumed text beginning with quote");
    }

    /// Assert that each escape sequence is consumed completely and yields the given character.
    macro_rules! expect_char_escape {
        ($($input:literal => $output:literal, $msg:literal,)+) => {
            $( assert_eq!(
                escaped_char::<WantError>($input),
                Ok(("", $output)),
                "{}: unexpected result for input {}", $msg, $input
            ); )+
        }
    }

    /// Test that the character escape sequence parser calls complex escape sequence parsers as
    /// needed and returns the expected character for a given escape sequence as defined in R7RS §6.7.
    #[test]
    fn parses_char_escape() {
        expect_char_escape!(
            r"\xAE;" => '\u{AE}', "Should parse hexadecimal scalar escape seq",
            r"\a"    => '\u{07}', "Should parse alarm escape seq",
            r"\b"    => '\u{08}', "Should parse backspace escape seq",
            r"\t"    => '\u{09}', "Should parse tab escape seq",
            r"\n"    => '\u{0A}', "Should parse linefeed escape seq",
            r"\r"    => '\u{0D}', "Should parse carriage return escape seq",
            "\\\""   => '\u{22}', "Should parse double quote escape seq",
            r"\\"    => '\u{5C}', "Should parse backslash escape seq",
            r"\|"    => '\u{7C}', "Should parse vertical line escape seq",
        );

        // should not parse undefined character escape. if adding new escapes, ensure that this
        // remains undefined (e.g. update it *here* to something not in the escape parser).
        // The escape leader must be present, otherwise the parser fails on the missing backslash
        // and the undefined escape is never actually exercised.
        escaped_char::<WantError>(r"\z")
            .expect_err("Should not parse undefined escape sequence");

        // a hexadecimal escape is only valid when terminated by ;
        escaped_char::<WantError>(r"\xAE")
            .expect_err("Should not parse unterminated hexadecimal scalar escape seq");
    }

    // TODO assert_matches! stabilization would be very, very welcome
    /// Assert that an expression matches a pattern, panicking with the actual value otherwise.
    macro_rules! expect_match {
        ($left:expr, $right:pat_param, $fail_msg:expr) => {
            match $left {
                $right => (),
                thing => panic!(
                    "{}: expected {} but got {:?} instead",
                    $fail_msg, stringify!($right), thing
                )
            }
        }
    }

    /// Test that the fragment parser emits the correct fragment variants for a given input.
    #[test]
    fn emits_fragment_variants() {
        let mut parse =
            string_fragment::<WantError>('"');

        expect_match!(parse("The Pop Tarts ... flames 10-18 inches"), Ok((_, Fragment::String(_))),
                      "Did not produce String fragment for normal text");
        expect_match!(parse(r"\xAE;"), Ok((_, Fragment::Char(_))),
                      "Did not produce Char fragment for character escape");
        expect_match!(parse(r"\ "), Ok((_, Fragment::Void)),
                      "Did not produce Void fragment for escaped whitespace");
        expect_match!(parse("\\ \n "), Ok((_, Fragment::Void)),
                      "Did not produce Void fragment for escaped whitespace containing linefeed");
    }

    /// Test that the string body parser consumes and recombines the input string as expected.
    #[test]
    fn parses_string_body() {
        let mut parse =
            string_body::<WantError>('"');

        assert_eq!(parse(r"\x1FAD0; Pop Tarts"), Ok(("", String::from("\u{1FAD0} Pop Tarts"))),
                   "Failed to transform char fragment followed by string fragment");
        assert_eq!(parse(r"one\ two"), Ok(("", String::from("onetwo"))),
                   "Failed to transform string fragments separated by void fragment");
        assert_eq!(parse(""), Ok(("", String::from(""))),
                   "Failed to transform empty input to empty string");
        assert_eq!(parse(r"\ "), Ok(("", String::from(""))),
                   "Failed to transform single void fragment to empty string");
        assert_eq!(parse(r"\ \n18-10 inches"), Ok(("", String::from("\n18-10 inches"))),
                   "Failed to transform [Void,Char,String] sequence");

        // non-printable chars that may appear in strings unescaped
        // yes, the newlines are supposed to be there
        assert_eq!(parse("\nStrawberry\n"), Ok(("", String::from("\nStrawberry\n"))),
                   "Failed to transform String fragment containing newlines");
        assert_eq!(parse("\t"), Ok(("", String::from("\t"))),
                   "Failed to transform String fragment containing tab");
    }

    /// Test that delimited strings are correctly parsed, including where broken by newlines, etc…
    /// Also test that escaped string delimiters are treated nicely. Note that most tests concerned
    /// with the body parser are in [parses_string_body].
    #[test]
    fn parses_delimited_string() {
        let parse =
            string::<WantError>;

        assert_eq!(
            parse("\"Strawberry Pop Tarts\""),
            Ok(("", Atom::String(String::from("Strawberry Pop Tarts")))),
            "Failed to parse valid delimited string"
        );
        assert_eq!(
            parse("\"\\\"flames 18-10 inches\""),
            Ok(("", Atom::String(String::from("\"flames 18-10 inches")))),
            "Failed to parse valid delimited string with single escaped delimiter"
        );
        assert_eq!(
            parse("\"\\\"flames 18-10 inches\\\" in height\""),
            Ok(("", Atom::String(String::from("\"flames 18-10 inches\" in height")))),
            "Failed to parse valid delimited string with balanced escaped delimiters"
        );
        assert_eq!(
            parse("\"incendiary devices.\n Toasters\""),
            Ok(("", Atom::String(String::from("incendiary devices.\n Toasters")))),
            "Failed to parse valid delimited string containing unescaped newline"
        );

        parse("Pop Tarts may be … incendiary devices")
            .expect_err("Did not yield error when parsing un-delimited string");
        parse("\"Pop Tarts")
            .expect_err("Did not yield error when parsing string without closing delimiter");
    }

    /// Test that a |-delimited identifier is parsed into its body.
    #[test]
    fn parses_delimited_identifier() {
        let parse =
            long_identifier::<WantError>;

        assert_eq!(
            parse("|pop tarts|"),
            Ok(("", Atom::String(String::from("pop tarts")))),
            "Failed to parse delimited identifier"
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment