Literate code for a simple lexer in Rust
#![allow(unused_doc_comments)]
#![allow(dead_code)]

/// let's create a lexer to tokenise a simplistic arithmetic language,
/// with integers, plusses and minuses!
///
/// playground: https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=7e0716a957a90c486328c8e775ca268f
///
/// to store the state of our lexer we'll just create a lexer struct:
#[derive(Clone, Debug)]
struct Lexer {
    source: String,
    current: usize,
    start: usize,
    peeked: Option<Result<Token, String>>
}
/// `source` is the source string: the code we want to lex.
/// `current` is a byte index into `source` that always points at the *next*
/// character to be consumed: if we've just consumed "a" in the string "abc",
/// `current` equals 1, not 0.
/// `start` points to the start of the current token. this lets us track the
/// text of a token.
/// `peeked` buffers a peeked token so that we can take it out when we `next`
/// later.
///
/// note that there are other formulations for a lexer, but this one is simple
/// enough to work fine.
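/// as a quick illustration of that index bookkeeping (a sketch of mine, not
/// used by the lexer itself): after consuming 'a' from "abc", `current` sits
/// at 1 and the next character up is 'b'.
fn index_bookkeeping_demo() {
    let source = "abc";
    let mut current = 0;
    let c = source[current..].chars().next().unwrap(); // peek at 'a'
    current += c.len_utf8();                           // consume it
    assert_eq!(current, 1);
    assert_eq!(source[current..].chars().next(), Some('b'));
}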
///
/// now we also need a struct for a token:
#[derive(Clone, Debug, Eq, PartialEq)]
enum TokenKind {
    End,
    Integer(u64),
    Plus,
    Minus
}

#[derive(Clone, Debug)]
struct Token {
    kind: TokenKind,
    lexeme: String
}
/// the `kind` tells us the kind of this token: an integer, a plus '+', or a
/// minus '-'. the lexeme holds the token's text, so if you lex the integer
/// "1234", its lexeme should == "1234". a token can also be an "End", which
/// is just a marker for the end of the source. i could get rid of this and
/// instead use `Option<Token>`, but that is a lot less convenient, so
/// whatever.
///
/// with that in place we can start implementing the lexer:
impl Lexer {
    /// a simple constructor, clones its given source string.
    pub fn new(source: &str) -> Self {
        Self {
            source: source.into(),
            current: 0,
            start: 0,
            peeked: None
        }
    }
    /// (you can opt to not clone, but then you have to store a reference in
    /// your Lexer, and in the Token too, and thus use generic lifetimes.)
    ///
    /// these are some helper functions for dealing with iterating on
    /// `source`:
    ///
    /// peek at the current character, without advancing. note that we can't
    /// index into a string by character in rust, so we slice it at `current`
    /// and then use `.chars()` to get the first char of the slice.
    fn current(&mut self) -> Option<char> {
        self.source[self.current..]
            .chars()
            .next()
    }
    /// returns the current character and advances `current` past it. note
    /// the unicode handling: advancing by 1 instead of the character's
    /// encoded length could make us panic later by slicing inside a
    /// multi-byte UTF-8 character, so we use the proper UTF-8 length.
    fn advance(&mut self) -> Option<char> {
        self.current()
            .map(|c| {
                self.current += c.len_utf8();
                c
            })
    }
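    /// to make the `len_utf8` point concrete, a tiny sketch of mine (not
    /// used by the lexer): characters encode to different byte lengths in
    /// UTF-8, and `current` is a byte index, so a bare `+= 1` could land us
    /// mid-character.
    fn len_utf8_demo() {
        assert_eq!('a'.len_utf8(), 1);  // plain ascii: 1 byte
        assert_eq!('é'.len_utf8(), 2);  // latin-1 supplement: 2 bytes
        assert_eq!('🦀'.len_utf8(), 4); // emoji: 4 bytes
    }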
    /// gets the current lexeme by using `start` and `current`:
    fn lexeme(&self) -> &str {
        &self.source[self.start..self.current]
    }
    /// before diving into the proper scanning method, here's a method that
    /// simply skips over all the whitespace it finds and nothing else. we'll
    /// use this later to do, well, exactly that.
    fn skip_whitespace(&mut self) {
        // go through the string until it ends, peeking without advancing.
        while let Some(c) = self.current() {
            // if this character is whitespace, advance and continue.
            // otherwise, stop, as we don't want to consume any
            // non-whitespace characters.
            if c.is_ascii_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
    }
    /// hopefully that was understandable; now we're off to scan integers:
    fn scan_integer(&mut self) -> Result<Token, String> {
        // if this method is called, we *guarantee* that `start` points to an
        // ascii digit; you can see that later on. so, let's just scan as many
        // digits as we can:
        while let Some(c) = self.current() {
            if c.is_ascii_digit() {
                self.advance();
            } else {
                break;
            }
        }
        // as you can see, this is nearly the same code as for whitespace.
        // this is kinda the General Scanning Procedure (factored out into a
        // helper sketch right after this method):
        //
        //     for each character:
        //         advance if it is what you want
        //         stop if not
        //
        // since we've scanned all the digits we could, let's get the text and
        // parse it into a u64:
        let value = self.lexeme().parse::<u64>();
        // oh fuck, wait. parsing can fail, as our integer may be larger than
        // u64::MAX! let's handle it by mapping parse's error into our own and
        // re-throwing it with `?`:
        let value = value.map_err(|e| {
            // the only error that can happen here is PosOverflow, since we
            // only ever fed `parse` ascii digits, but let's be sure with an
            // assert.
            assert_eq!(e.kind(), &std::num::IntErrorKind::PosOverflow);
            // yay for good error messages!
            format!("integer literal '{}' is too large", self.lexeme())
        })?;
        // now we can emit a token:
        Ok(Token {
            kind: TokenKind::Integer(value),
            lexeme: self.lexeme().into()
        })
    }
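    /// here's that General Scanning Procedure factored into a reusable
    /// helper, as a sketch; the methods above don't call it, but
    /// `skip_whitespace` could be rewritten as
    /// `self.advance_while(|c| c.is_ascii_whitespace())` and the digit loop
    /// as `self.advance_while(|c| c.is_ascii_digit())`.
    fn advance_while(&mut self, pred: impl Fn(char) -> bool) {
        // peek at each character; consume it only if the predicate holds.
        while let Some(c) = self.current() {
            if pred(c) {
                self.advance();
            } else {
                break;
            }
        }
    }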
    /// now that we're done with our helpers, let's implement `next_impl`,
    /// which scans the source and returns one token. given a Lexer with
    /// source: "12 + 1", current: 0, and start: 0, it will scan "12" and be
    /// left at source: "12 + 1", current: 2, and start: 0. this works just
    /// like an iterator over a "virtual" token stream: each call picks up
    /// where the previous one left off, and so on until the end.
    ///
    /// also note the possibility of errors; i'm using String as an error type
    /// for simplicity, but you could make a rich Error struct or whatever.
    ///
    /// this also isn't the method we'll actually use while parsing; it's just
    /// an internal method that does the hard work, and we handle peeking etc.
    /// later.
    fn next_impl(&mut self) -> Result<Token, String> {
        // before 'next'ing we have to get rid of any "ignorable" characters,
        // such as whitespace:
        self.skip_whitespace();
        // set start to current to "sync" the lexeme tracking:
        self.start = self.current;
        // unconditionally get & advance the current character:
        let c = self.advance();
        // the Big Match looks at the current character and emits the
        // appropriate token for it:
        match c {
            // if None, we reached the end
            None =>
                Ok(Token { kind: TokenKind::End, lexeme: self.lexeme().into() }),
            // if 0..=9 this is an integer! start scanning that
            Some('0'..='9') =>
                self.scan_integer(),
            // + is Plus, - is Minus
            Some('+') =>
                Ok(Token { kind: TokenKind::Plus, lexeme: self.lexeme().into() }),
            Some('-') =>
                Ok(Token { kind: TokenKind::Minus, lexeme: self.lexeme().into() }),
            // everything else is an error! (bind the char itself so the
            // message reads '@' rather than Some('@'))
            Some(wtf) => Err(format!("unrecognised character {:?}", wtf))
        }
    }
    /// we're literally done with our lexer proper. now just handling peeking
    /// and nexting!
    pub fn peek(&mut self) -> Result<&Token, &String> {
        // if there is something already peeked, get it; otherwise call
        // next_impl to do the hard work.
        if self.peeked.is_none() {
            self.peeked = Some(self.next_impl());
        }
        // yes this chain is ugly, but rust kinda sucks here. the unwrap is
        // safe because we just made sure `peeked` is Some, and the two
        // `as_ref`s turn our `&Option<Result<Token, String>>` into the
        // `Result<&Token, &String>` we want to return.
        self.peeked.as_ref().unwrap().as_ref()
    }

    pub fn next(&mut self) -> Result<Token, String> {
        // if there is something already peeked, take it (leaving None in its
        // place); otherwise call next_impl to do the hard work.
        self.peeked
            .take()
            .unwrap_or_else(|| self.next_impl())
    }
}
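/// to show why `peek` earns its keep, here's a sketch of a tiny evaluator
/// for our language (`eval` is my own illustration, not part of the lexer;
/// it evaluates strictly left to right, which is fine since there's only
/// + and -). peeking lets us decide whether to keep going without eating a
/// token we can't use; e.g. eval("50 + 60 - 99") == Ok(11).
fn eval(source: &str) -> Result<i64, String> {
    let mut lexer = Lexer::new(source);
    // a sum starts with an integer...
    let mut total = match lexer.next()?.kind {
        TokenKind::Integer(n) => n as i64,
        t => return Err(format!("expected an integer, got {:?}", t))
    };
    // ...followed by any number of (+|-) integer pairs.
    loop {
        // peek at the operator; if it isn't one, stop without consuming.
        let sign = match lexer.peek().map_err(|e| e.clone())?.kind {
            TokenKind::Plus => 1,
            TokenKind::Minus => -1,
            _ => break
        };
        lexer.next()?; // now actually consume the operator we peeked
        match lexer.next()?.kind {
            TokenKind::Integer(n) => total += sign * n as i64,
            t => return Err(format!("expected an integer, got {:?}", t))
        }
    }
    Ok(total)
}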
/// finally, let's take this thing for a spin:
fn main() {
    let mut lexer = Lexer::new("50 + 60 - 99");
    loop {
        match lexer.next() {
            Ok(token) => {
                println!("{:?}", token);
                if token.kind == TokenKind::End {
                    break;
                }
            }
            Err(e) => {
                eprintln!("error: {}", e);
                break;
            }
        }
    }
}
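/// and a few sanity checks, as a sketch (the test names are mine; run them
/// with `cargo test`):
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexes_integers_and_operators() {
        let mut lexer = Lexer::new("12 + 3");
        let token = lexer.next().unwrap();
        assert_eq!(token.kind, TokenKind::Integer(12));
        assert_eq!(token.lexeme, "12");
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Plus);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Integer(3));
        assert_eq!(lexer.next().unwrap().kind, TokenKind::End);
    }

    #[test]
    fn peek_buffers_without_consuming() {
        let mut lexer = Lexer::new("7");
        // peeking twice sees the same buffered token...
        assert_eq!(lexer.peek().unwrap().kind, TokenKind::Integer(7));
        assert_eq!(lexer.peek().unwrap().kind, TokenKind::Integer(7));
        // ...and `next` then takes it out of the buffer.
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Integer(7));
        assert_eq!(lexer.next().unwrap().kind, TokenKind::End);
    }

    #[test]
    fn rejects_oversized_integers() {
        // u64::MAX is 18446744073709551615, so 20 nines overflows.
        let mut lexer = Lexer::new("99999999999999999999");
        assert!(lexer.next().is_err());
    }
}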