@unleashy
Created November 4, 2021 17:53
Literate code for a simple lexer in Rust
#![allow(unused_doc_comments)]
#![allow(dead_code)]
/// let's create a lexer to tokenise a simplistic arithmetic language,
/// with integers, plusses and minuses!
///
/// playground: https://play.rust-lang.org/?version=stable&mode=debug&edition=2021&gist=7e0716a957a90c486328c8e775ca268f
///
/// to store the state of our lexer we'll just create a lexer struct:
#[derive(Clone, Debug)]
struct Lexer {
    source: String,
    current: usize,
    start: usize,
    peeked: Option<Result<Token, String>>
}
/// `source` is just the source string, the code, whatever;
/// `current` is the byte index of the character in the source that we're
/// looking at, which, importantly, always points to the *next* character to
/// consume: if we just consumed "a" in the string 'abc', then `current` will
/// equal 1, not 0 or anything.
/// `start` points to the start of the current token. this lets us track the
/// text of a token.
/// `peeked` buffers a peeked token so that we can take it out when we `next`
/// later.
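///
/// to make that concrete, here's a sketch (hypothetical values, just to show
/// how the fields relate):
///
///     // lexing "12 + 1", right after the token "12" has been scanned:
///     // start == 0, current == 2
///     // &source[start..current] == "12"   <- the token's text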
///
/// note that there are other formulations for a lexer, but this one is simple
/// enough to work fine.
///
/// now we also need a struct for a token:
#[derive(Clone, Debug, Eq, PartialEq)]
enum TokenKind {
    End,
    Integer(u64),
    Plus,
    Minus
}
#[derive(Clone, Debug)]
struct Token {
    kind: TokenKind,
    lexeme: String
}
/// the `kind` just tells us the kind of this token: it can be an integer, a
/// plus '+', or a minus '-'. the `lexeme` holds the token's text, so if you
/// parse the integer "1234", `lexeme` should == "1234". the kind can also be
/// `End`, which is just a marker token for the end of the source. i could get
/// rid of that and instead use `Option<Token>`, but that is a lot less
/// convenient, so whatever.
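///
/// so, as a sketch, the token for "1234" would look like:
///
///     Token { kind: TokenKind::Integer(1234), lexeme: "1234".into() }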
///
/// with that in place we can start implementing the lexer:
impl Lexer {
    /// a simple constructor, which just clones its given source string.
    pub fn new(source: &str) -> Self {
        Self {
            source: source.into(),
            current: 0,
            start: 0,
            peeked: None
        }
    }
    /// (you can opt to not clone and store a reference in your Lexer instead,
    /// but then you need generic lifetimes there, and in the Token too.)
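    ///
    /// as a sketch, that borrowing alternative would look something like this
    /// (hypothetical, not what we use here):
    ///
    ///     struct Lexer<'src> {
    ///         source: &'src str,
    ///         // ... rest as before, with Token holding a `&'src str` lexeme
    ///     }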
    ///
    /// next up, some helper functions for dealing with iterating over
    /// `source`:
    ///
    /// this one peeks at the current character, without advancing. note that
    /// we can't index directly into a string in rust, so we slice it at
    /// `current` and then use `.chars()` to get the first char of the slice.
    fn current(&self) -> Option<char> {
        self.source[self.current..]
            .chars()
            .next()
    }
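    /// (as a quick sketch of why the slicing dance: `s[1]` simply doesn't
    /// compile for a String, but slicing plus `.chars()` works:
    ///
    ///     let s = String::from("héllo");
    ///     assert_eq!(s[1..].chars().next(), Some('é'));
    ///
    /// the index is a *byte* offset, which matters in a moment.)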
    /// this one returns the current character and advances `current` by its
    /// length. note the unicode handling: advancing by 1 instead of the
    /// character's encoded length could make us panic later by slicing into
    /// the middle of a UTF-8 character, so we use the proper UTF-8 length.
    fn advance(&mut self) -> Option<char> {
        self.current()
            .map(|c| {
                self.current += c.len_utf8();
                c
            })
    }
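    /// as a sketch of what `len_utf8` gives us:
    ///
    ///     assert_eq!('+'.len_utf8(), 1);
    ///     assert_eq!('é'.len_utf8(), 2); // advancing by only 1 here would
    ///                                    // leave `current` mid-character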
    /// gets the current lexeme by using `start` and `current`:
    fn lexeme(&self) -> &str {
        &self.source[self.start..self.current]
    }
    /// before diving into the proper scanning method, here's a method that
    /// simply skips over all the whitespace it finds and nothing else. we'll
    /// use this later to do, well, exactly that.
    fn skip_whitespace(&mut self) {
        // look at each character until the string ends; current() itself
        // doesn't advance.
        while let Some(c) = self.current() {
            // if this character is whitespace, advance and continue.
            // otherwise, stop, as we don't want to consume any
            // non-whitespace characters.
            if c.is_ascii_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
    }
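    /// as a quick sketch of its behaviour:
    ///
    ///     let mut lx = Lexer::new("   42");
    ///     lx.skip_whitespace();
    ///     assert_eq!(lx.current, 3); // now pointing at '4'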
    /// hopefully that was understandable, now we're off to scan integers:
    fn scan_integer(&mut self) -> Result<Token, String> {
        // if this method is called, we *guarantee* that `start` points to an
        // ascii digit; you can see that later on. so, let's just scan as many
        // digits as we can:
        while let Some(c) = self.current() {
            if c.is_ascii_digit() {
                self.advance();
            } else {
                break;
            }
        }
        // as you can see this is nearly the same code as for whitespace. this
        // is kinda the General Scanning Procedure:
        //
        // for each character:
        //     advance if it is what you want
        //     stop if not
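        //
        // (you could even factor that into a hypothetical helper, sketch:
        //
        //     fn scan_while(&mut self, pred: impl Fn(char) -> bool) {
        //         while let Some(c) = self.current() {
        //             if pred(c) { self.advance(); } else { break; }
        //         }
        //     }
        //
        // and call `self.scan_while(|c| c.is_ascii_digit())`, but we'll keep
        // it inline here.)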
        //
        // since we've scanned all the digits we could, let's get the text and
        // parse it into a u64:
        let value = self.lexeme().parse::<u64>();
        // oh fuck, wait. parsing can fail, as our integer may be larger than
        // u64::MAX! let's handle it, by mapping parse's error into our own
        // and re-throwing it with `?`:
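        // (for instance, "99999999999999999999".parse::<u64>() fails with
        // PosOverflow, since u64::MAX is 18446744073709551615.)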
        let value = value.map_err(|e| {
            // the only error that can happen here is PosOverflow, since the
            // lexeme is guaranteed to be one or more ascii digits; but let's
            // make sure with an assert.
            assert_eq!(e.kind(), &std::num::IntErrorKind::PosOverflow);
            // yay for good error messages!
            format!("integer literal '{}' is too large", self.lexeme())
        })?;
        // now we can emit a token:
        Ok(Token {
            kind: TokenKind::Integer(value),
            lexeme: self.lexeme().into()
        })
    }
    /// now that we're done with our helpers, let's implement `next_impl`:
    /// this scans the source and returns a token. given a Lexer with source:
    /// "12 + 1", current: 0, and start: 0, it will scan "12" and be left at
    /// source: "12 + 1", current: 2, and start: 0. this is just like an
    /// iterator over a "virtual" token stream: each call picks up where the
    /// last one left off, and so on until the end.
    ///
    /// also note the possibility of errors; i'm using String as an error type
    /// for simplicity, but you can make a rich Error struct or whatever.
    ///
    /// and also, this isn't the real method we'll use in parsing; it's just
    /// an internal method that does the hard work, and we handle peeking etc.
    /// on top of it later.
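    ///
    /// so, as a sketch, repeatedly calling this on "12 + 1" yields roughly:
    ///
    ///     Integer(12) "12", then Plus "+", then Integer(1) "1", then End ""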
    fn next_impl(&mut self) -> Result<Token, String> {
        // before 'next'ing we have to get rid of any "ignorable" characters,
        // such as whitespace:
        self.skip_whitespace();
        // set start to current to "sync" the lexeme tracking:
        self.start = self.current;
        // unconditionally get & advance the current character:
        let c = self.advance();
        // the Big Match looks at that character and emits the appropriate
        // token for it:
        match c {
            // if None, we reached the end
            None =>
                Ok(Token { kind: TokenKind::End, lexeme: self.lexeme().into() }),
            // if 0..=9 this is an integer! start scanning that
            Some('0'..='9') =>
                self.scan_integer(),
            // + is Plus, - is Minus
            Some('+') =>
                Ok(Token { kind: TokenKind::Plus, lexeme: self.lexeme().into() }),
            Some('-') =>
                Ok(Token { kind: TokenKind::Minus, lexeme: self.lexeme().into() }),
            // everything else is an error!
            Some(wtf) => Err(format!("unrecognised character {:?}", wtf))
        }
    }
    /// we're literally done with our lexer proper. now just handling peeking
    /// and nexting!
    pub fn peek(&mut self) -> Result<&Token, &String> {
        // if there is something already peeked, get it, otherwise call
        // next_impl to do the hard work.
        if self.peeked.is_none() {
            self.peeked = Some(self.next_impl());
        }
        // yes this is ugly, but rust kinda sucks here
        self.peeked.as_ref().unwrap().as_ref()
    }
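    /// `next` is the consuming half: it takes the peeked token out if there
    /// is one. a typical interaction looks like this (sketch):
    ///
    ///     let mut lx = Lexer::new("1 + 2");
    ///     let peeked = lx.peek().unwrap().kind.clone(); // Integer(1), not consumed
    ///     let next = lx.next().unwrap();                // same token, now consumed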
    pub fn next(&mut self) -> Result<Token, String> {
        // if there is something already peeked, take it (leaving None in its
        // place), otherwise call next_impl to do the hard work.
        self.peeked
            .take()
            .unwrap_or_else(|| self.next_impl())
    }
}
/// testing the bitch out
fn main() {
    let mut lexer = Lexer::new("50 + 60 - 99");
    loop {
        match lexer.next() {
            Ok(token) => {
                println!("{:?}", token);
                if token.kind == TokenKind::End {
                    break;
                }
            }
            Err(e) => {
                eprintln!("error: {}", e);
                break;
            }
        }
    }
}
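// which should print something like:
//
//     Token { kind: Integer(50), lexeme: "50" }
//     Token { kind: Plus, lexeme: "+" }
//     Token { kind: Integer(60), lexeme: "60" }
//     Token { kind: Minus, lexeme: "-" }
//     Token { kind: Integer(99), lexeme: "99" }
//     Token { kind: End, lexeme: "" }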