Last active
March 6, 2024 14:02
-
-
Save brokenpylons/71fdd4f07aed8e3f212250946a00d3bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.InputStream | |
import java.io.OutputStream | |
const val ERROR_STATE = 0 | |
enum class Symbol { | |
EOF, | |
SKIP, | |
FOR, | |
FOREACH, | |
FFF | |
} | |
const val EOF = -1 | |
const val NEWLINE = '\n'.code | |
interface DFA { | |
val states: Set<Int> | |
val alphabet: IntRange | |
fun next(state: Int, code: Int): Int | |
fun symbol(state: Int): Symbol | |
val startState: Int | |
val finalStates: Set<Int> | |
} | |
object ForForeachFFFAutomaton: DFA { | |
override val states = (1 .. 11).toSet() | |
override val alphabet = 0 .. 255 | |
override val startState = 1 | |
override val finalStates = setOf(2, 3, 5, 9, 10, 11) | |
private val numberOfStates = states.max() + 1 // plus the ERROR_STATE | |
private val numberOfCodes = alphabet.max() + 1 // plus the EOF | |
private val transitions = Array(numberOfStates) {IntArray(numberOfCodes)} | |
private val values = Array(numberOfStates) {Symbol.SKIP} | |
private fun setTransition(from: Int, chr: Char, to: Int) { | |
transitions[from][chr.code + 1] = to // + 1 because EOF is -1 and the array starts at 0 | |
} | |
private fun setTransition(from: Int, code: Int, to: Int) { | |
transitions[from][code + 1] = to | |
} | |
private fun setSymbol(state: Int, symbol: Symbol) { | |
values[state] = symbol | |
} | |
override fun next(state: Int, code: Int): Int { | |
assert(states.contains(state)) | |
assert(alphabet.contains(code)) | |
return transitions[state][code + 1] | |
} | |
override fun symbol(state: Int): Symbol { | |
assert(states.contains(state)) | |
return values[state] | |
} | |
init { | |
setTransition(1, 'f', 2) | |
setTransition(2, 'f', 3) | |
setTransition(3, 'f', 3) | |
setTransition(2, 'o', 4) | |
setTransition(4, 'r', 5) | |
setTransition(5, 'e', 6) | |
setTransition(6, 'a', 7) | |
setTransition(7, 'c', 8) | |
setTransition(8, 'h', 9) | |
setTransition(1, ' ', 10) | |
setTransition(1, EOF, 11) | |
setSymbol(2, Symbol.FFF) | |
setSymbol(3, Symbol.FFF) | |
setSymbol(5, Symbol.FOR) | |
setSymbol(9, Symbol.FOREACH) | |
setSymbol(11, Symbol.EOF) | |
} | |
} | |
data class Token(val symbol: Symbol, val lexeme: String, val startRow: Int, val startColumn: Int) | |
class Scanner(private val automaton: DFA, private val stream: InputStream) { | |
private var last: Int? = null | |
private var row = 1 | |
private var column = 1 | |
private fun updatePosition(code: Int) { | |
if (code == NEWLINE) { | |
row += 1 | |
column = 1 | |
} else { | |
column += 1 | |
} | |
} | |
fun getToken(): Token { | |
val startRow = row | |
val startColumn = column | |
val buffer = mutableListOf<Char>() | |
var code = last ?: stream.read() | |
var state = automaton.startState | |
while (true) { | |
val nextState = automaton.next(state, code) | |
if (nextState == ERROR_STATE) break // Longest match | |
state = nextState | |
updatePosition(code) | |
buffer.add(code.toChar()) | |
code = stream.read() | |
} | |
last = code // The code following the current lexeme is the first code of the next lexeme | |
if (automaton.finalStates.contains(state)) { | |
val symbol = automaton.symbol(state) | |
return if (symbol == Symbol.SKIP) { | |
getToken() | |
} else { | |
val lexeme = String(buffer.toCharArray()) | |
Token(symbol, lexeme, startRow, startColumn) | |
} | |
} else { | |
throw Error("Invalid pattern at ${row}:${column}") | |
} | |
} | |
} | |
fun name(symbol: Symbol) = | |
when (symbol) { | |
Symbol.FOR -> "for" | |
Symbol.FOREACH -> "foreach" | |
Symbol.FFF -> "fff" | |
else -> throw Error("Invalid symbol") | |
} | |
fun printTokens(scanner: Scanner, output: OutputStream) { | |
val writer = output.writer(Charsets.UTF_8) | |
var token = scanner.getToken() | |
while (token.symbol != Symbol.EOF) { | |
writer.append("${name(token.symbol)}(\"${token.lexeme}\") ") // The output ends with a space! | |
token = scanner.getToken() | |
} | |
writer.appendLine() | |
writer.flush() | |
} | |
fun main(args: Array<String>) { | |
printTokens(Scanner(ForForeachFFFAutomaton, "fffffff foreach for f".byteInputStream()), System.out) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment