module natcomp where

open import Data.Nat using (ℕ; _+_)
open import Data.List using (List; []; _∷_; _++_)
open import Data.List.Properties using (++-assoc)
open import Data.Maybe using (Maybe; just; nothing; _>>=_)

open import Relation.Binary.PropositionalEquality using (_≡_; refl; cong)
open Relation.Binary.PropositionalEquality.≡-Reasoning

Natural Number Compiler

My goal in this project is to learn about compiler correctness. I will write a small compiler from a minimalistic natural-numbers language down to a stack-based machine language, then prove forward simulation.

Source Language

Begin by defining the source language, where terms consist of nat literals and addition

data Tm : Set where
  lit : ℕ → Tm
  add : Tm → Tm → Tm

Since all terms in this language are closed, we can interpret them directly to numbers

eval : Tm → ℕ
eval (lit n) = n
eval (add tm₁ tm₂) = eval tm₁ + eval tm₂

But to make things more interesting let's give small-step semantics

data _↦_ : Tm → Tm → Set where
  redₗ : ∀{tm₁ tm₁′ tm₂ : Tm}
    → tm₁ ↦ tm₁′
    → add tm₁ tm₂ ↦ add tm₁′ tm₂
  redᵣ : ∀{tm₁ tm₂ tm₂′ : Tm}
    → tm₂ ↦ tm₂′
    → add tm₁ tm₂ ↦ add tm₁ tm₂′
  red₊ : ∀{n₁ n₂ : ℕ}
    → add (lit n₁) (lit n₂) ↦ lit (n₁ + n₂)

And the symmetric-transitive closure...

data _↦*_ : Tm → Tm → Set where
  refl* : ∀{tm : Tm}
    → tm ↦* tm
  trans* : ∀{tm₁ tm₂ tm₃ : Tm}
    → tm₁ ↦  tm₂
    → tm₂ ↦* tm₃
    → tm₁ ↦* tm₃

single* : ∀{tm₁ tm₂ : Tm} → tm₁ ↦ tm₂ → tm₁ ↦* tm₂
single* s = trans* s refl*

Target Language

Next we define what our language will compile to. This is a simple stack language

data Instr : Set where
  push : ℕ → Instr
  add : Instr
Prog = List Instr

We can execute those as RPN programs. Their execution is defined in terms of a relation on stacks of numbers

Stack = List ℕ

data _∣_↦_ : Instr → Stack → Stack → Set where
  red-push : ∀{Σ : Stack} {n : ℕ}
    → (push n) ∣ Σ ↦ (n ∷ Σ)
  red-add  : ∀{Σ : Stack} {n₁ n₂ : ℕ}
    → add ∣ (n₁ ∷ n₂ ∷ Σ) ↦ ((n₁ + n₂) ∷ Σ)

Execution of entire programs (starting from the "head" instruction) follows from that

data _∥_↦*_ : Prog → Stack → Stack → Set where
  done* : ∀{Σ : Stack}
    → [] ∥ Σ ↦* Σ
  step* : ∀{i : Instr} {p : Prog} {Σ Σ′ Σ″ : Stack}
    → i       ∣ Σ ↦ Σ′
    → p       ∥ Σ′ ↦* Σ″
    → (i ∷ p) ∥ Σ  ↦* Σ″

As an aside, we can prove that the result of evaluating a stack program directly, if doesn't fail, will argee with the reduction relation

run : Stack → Prog → Maybe Stack
run stk [] = just stk
run stk (push n ∷ prog) = run (n ∷ stk) prog
run stk (add ∷ prog) with stk
... | n₁ ∷ n₂ ∷ stk = run ((n₁ + n₂) ∷ stk) prog
... | _ = nothing

run-soundness : ∀(prog : Prog) (stk stk′ : Stack)
  → run stk prog ≡ just stk′
  → prog ∥ stk ↦* stk′
run-soundness [] stk .stk refl = done*
run-soundness (add ∷ prog) (n₁ ∷ n₂ ∷ stk) stk′ success₊ =
  step* red-add (run-soundness prog ((n₁ + n₂) ∷ stk) stk′ success₊)
run-soundness (push x ∷ prog) stk stk′ success =
  step* red-push (run-soundness prog (x ∷ stk) stk′ success)

Another crucial property of program execution is the following characterization of concatenated programs: executing a concatenated program is the same as executing both sub-programs sequentially, assuming the first one doesn't fail.

run-++ : ∀(prog₁ prog₂ : Prog) (stk : Stack)
  → run stk (prog₁ ++ prog₂) ≡ (run stk prog₁ >>= (λ stk′ → run stk′ prog₂))
run-++ [] prog₂ stk = refl
run-++ (push n ∷ prog₁) prog₂ stk = run-++ prog₁ prog₂ (n ∷ stk)
run-++ (add ∷ prog₁) prog₂ (n₁ ∷ n₂ ∷ stk) = run-++ prog₁ prog₂ ((n₁ + n₂) ∷ stk)

run-++ (add ∷ prog₁) prog₂ [] = refl
run-++ (add ∷ prog₁) prog₂ (_ ∷ []) = refl

Compliation

Compilation is a behavior-preserving translation from the source language to the target language

compile : Tm → Prog
compile (lit x) = (push x) ∷ []
compile (add tm₁ tm₂) = compile tm₂ ++ (compile tm₁ ++ (add ∷ []))

The simplest safety guarantee we'll prove for compilation is that it never "goes wrong". By that I mean, running a compiled stack program will always return a just stk. In fact, we can prove something even stronger: a compiled program will always add just a single new element onto the stack, and that single element is precisely the result of evaluating the term we started from.

compiler-correctness : ∀(tm : Tm) (stk : Stack)
  → run stk (compile tm) ≡ just (eval tm ∷ stk)
compiler-correctness (lit n) stk = refl
compiler-correctness (add tm₁ tm₂) stk = begin
  run stk (compile tm₂ ++ compile tm₁ ++ (add ∷ []))
    ≡⟨ run-++ (compile tm₂) (compile tm₁ ++ (add ∷ [])) stk ⟩
  ((run stk (compile tm₂)) >>= (λ stk′ → run stk′ (compile tm₁ ++ (add ∷ []))))
    ≡⟨ cong (_>>= _) (compiler-correctness tm₂ stk) ⟩
  run (eval tm₂ ∷ stk) (compile tm₁ ++ (add ∷ []))
    ≡⟨ run-++ (compile tm₁) (add ∷ []) (eval tm₂ ∷ stk) ⟩
  (run (eval tm₂ ∷ stk) (compile tm₁) >>= (λ stk′ → run stk′ (add ∷ [])))
    ≡⟨ cong (_>>= _) (compiler-correctness tm₁ (eval tm₂ ∷ stk)) ⟩
  just ((eval tm₁ + eval tm₂) ∷ stk)
    ∎

As a corollary, running any compiled term on an empty stack returns a singleton stack containing the value of that term

main-result : ∀(tm : Tm)
  → run [] (compile tm) ≡ just (eval tm ∷ [])
main-result tm = compiler-correctness tm []

aradarbel10/natcomp.lagda.md

Natural Number Compiler

Source Language

Target Language

Compliation