https://github.com/xiazemin/the-super-tiny-compiler-1/blob/master/compiler.go
https://github.com/xiazemin/the-super-tiny-compiler
https://adampresley.github.io/2015/06/01/writing-a-lexer-and-parser-in-go-part-3.html
https://adampresley.github.io/2015/06/01/writing-a-lexer-and-parser-in-go-part-2.html
https://adampresley.github.io/2015/06/01/writing-a-lexer-and-parser-in-go-part-1.html
/**
/**
add
and subtract
they would be written like this:/**
/**
*
/**
*
*
*
/**
*
/**
/**
/**
import (
“fmt”
“log”
“strings”
)
type token struct {
kind string
value string
}
// We start by accepting an input string of code, and we’re gonna set up two
// things…
func tokenizer(input string) []token {
// A new line is appended to the program
input += “\n”
// A `current` variable for tracking our position in the code like a cursor.
current := 0
// And a slice of our `token` type for appending tokens to.
tokens := []token{}
// We start by creating a `for` loop where we are setting up our `current`
// variable to be incremented as much as we want `inside` the loop.
//
// We do this because we may want to increment `current` many times within a
// single loop because our tokens can be any length.
for current < len([]rune(input)) {
// We're also going to store the `current` character in the `input`.
char := string([]rune(input)[current])
// The first thing we want to check for is an open parenthesis. This will
// later be used for `CallExpressions` but for now we only care about the
// character.
//
// We check to see if we have an open parenthesis:
if char == "(" {
// If we do, we append a new token to our slice with the kind `paren`
// and set the value to an open parenthesis.
tokens = append(tokens, token{
kind: "paren",
value: "(",
})
// Then we increment `current`
current++
// And we `continue` onto the next cycle of the loop.
continue
}
// Next we're going to check for a closing parenthesis. We do the same exact
// thing as before: Check for a closing parenthesis, add a new token,
// increment `current`, and `continue`.
if char == ")" {
tokens = append(tokens, token{
kind: "paren",
value: ")",
})
current++
continue
}
// Moving on, we're now going to check for whitespace. This is interesting
// because we care that whitespace exists to separate characters, but it
// isn't actually important for us to store as a token. We would only throw
// it out later.
//
// So here we're just going to test for existence and if it does exist we're
// going to just `continue` on.
if char == " " {
current++
continue
}
// The next type of token is a number. This is different than what we have
// seen before because a number could be any number of characters and we
// want to capture the entire sequence of characters as one token.
//
// (add 123 456)
// ^^^ ^^^
// Only two separate tokens
//
// So we start this off when we encounter the first number in a sequence.
if isNumber(char) {
// We're going to create a `value` string that we are going to append
// characters to.
value := ""
// Then we're going to loop through each character in the sequence until
// we encounter a character that is not a number, pushing each character
// that is a number to our `value` and incrementing `current` as we go.
for isNumber(char) {
value += char
current++
char = string([]rune(input)[current])
}
// After that we append our `number` token to the `tokens` slice.
tokens = append(tokens, token{
kind: "number",
value: value,
})
// And we continue on.
continue
}
// The last type of token will be a `name` token. This is a sequence of
// letters instead of numbers, that are the names of functions in our lisp
// syntax.
//
// (add 2 4)
// ^^^
// Name token
//
if isLetter(char) {
value := ""
// Again we're just going to loop through all the letters pushing them to
// a value.
for isLetter(char) {
value += char
current++
char = string([]rune(input)[current])
}
// And appending that value as a token with the type `name` and continuing.
tokens = append(tokens, token{
kind: "name",
value: value,
})
continue
}
break
}
// Then at the end of our `tokenizer` we simply return the tokens array.
return tokens }
// isNumber accepts a string and will check to see whether or not what has been
// passed through is between the range of 0 - 9.
func isNumber(char string) bool {
if char == “” {
return false
}
n := []rune(char)[0]
if n >= ‘0’ && n <= ‘9’ {
return true
}
return false
}
// isLetter works in a similar way to isNumber, but checks the range for a
// letter in the range of a - z.
func isLetter(char string) bool {
if char == “” {
return false
}
n := []rune(char)[0]
if n >= ‘a’ && n <= ‘z’ {
return true
}
return false
}
/**
/**
// We will define our type, node
here. Within node are pointers types to what
// would otherwise be recursive types in Go. e.g.
//
// callee node
//
// Would cause the Go compiler to complain about a recursive type. When we want
// to use one of these types to pass through to a function, for example, we’d
// use &
as it’d be a reference. But we’ll come to that a bit later on.
type node struct {
kind string
value string
name string
callee *node
expression *node
body []node
params []node
arguments *[]node
context *[]node
}
// Type ast
is just another alias type. I find this makes part of the code
// more readable, as you’ll come to see that there are a ton of references to
// node
.
type ast node
// This is the counter variable that we’ll use for parsing.
var pc int
// This variable will store our slice of token
s inside of it.
var pt []token
// Okay, so we define a parser
function that accepts our slice of tokens
.
func parser(tokens []token) ast {
// Here, we initially give both the parser counter and the parser tokens a
// value.
pc = 0
pt = tokens
// Now, we're going to create our AST which will have a root which is a
// `Program` node.
ast := ast{
kind: "Program",
body: []node{},
}
// And we're going to kickstart our `walk` function, which you can find just
// below this, we'll be pushing nodes to our `ast.body` slice.
//
// The reason we are doing this inside a loop is because our program can have
// `CallExpressions` after one another instead of being nested.
//
// (add 2 2)
// (subtract 4 2)
//
for pc < len(pt) {
ast.body = append(ast.body, walk())
}
// At the end of our parser we'll return the AST.
return ast }
// But this time we’re going to use recursion instead of a while
loop. So we
// define a walk
function.
func walk() node {
// Inside the walk function we start by grabbing the `current` token.
token := pt[pc]
// We're going to split each type of token off into a different code path,
// starting off with `number` tokens.
//
// We test to see if we have a `number` token.
if token.kind == "number" {
// If we have one, we'll increment `current`.
pc++
// And we'll return a new AST node called `NumberLiteral` and setting its
// value to the value of our token.
return node{
kind: "NumberLiteral",
value: token.value,
}
}
// Next we're going to look for CallExpressions. We start this off when we
// encounter an open parenthesis.
if token.kind == "paren" && token.value == "(" {
// We'll increment `current` to skip the parenthesis since we don't care
// about it in our AST.
pc++
token = pt[pc]
// We create a base node with the type `CallExpression`, and we're going
// to set the name as the current token's value since the next token after
// the open parenthesis is the name of the function.
n := node{
kind: "CallExpression",
name: token.value,
params: []node{},
}
// We increment `current` *again* to skip the name token.
pc++
token = pt[pc]
// And now we want to loop through each token that will be the `params` of
// our `CallExpression` until we encounter a closing parenthesis.
//
// Now this is where recursion comes in. Instead of trying to parse a
// potentially infinitely nested set of nodes we're going to rely on
// recursion to resolve things.
//
// To explain this, let's take our Lisp code. You can see that the
// parameters of the `add` are a number and a nested `CallExpression` that
// includes its own numbers.
//
// (add 2 (subtract 4 2))
//
// You'll also notice that in our tokens array we have multiple closing
// parenthesis.
//
// [
// { type: 'paren', value: '(' },
// { type: 'name', value: 'add' },
// { type: 'number', value: '2' },
// { type: 'paren', value: '(' },
// { type: 'name', value: 'subtract' },
// { type: 'number', value: '4' },
// { type: 'number', value: '2' },
// { type: 'paren', value: ')' }, <<< Closing parenthesis
// { type: 'paren', value: ')' } <<< Closing parenthesis
// ]
//
// We're going to rely on the nested `walk` function to increment our
// `current` variable past any nested `CallExpressions`.
// So we create a `for` loop that will continue until it encounters a
// token with a `type` of `'paren'` and a `value` of a closing
// parenthesis.
for token.kind != "paren" || (token.kind == "paren" && token.value != ")") {
// we'll call the `walk` function which will return a `node` and we'll
// push it into our `node.params`.
n.params = append(n.params, walk())
token = pt[pc]
}
// Finally we will increment `current` one last time to skip the closing
// parenthesis.
pc++
// And return the node.
return n
}
// Again, if we haven't recognized the token type by now we're going to
// throw an error.
log.Fatal(token.kind)
return node{} }
/**
/**
// We will define our visitor
type here in such a way that strings can
// be tested against easily and then represent the function associoated with it.
//
// e.g.
// “NumberLiteral” : func(n *node, p node) {
// // do something
// }
//
type visitor map[string]func(n *node, p node)
// So we define a traverser function which accepts an AST and a
// visitor. Inside we’re going to define two functions…
func traverser(a ast, v visitor) {
// We kickstart the traverser by calling `traverseNode` with our ast
// with no `parent` because the top level of the AST doesn't have a parent.
traverseNode(node(a), node{}, v) }
// A traverseArray
function that will allow us to iterate over a slice and
// call the next function that we will define: traverseNode
.
func traverseArray(a []node, p node, v visitor) {
for _, child := range a {
traverseNode(child, p, v)
}
}
// traverseNode
will accept a node
and its parent
node. So that it can
// pass both to our visitor methods.
func traverseNode(n, p node, v visitor) {
// We start by testing for the existence of a method on the visitor with a
// matching `type` and then calling it with the `node` and its `parent`.
for k, va := range v {
if k == n.kind {
va(&n, p)
}
}
// Next we are going to split things up by the current node type.
switch n.kind {
// We'll start with our top level `Program`. Since Program nodes have a
// property named body that has an array of nodes, we will call
// `traverseArray` to traverse down into them.
//
// (Remember that `traverseArray` will in turn call `traverseNode` so we
// are causing the tree to be traversed recursively)
case "Program":
traverseArray(n.body, n, v)
break
// Next we do the same with `CallExpressions` and traverse their `params`.
case "CallExpression":
traverseArray(n.params, n, v)
break
// In the case of `NumberLiterals` we don't have any child nodes to visit,
// so we'll just break.
case "NumberLiteral":
break
// And again, if we haven't recognized the node type then we'll throw an
// error.
default:
log.Fatal(n.kind)
} }
/**
/**
Original AST | Transformed AST |
{ | { |
type: ‘Program’, | type: ‘Program’, |
body: [{ | body: [{ |
type: ‘CallExpression’, | type: ‘ExpressionStatement’, |
name: ‘add’, | expression: { |
params: [{ | type: ‘CallExpression’, |
type: ‘NumberLiteral’, | callee: { |
value: ‘2’ | type: ‘Identifier’, |
}, { | name: ‘add’ |
type: ‘CallExpression’, | }, |
name: ‘subtract’, | arguments: [{ |
params: [{ | type: ‘NumberLiteral’, |
type: ‘NumberLiteral’, | value: ‘2’ |
value: ‘4’ | }, { |
}, { | type: ‘CallExpression’, |
type: ‘NumberLiteral’, | callee: { |
value: ‘2’ | type: ‘Identifier’, |
}] | name: ‘subtract’ |
}] | }, |
}] | arguments: [{ |
} | type: ‘NumberLiteral’, |
value: ‘4’ |
———————————- | }, { |
type: ‘NumberLiteral’, |
value: ‘2’ |
}] |
(sorry the other one is longer.) | }] |
} |
}] |
} |
*/
// So we have our transformer function which will accept the lisp ast.
func transformer(a ast) ast {
// We'll create a new ast which like our previous AST will have a program
// node.
nast := ast{
kind: "Program",
body: []node{},
}
// Next I'm going to cheat a little and create a bit of a hack. We're going to
// use a property named `context` on our parent nodes that we're going to push
// nodes to their parent's `context`. Normally you would have a better
// abstraction than this, but for our purposes this keeps things simple.
//
// Just take note that the context is a reference *from* the old ast *to* the
// new ast.
a.context = &nast.body
// We'll start by calling the traverser function with our ast and a visitor.
traverser(a, map[string]func(n *node, p node){
// The first visitor method accepts `NumberLiterals`
"NumberLiteral": func(n *node, p node) {
// We'll create a new node also named `NumberLiteral` that we will push to
// the parent context.
*p.context = append(*p.context, node{
kind: "NumberLiteral",
value: n.value,
})
},
// Next up, `CallExpressions`.
"CallExpression": func(n *node, p node) {
// We start creating a new node `CallExpression` with a nested
// `Identifier`.
e := node{
kind: "CallExpression",
callee: &node{
kind: "Identifier",
name: n.name,
},
arguments: new([]node),
}
// Next we're going to define a new context on the original
// `CallExpression` node that will reference the `expression`'s arguments
// so that we can push arguments.
n.context = e.arguments
// Then we're going to check if the parent node is a `CallExpression`.
// If it is not...
if p.kind != "CallExpression" {
// We're going to wrap our `CallExpression` node with an
// `ExpressionStatement`.
es := node{
kind: "ExpressionStatement",
expression: &e,
}
// Last, we push our (possibly wrapped) `CallExpression` to the `parent`'s
// `context`.
*p.context = append(*p.context, es)
} else {
*p.context = append(*p.context, e)
}
},
})
// At the end of our transformer function we'll return the new ast that we
// just created.
return nast }
/**
/**
func codeGenerator(n node) string {
// We'll break things down by the `type` of the `node`.
switch n.kind {
// If we have a `Program` node. We will map through each node in the `body`
// and run them through the code generator and join them with a newline.
case "Program":
var r []string
for _, no := range n.body {
r = append(r, codeGenerator(no))
}
return strings.Join(r, "\n")
// For `ExpressionStatements` we'll call the code generator on the nested
// expression and we'll add a semicolon...
case "ExpressionStatement":
return codeGenerator(*n.expression) + ";"
// For `CallExpressions` we will print the `callee`, add an open
// parenthesis, we'll map through each node in the `arguments` array and run
// them through the code generator, joining them with a comma, and then
// we'll add a closing parenthesis.
case "CallExpression":
var ra []string
c := codeGenerator(*n.callee)
for _, no := range *n.arguments {
ra = append(ra, codeGenerator(no))
}
r := strings.Join(ra, ", ")
return c + "(" + r + ")"
// For `Identifiers` we'll just return the `node`'s name.
case "Identifier":
return n.name
// For `NumberLiterals` we'll just return the `node`'s value.
case "NumberLiteral":
return n.value
// And if we haven't recognized the node, we'll throw an error.
default:
log.Fatal("err")
return ""
} }
/**
/**
compiler
function. Here we will link togetherfunc compiler(input string) string {
tokens := tokenizer(input)
ast := parser(tokens)
nast := transformer(ast)
out := codeGenerator(node(nast))
// and simply return the output!
return out }
func main() {
program := “(add 10 (subtract 10 6))”
out := compiler(program)
fmt.Println(out)
}