lang: Split lang package out into many subpackages

This is a giant refactor to split the giant lang package into many
subpackages. The most difficult piece was figuring out how to extract
the extra ast structs into their own package, because they needed to
call two functions which also needed to import the ast.

The solution was to separate out those functions into their own
packages, and to pass them into the ast at the root when they're needed,
and to let the relevant ast portions call a handle.

This isn't terribly ugly because we already had a giant data struct
woven through the ast.

The bad part is rebasing any WIP work on top of this.
This commit is contained in:
James Shubin
2021-10-21 03:35:31 -04:00
parent 8ae47bd490
commit 23b5a4729f
23 changed files with 1212 additions and 1129 deletions

3
lang/parser/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
lexer.nn.go
y.go
y.output

416
lang/parser/lexer.nex Normal file
View File

@@ -0,0 +1,416 @@
/[ \t\n]/ { /* skip over whitespace */ }
/{/ {
// Every plain token rule in this file has the same shape: hand lval to
// yylex.pos, copy the matched text into lval.str, and return the token
// constant to the parser.
// NOTE(review): yylex.pos is not defined in this file; presumably it
// records the position of the current token on lval. Confirm in the
// parser source.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return OPEN_CURLY
}
/}/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return CLOSE_CURLY
}
/\(/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return OPEN_PAREN
}
/\)/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return CLOSE_PAREN
}
/\[/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return OPEN_BRACK
}
/\]/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return CLOSE_BRACK
}
/if/ {
// The text "if" is also matched at the same length by the IDENTIFIER
// pattern further down. Per the NOTE at the end of this file, a tie
// goes to the first pattern, so this keyword rule wins. The same
// reasoning applies to every keyword rule in this file.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return IF
}
/else/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return ELSE
}
/\?:/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return ELVIS
}
/=>/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return ROCKET
}
/,/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return COMMA
}
/:/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return COLON
}
/;/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return SEMICOLON
}
/=/ {
// A lone equals sign. When the input continues as "==" or "=>" the
// longer pattern takes precedence over this one.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return EQUALS
}
/\+/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return PLUS
}
/\-/ {
// A lone minus sign. The longer "->" pattern and the numeric literal
// patterns (which accept a leading minus) take precedence when they
// match more text.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return MINUS
}
/\*/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return MULTIPLY
}
/\// {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return DIVIDE
}
/==/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return EQ
}
/!=/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return NEQ
}
/</ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return LT
}
/>/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return GT
}
/<=/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return LTE
}
/>=/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return GTE
}
/&&/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return AND
}
/\|\|/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return OR
}
/!/ {
// A lone exclamation mark. The longer "!=" pattern takes precedence
// when it matches.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return NOT
}
/in/ {
// Only the exact text "in" lands here. Longer words that merely start
// with these letters are claimed by a longer pattern instead.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return IN
}
/\->/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return ARROW
}
/\./ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
// sanity check... these should be the same!
// interfaces.ModuleSep comes from the imported interfaces package; the
// panic fires if that constant ever stops being a single dot.
if x, y := lval.str, interfaces.ModuleSep; x != y {
panic(fmt.Sprintf("DOT does not match ModuleSep (%s != %s)", x, y))
}
return DOT
}
/\$/ {
// A lone dollar sign. When a variable name follows, the longer
// VAR_IDENTIFIER patterns further down take precedence.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return DOLLAR
}
/bool/ {
// The rules from here through VARIANT_IDENTIFIER recognise type names
// and statement keywords. Each of these words is also matched at the
// same length by the IDENTIFIER pattern further down; per the NOTE at
// the end of this file a tie goes to the first pattern, so these rules
// win because they are listed earlier.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return BOOL_IDENTIFIER
}
/str/ {
// Only the exact text "str" lands here; "struct" is longer and is
// claimed by its own rule below.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return STR_IDENTIFIER
}
/int/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return INT_IDENTIFIER
}
/float/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return FLOAT_IDENTIFIER
}
/map/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return MAP_IDENTIFIER
}
/struct/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return STRUCT_IDENTIFIER
}
/func/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return FUNC_IDENTIFIER
}
/class/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return CLASS_IDENTIFIER
}
/include/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return INCLUDE_IDENTIFIER
}
/import/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return IMPORT_IDENTIFIER
}
/as/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return AS_IDENTIFIER
}
/variant/ {
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return VARIANT_IDENTIFIER
}
/true|false/ {
// Boolean literal: the decoded value is stored in lval.bool, and
// lval.str is not set by this rule.
yylex.pos(lval) // our pos
s := yylex.Text()
if s == "true" {
lval.bool = true
} else if s == "false" {
lval.bool = false
} else {
// the lexer was wrong
// The pattern only admits the two words above, so reaching this branch
// means the generated lexer misbehaved.
panic(fmt.Sprintf("error lexing BOOL, got: %s", s))
}
return BOOL
}
/"(\\.|[^"])*"/
{ // This matches any number of the bracketed patterns
// that are surrounded by the two quotes on each side.
// The bracket pattern is any escaped char or something
// that is not a double quote char. See this reference:
// https://www.lysator.liu.se/c/ANSI-C-grammar-l.html#STRING-LITERAL
// old: /"[\a\b\t\n\v\f\r !#$%&'()*+,-.\/0-9:;<=>?@A-Z\[\\\]^_a-z{|}~]*"/
yylex.pos(lval) // our pos
s := yylex.Text()
// Defensive check that the match really starts and ends with a double
// quote before the quotes are sliced off.
if s[0:1] != "\"" || s[len(s)-1:] != "\"" {
// unhandled error
panic(fmt.Sprintf("error lexing STRING, got: %s", s))
//return ERROR // unreachable
}
// Only the surrounding quotes are stripped here; escape sequences
// inside the literal are left exactly as written in the input.
lval.str = s[1:len(s)-1] // remove the two quotes
return STRING
}
/\-?[0-9]+/
{
// Integer literal, with an optional leading minus sign that is part of
// the token. The parsed value is stored in lval.int.
yylex.pos(lval) // our pos
s := yylex.Text()
var err error
lval.int, err = strconv.ParseInt(s, 10, 64) // int64
if err == nil {
return INTEGER
// The type assertion below is unchecked; it relies on strconv
// documenting that ParseInt errors have concrete type *NumError.
} else if e := err.(*strconv.NumError); e.Err == strconv.ErrRange {
// this catches range errors for very large ints
// NOTE(review): yylex.cast is defined outside this file; presumably it
// returns the lexParseAST that LexParse attached to the lexer, since
// the lexerErr field is set on it. Confirm in the parser source.
lp := yylex.cast()
lp.lexerErr = &LexParseErr{
Err: ErrLexerIntegerOverflow,
Str: s,
Row: yylex.Line(),
Col: yylex.Column(),
}
return ERROR
} else {
// The pattern only admits digits, so any failure other than a range
// error is treated as a programming error.
panic(fmt.Sprintf("error lexing INTEGER, got: %v", err))
}
}
/\-?[0-9]+\.[0-9]+/
{
// Float literal: digits are required on both sides of the dot, and the
// optional leading minus sign is part of the token. When a dot and
// more digits follow an integer, this longer pattern takes precedence
// over the INTEGER rule. The parsed value is stored in lval.float.
yylex.pos(lval) // our pos
s := yylex.Text()
var err error
lval.float, err = strconv.ParseFloat(s, 64) // float64
if err == nil {
return FLOAT
// Unchecked assertion, relying on strconv documenting *NumError as the
// concrete error type of ParseFloat.
} else if e := err.(*strconv.NumError); e.Err == strconv.ErrRange {
// this catches range errors for very large floats
lp := yylex.cast()
lp.lexerErr = &LexParseErr{
Err: ErrLexerFloatOverflow,
Str: s,
Row: yylex.Line(),
Col: yylex.Column(),
}
return ERROR
} else {
panic(fmt.Sprintf("error lexing FLOAT, got: %v", err))
}
}
/\$[a-z]+([a-z0-9_]*[a-z0-9]+)?{[0-9]+}/
{
// we have this as a single token, because otherwise the
// parser can get confused by the curly brackets :/
// The token is a dollar sign, a variable name, and a run of digits
// wrapped in curly brackets. The name ends up in lval.str and the
// number in lval.int.
yylex.pos(lval) // our pos
s := yylex.Text()
s = s[1:len(s)] // remove the leading $
s = s[0:len(s)-1] // remove the trailing close curly
// XXX: nex has a bug that it gets confused by the
// following single curly brace. Please see:
// https://github.com/blynn/nex/issues/48
a := strings.Split(s, "{") // XXX: close match here: }
// The pattern allows exactly one open curly bracket, so the split must
// yield the name and the digits and nothing else.
if len(a) != 2 {
panic(fmt.Sprintf("error lexing VAR_IDENTIFIER_HX: %v", a))
}
lval.str = a[0]
var err error
lval.int, err = strconv.ParseInt(a[1], 10, 64) // int64
if err == nil {
return VAR_IDENTIFIER_HX
// Unchecked assertion, relying on strconv documenting *NumError as the
// concrete error type of ParseInt.
} else if e := err.(*strconv.NumError); e.Err == strconv.ErrRange {
// this catches range errors for very large ints
// Only the digits are reported in Str, not the whole token.
lp := yylex.cast()
lp.lexerErr = &LexParseErr{
Err: ErrLexerIntegerOverflow,
Str: a[1],
Row: yylex.Line(),
Col: yylex.Column(),
}
return ERROR
} else {
panic(fmt.Sprintf("error lexing VAR_IDENTIFIER_HX, got: %v", err))
}
}
/\$[a-z]([a-z0-9_]*[a-z0-9]+)?/
{
// an alternate pattern: /\$[a-z](|[a-z0-9_]*[a-z0-9])/
// Variable reference: a dollar sign followed by a name that starts
// with a lowercase letter and, if longer than one character, ends with
// a lowercase letter or digit (so it can not end in an underscore).
// lval.str receives the name without the dollar sign.
yylex.pos(lval) // our pos
s := yylex.Text()
lval.str = s[1:len(s)] // remove the leading $
return VAR_IDENTIFIER
}
/[a-z]([a-z0-9_]*[a-z0-9]+)?/
{
// Plain identifier with the same naming shape as above. Keyword rules
// listed earlier in the file win any equal-length tie with this rule.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return IDENTIFIER
}
/[A-Z]([a-z0-9_]*[a-z0-9]+)?/
{
// Identifier whose first letter is uppercase; the pattern admits an
// uppercase letter in the first position only. lval.str receives the
// fully lowercased text.
yylex.pos(lval) // our pos
s := yylex.Text()
lval.str = strings.ToLower(s) // uncapitalize it
return CAPITALIZED_IDENTIFIER
}
/[a-z]([a-z0-9:]*[a-z0-9]+)?/
{
// Like IDENTIFIER, but the middle of the name allows colons instead of
// underscores. Text without a colon matches IDENTIFIER at the same
// length, and that rule is listed first, so this rule is only reached
// when the name contains at least one colon.
yylex.pos(lval) // our pos
lval.str = yylex.Text()
return RES_IDENTIFIER
}
/#[^\n]*/
{ // this matches a (#) pound char followed by any
// number of chars that aren't the (\n) newline!
// The comment text is stored in lval.str, but the action has no return
// statement, so no token is produced for the parser here.
yylex.pos(lval) // our pos
s := yylex.Text()
lval.str = s[1:len(s)] // remove the leading #
//log.Printf("lang: lexer: comment: `%s`", lval.str)
//return COMMENT // skip return to avoid parsing
}
/./ {
// Catch-all for a single character. Every other rule is listed earlier
// and so wins a one-character tie, which leaves this rule with the
// characters that nothing else recognises. The failure is recorded as
// a LexParseErr and the ERROR token is returned.
yylex.pos(lval) // our pos
s := yylex.Text()
lp := yylex.cast()
e := ErrLexerUnrecognized
// A carriage return gets its own error value, distinct from the
// generic unrecognized one.
if s == "\r" { // windows!
e = ErrLexerUnrecognizedCR
}
lp.lexerErr = &LexParseErr{
Err: e,
Str: s,
Row: yylex.Line(),
Col: yylex.Column(),
}
return ERROR
}
//
// Mgmt
// Copyright (C) 2013-2021+ James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package parser
import (
"fmt"
"strconv"
"github.com/purpleidea/mgmt/lang/interfaces"
)
// NOTE:
// Among rules in the same scope, the longest matching pattern takes precedence.
// In event of a tie, the first pattern wins.

226
lang/parser/lexparse.go Normal file
View File

@@ -0,0 +1,226 @@
// Mgmt
// Copyright (C) 2013-2021+ James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package parser
import (
"bufio"
"fmt"
"io"
"path"
"sort"
"strings"
"github.com/purpleidea/mgmt/engine"
"github.com/purpleidea/mgmt/lang/interfaces"
"github.com/purpleidea/mgmt/util"
"github.com/purpleidea/mgmt/util/errwrap"
)
// These constants represent the different possible lexer/parser errors. The
// lexer rules store the ErrLexer* values in the Err field of a LexParseErr.
const (
// ErrLexerUnrecognized is set by the lexer's catch-all rule for a
// character that no other rule matched.
ErrLexerUnrecognized = interfaces.Error("unrecognized")
// ErrLexerUnrecognizedCR is the catch-all variant used when the
// unmatched character is a carriage return.
ErrLexerUnrecognizedCR = interfaces.Error("unrecognized carriage return")
// ErrLexerStringBadEscaping is not referenced by the lexer rules visible
// alongside this file.
ErrLexerStringBadEscaping = interfaces.Error("string: bad escaping")
// ErrLexerIntegerOverflow is set when an integer literal is out of range
// for a 64-bit signed integer.
ErrLexerIntegerOverflow = interfaces.Error("integer: overflow")
// ErrLexerFloatOverflow is set when a float literal is out of range for a
// 64-bit float.
ErrLexerFloatOverflow = interfaces.Error("float: overflow")
// The ErrParse* values below are not used in this file.
// NOTE(review): presumably they are produced by the grammar in parser.y;
// confirm there.
ErrParseError = interfaces.Error("parser")
ErrParseSetType = interfaces.Error("can't set return type in parser")
ErrParseResFieldInvalid = interfaces.Error("can't use unknown resource field")
// The errstr* message strings are declared outside this file.
ErrParseAdditionalEquals = interfaces.Error(errstrParseAdditionalEquals)
ErrParseExpectingComma = interfaces.Error(errstrParseExpectingComma)
)
// LexParseErr is a permanent failure error to notify about borkage. It carries
// the kind of failure together with the offending text and where it was found.
type LexParseErr struct {
// Err is the kind of failure, normally one of the Err* constants declared
// in this package.
Err interfaces.Error
// Str is the piece of input associated with the failure. The lexer rules
// set it to the text of the token that could not be handled.
Str string
Row int // this is zero-indexed (the first line is 0)
Col int // this is zero-indexed (the first char is 0)
// Filename is the file that this error occurred in. If this is unknown,
// then it will be empty. This is not set when run by the basic LexParse
// function.
Filename string
}
// Error implements the error interface. The message contains the error kind,
// the offending text in backticks, and the position as row:column. Positions
// are stored zero-indexed and printed one-indexed. The Filename field is not
// part of the message.
func (e *LexParseErr) Error() string {
	row, col := e.Row+1, e.Col+1 // convert to human friendly one-indexed values
	return fmt.Sprintf("%s: `%s` @%d:%d", e.Err, e.Str, row, col)
}
// lexParseAST is a struct which we pass into the lexer/parser so that we have a
// location to store the AST to avoid having to use a global variable. LexParse
// creates one per run and reads the result and any errors back out of it.
type lexParseAST struct {
// ast is the parsed result, written during yyParse.
ast interfaces.Stmt
// row and col are not touched in this file.
// NOTE(review): presumably position bookkeeping maintained by the lexer
// helpers; confirm in the parser source.
row int
col int
// lexerErr takes priority over parseErr when LexParse reports a failure.
lexerErr error // from lexer
parseErr error // from Error(e string)
}
// LexParse feeds the input through the generated lexer and parser and returns
// the resulting AST. If the run recorded a lexer error, that error is returned;
// otherwise any recorded parser error is returned. On error the returned
// statement is nil.
func LexParse(input io.Reader) (interfaces.Stmt, error) {
	result := &lexParseAST{}

	// The parseResult field of the generated Lexer is otherwise unused by
	// us, so it carries our result holder through the lexer and parser.
	attach := func(l *Lexer) { l.parseResult = result }
	yyParse(NewLexerWithInit(input, attach)) // fills in result.ast

	// A lexer error wins over a parser error when both were recorded.
	if result.lexerErr != nil {
		return nil, result.lexerErr
	}
	if result.parseErr != nil {
		return nil, result.parseErr
	}
	return result.ast, nil
}
// LexParseWithOffsets takes an io.Reader input and a map of offsets and runs
// LexParse on the input. The input to this function is most commonly the output
// from DirectoryReader, which returns a single combined io.Reader and the
// offsets map. Each key of the map is the zero-indexed row at which a file
// starts inside the combined input, and the value is the name of that file. An
// entry for row zero is required. If the offsets map is nil or empty, then this
// simply redirects to LexParse.
//
// This differs from LexParse because when the failure is a *LexParseErr, the
// returned error is rebuilt so that Filename names the file that contains the
// failing row, and Row is relative to the start of that file. Any other kind
// of error is returned unchanged.
//
// The offsets are meant to be in units of file size (bytes) and not length
// (lines).
// TODO: Due to an implementation difficulty, offsets are currently in length!
// NOTE: This was used for an older deprecated form of lex/parse file combining.
func LexParseWithOffsets(input io.Reader, offsets map[uint64]string) (interfaces.Stmt, error) {
	if len(offsets) == 0 { // also true for a nil map
		return LexParse(input) // special case, no named offsets...
	}

	stmt, err := LexParse(input)
	if err == nil { // handle the success case first because it ends faster
		return stmt, nil
	}
	e, ok := err.(*LexParseErr)
	if !ok {
		return nil, err // unexpected error format
	}

	// rebuild the error so it contains the right filename index, etc...
	starts := make([]uint64, 0, len(offsets))
	for start := range offsets {
		starts = append(starts, start)
	}
	sort.Sort(util.UInt64Slice(starts))
	if first := starts[0]; first != 0 { // first offset is supposed to be zero
		return nil, fmt.Errorf("unexpected first offset of %d", first)
	}

	// TODO: switch this to an offset in bytes instead of lines
	// TODO: we'll also need a way to convert that into the new row number!
	// Pick the last file whose starting row is not past the failing row. A
	// row equal to a starting row is the first row of that file, so it must
	// not be attributed to the previous one.
	row := uint64(e.Row)
	var start uint64 // the first file starts at zero (checked above)
	for _, next := range starts {
		if row < next {
			break
		}
		// if we fall off the end of the loop, the last file is correct
		start = next
	}

	return nil, &LexParseErr{
		Err:      e.Err,            // same
		Str:      e.Str,            // same
		Row:      int(row - start), // row within the file (never underflows)
		Col:      e.Col,            // same
		Filename: offsets[start],   // actual filename
	}
}
// DirectoryReader takes a filesystem and an absolute directory path, and it
// returns a combined reader into that directory, and an offset map of the file
// contents. This is used to build a reader from a directory containing language
// source files, and as a result, this will skip over sub directories and over
// files that don't have the correct extension. Files are combined in the order
// that the filesystem's ReadDir returns them. An error is returned if the
// directory can't be listed, if a file can't be opened, read or scanned, or if
// no matching file was found.
//
// Each file is read fully into memory and closed before this returns, so the
// returned reader stays usable after the call and no file handles are kept.
//
// The offsets are meant to be in units of file size (bytes) and not length
// (lines).
// TODO: Due to an implementation difficulty, offsets are currently in length!
// NOTE: This was used for an older deprecated form of lex/parse file combining.
// NOTE(review): a file whose last line has no trailing newline is joined with
// the first line of the next file in the combined stream, while the line
// count still includes that last line, so later offsets may be off by one.
func DirectoryReader(fs engine.Fs, dir string) (io.Reader, map[uint64]string, error) {
	fis, err := fs.ReadDir(dir) // ([]os.FileInfo, error)
	if err != nil {
		return nil, nil, errwrap.Wrapf(err, "can't stat directory contents `%s`", dir)
	}

	var offset uint64
	offsets := make(map[uint64]string) // cumulative offset to abs. filename
	readers := make([]io.Reader, 0, len(fis))

	for _, fi := range fis {
		if fi.IsDir() {
			continue // skip directories
		}
		name := path.Join(dir, fi.Name()) // relative path made absolute
		if !strings.HasSuffix(name, interfaces.DotFileNameExtension) {
			continue
		}

		f, err := fs.Open(name) // opens read-only
		if err != nil {
			return nil, nil, errwrap.Wrapf(err, "can't open file `%s`", name)
		}
		// Pull the whole file into memory and close it right away. A
		// deferred close would run when this function returns, which
		// would hand the caller readers over already closed files.
		var sb strings.Builder
		_, err = io.Copy(&sb, f)
		f.Close() // read-only handle: a close error has nothing to report
		if err != nil {
			return nil, nil, errwrap.Wrapf(err, "can't read file `%s`", name)
		}
		content := sb.String()

		offsets[offset] = name // save cumulative offset (starts at 0)

		// TODO: store the offset in size instead of length! we're using
		// length at the moment since it is not clear how easy it is for
		// the lexer/parser to return the byte offset as well as line no
		// NOTE: in addition, this scanning is not the fastest for perf!
		scanner := bufio.NewScanner(strings.NewReader(content))
		lines := 0
		for scanner.Scan() { // each line
			lines++
		}
		if err := scanner.Err(); err != nil {
			return nil, nil, errwrap.Wrapf(err, "can't scan file `%s`", name)
		}
		offset += uint64(lines)

		readers = append(readers, strings.NewReader(content))
	}

	// The checks below count readers rather than offsets: a file with zero
	// lines shares its offset key with the file that follows it, so the map
	// can hold fewer entries than there are files.
	if len(readers) == 0 {
		// TODO: this condition should be validated during the deploy...
		return nil, nil, fmt.Errorf("no files in main directory")
	}
	if len(readers) == 1 { // no need for a multi reader
		return readers[0], offsets, nil
	}

	return io.MultiReader(readers...), offsets, nil
}

2561
lang/parser/lexparse_test.go Normal file

File diff suppressed because it is too large Load Diff

1415
lang/parser/parser.y Normal file

File diff suppressed because it is too large Load Diff