From 57ce3fa587897d74634c1216af67dd42252c64e5 Mon Sep 17 00:00:00 2001
From: James Shubin <james@shubin.ca>
Date: Sat, 22 Sep 2018 10:37:29 -0400
Subject: [PATCH] lang: Allow matching underscores in some of the identifiers

This allows matching underscores in some of the identifiers, but not
when the underscore is the last character.

This caused me to suffer a bit of pain tracking down a bug which turned
out to be in nex, the lexer generator. It started with a failing test that I wrote in:

https://github.com/blynn/nex/commit/974c2498c46eaf4fb3963cfc059bcefd66ea48a8

and which followed with a fix in:

https://github.com/blynn/nex/commit/52682f463a45fdc37630e650d82f126a32d810a3

Glad that's fixed!
---
 lang/lexer.nex        | 36 +++++++++++----------
 lang/lexparse_test.go | 74 +++++++++++++++++++++++++++++++++++++++++++
 lang/parser.y         | 43 +++++++++++++++++++------
 lang/structs.go       |  8 +++++
 4 files changed, 135 insertions(+), 26 deletions(-)

diff --git a/lang/lexer.nex b/lang/lexer.nex
index ba26e671..3b6e9753 100644
--- a/lang/lexer.nex
+++ b/lang/lexer.nex
@@ -289,7 +289,7 @@
 				panic(fmt.Sprintf("error lexing FLOAT, got: %v", err))
 			}
 		}
-/\$[a-z][a-z0-9]*{[0-9]+}/
+/\$[a-z]+([a-z0-9_]*[a-z0-9]+)?{[0-9]+}/
 		{
 			// we have this as a single token, because otherwise the
 			// parser can get confused by the curly brackets :/
@@ -323,39 +323,39 @@
 				panic(fmt.Sprintf("error lexing VAR_IDENTIFIER_HX, got: %v", err))
 			}
 		}
-/\$[a-z][a-z0-9]*/
+/\$[a-z]([a-z0-9_]*[a-z0-9]+)?/
 		{
+			// an alternate pattern: /\$[a-z](|[a-z0-9_]*[a-z0-9])/
 			yylex.pos(lval) // our pos
 			s := yylex.Text()
 			lval.str = s[1:len(s)] // remove the leading $
 			return VAR_IDENTIFIER
 		}
-/[A-Z][a-z0-9:]*[a-z0-9]*/
+/[a-z]([a-z0-9_]*[a-z0-9]+)?/
 		{
 			yylex.pos(lval) // our pos
-			s := yylex.Text()
-			lval.str = strings.ToLower(s) // uncapitalize it
-			return CAPITALIZED_RES_IDENTIFIER
+			lval.str = yylex.Text()
+			return IDENTIFIER
 		}
-/[a-z][a-z0-9:]*[a-z0-9]*/
-		{
-			yylex.pos(lval) // our pos
-			s := yylex.Text()
-			lval.str = strings.ToLower(s) // uncapitalize it
-			return RES_IDENTIFIER
-		}
-/[A-Z][a-z0-9]*/
+/[A-Z]([a-z0-9_]*[a-z0-9]+)?/
 		{
 			yylex.pos(lval) // our pos
 			s := yylex.Text()
 			lval.str = strings.ToLower(s) // uncapitalize it
 			return CAPITALIZED_IDENTIFIER
 		}
-/[a-z][a-z0-9]*/
+/[a-z]([a-z0-9:]*[a-z0-9]+)?/
 		{
 			yylex.pos(lval) // our pos
 			lval.str = yylex.Text()
-			return IDENTIFIER
+			return RES_IDENTIFIER
+		}
+/[A-Z]([a-z0-9:]*[a-z0-9]+)?/
+		{
+			yylex.pos(lval) // our pos
+			s := yylex.Text()
+			lval.str = strings.ToLower(s) // uncapitalize it
+			return CAPITALIZED_RES_IDENTIFIER
 		}
 /#[^\n]*/
 		{	// this matches a (#) pound char followed by any
@@ -405,3 +405,7 @@ import (
 	"fmt"
 	"strconv"
 )
+
+// NOTE:
+// Among rules in the same scope, the longest matching pattern takes precedence.
+// In event of a tie, the first pattern wins.
diff --git a/lang/lexparse_test.go b/lang/lexparse_test.go
index 27474a0e..5aecfd63 100644
--- a/lang/lexparse_test.go
+++ b/lang/lexparse_test.go
@@ -241,6 +241,34 @@ func TestLexParse0(t *testing.T) {
 			//exp: ???, // FIXME: add the expected AST
 		})
 	}
+	{
+		values = append(values, test{
+			name: "maps 1",
+			code: `
+			# make sure the "str:" part doesn't match a single ident
+			$strmap map{str: int} = {
+				"key1" => 42,
+				"key2" => -13,
+			}
+			`,
+			fail: false,
+			//exp: ???, // FIXME: add the expected AST
+		})
+	}
+	{
+		values = append(values, test{
+			name: "maps 2",
+			code: `
+			$mapstrintlist map{str: []int} = {
+				"key1" => [42, 44,],
+				"key2" => [],
+				"key3" => [-13,],
+			}
+			`,
+			fail: false,
+			//exp: ???, // FIXME: add the expected AST
+		})
+	}
 	{
 		values = append(values, test{
 			name: "maps and lists",
@@ -1268,6 +1296,52 @@ func TestLexParse0(t *testing.T) {
 			fail: true,
 		})
 	}
+	{
+		exp := &StmtProg{
+			Prog: []interfaces.Stmt{
+				&StmtClass{
+					Name: "x",
+					Args: []*Arg{},
+					Body: &StmtProg{
+						Prog: []interfaces.Stmt{},
+					},
+				},
+				&StmtClass{
+					Name: "y1",
+					Args: []*Arg{},
+					Body: &StmtProg{
+						Prog: []interfaces.Stmt{},
+					},
+				},
+				&StmtInclude{
+					Name: "z",
+					Args: nil,
+				},
+			},
+		}
+		values = append(values, test{
+			name: "simple class with args 0",
+			code: `
+			class x() {
+			}
+			class y1() {
+			}
+			include z
+			`,
+			fail: false,
+			exp:  exp,
+		})
+	}
+	{
+		values = append(values, test{
+			name: "simple class underscore failure",
+			code: `
+			class x_() {
+			}
+			`,
+			fail: true,
+		})
+	}
 	{
 		exp := &StmtProg{
 			Prog: []interfaces.Stmt{
diff --git a/lang/parser.y b/lang/parser.y
index 6bb6237d..80c3efdb 100644
--- a/lang/parser.y
+++ b/lang/parser.y
@@ -754,16 +754,17 @@ resource:
 			Contents: $4.resContents,
 		}
 	}
-// TODO: do we need to include this simpler case as well?
-//|	IDENTIFIER expr OPEN_CURLY resource_body CLOSE_CURLY
-//	{
-//		posLast(yylex, yyDollar) // our pos
-//		$$.stmt = &StmtRes{
-//			Kind:     $1.str,
-//			Name:     $2.expr,
-//			Contents: $4.resContents,
-//		}
-//	}
+	// note: simplified version of the above, used when the lexer returns IDENTIFIER
+	// note: the kind must not include underscores; StmtRes.Init checks that after parsing
+|	IDENTIFIER expr OPEN_CURLY resource_body CLOSE_CURLY
+	{
+		posLast(yylex, yyDollar) // our pos
+		$$.stmt = &StmtRes{
+			Kind:     $1.str,
+			Name:     $2.expr,
+			Contents: $4.resContents,
+		}
+	}
 ;
 resource_body:
 	/* end of list */
@@ -885,6 +886,17 @@ edge_half:
 			//SendRecv: "", // unused
 		}
 	}
+	// note: simplified version of the above, used when the lexer returns CAPITALIZED_IDENTIFIER
+	// note: the kind must not include underscores; StmtEdgeHalf.Init checks that after parsing
+|	CAPITALIZED_IDENTIFIER OPEN_BRACK expr CLOSE_BRACK
+	{
+		posLast(yylex, yyDollar) // our pos
+		$$.edgeHalf = &StmtEdgeHalf{
+			Kind: $1.str,
+			Name: $3.expr,
+			//SendRecv: "", // unused
+		}
+	}
 ;
 edge_half_sendrecv:
 	// eg: Test["t1"].foo_send
@@ -897,6 +909,17 @@ edge_half_sendrecv:
 			SendRecv: $6.str,
 		}
 	}
+	// note: simplified version of the above, used when the lexer returns CAPITALIZED_IDENTIFIER
+	// note: the kind must not include underscores; StmtEdgeHalf.Init checks that after parsing
+|	CAPITALIZED_IDENTIFIER OPEN_BRACK expr CLOSE_BRACK DOT IDENTIFIER
+	{
+		posLast(yylex, yyDollar) // our pos
+		$$.edgeHalf = &StmtEdgeHalf{
+			Kind: $1.str,
+			Name: $3.expr,
+			SendRecv: $6.str,
+		}
+	}
 ;
 type:
 	BOOL_IDENTIFIER
diff --git a/lang/structs.go b/lang/structs.go
index 22fae189..c35790e3 100644
--- a/lang/structs.go
+++ b/lang/structs.go
@@ -164,6 +164,10 @@ func (obj *StmtRes) Apply(fn func(interfaces.Node) error) error {
 // Init initializes this branch of the AST, and returns an error if it fails to
 // validate.
 func (obj *StmtRes) Init(data *interfaces.Data) error {
+	if strings.Contains(obj.Kind, "_") {
+		return fmt.Errorf("kind must not contain underscores")
+	}
+
 	obj.data = data
 	if err := obj.Name.Init(data); err != nil {
 		return err
@@ -1058,6 +1062,10 @@ func (obj *StmtEdgeHalf) Apply(fn func(interfaces.Node) error) error {
 // Init initializes this branch of the AST, and returns an error if it fails to
 // validate.
 func (obj *StmtEdgeHalf) Init(data *interfaces.Data) error {
+	if strings.Contains(obj.Kind, "_") {
+		return fmt.Errorf("kind must not contain underscores")
+	}
+
 	return obj.Name.Init(data)
 }