replace go/parser with go/scanner in printFile

printFile is one of the functions to blame for most of the CPU cost and allocations for garble itself, as reported by `perf record` for a clean build. One contributor is how we print each file and then parse it again, which we did for the sake of inserting line directives correctly.

With a bit of care, we can do this by tokenizing after printing, as opposed to parsing into a full go/ast again. This is moderately cheaper, but more than anything, allocates far less. That is to be expected given how go/ast is a tree of pointers, whereas go/scanner simply gives us a stream of tokens.

	name      old time/op         new time/op         delta
	Build-16          10.4s ± 2%          10.3s ± 1%    ~     (p=0.393 n=10+10)

	name      old bin-B           new bin-B           delta
	Build-16          5.51M ± 0%          5.51M ± 0%    ~     (all equal)

	name      old cached-time/op  new cached-time/op  delta
	Build-16          398ms ±12%          391ms ±10%    ~     (p=0.529 n=10+10)

	name      old mallocs/op      new mallocs/op      delta
	Build-16          34.4M ± 0%          31.8M ± 0%  -7.65%  (p=0.000 n=10+10)

	name      old sys-time/op     new sys-time/op     delta
	Build-16          5.80s ± 6%          5.86s ± 4%    ~     (p=0.218 n=10+10)

The new code is shorter, but perhaps a bit trickier, so I also added more comments to explain what's going on.

Note how the time/op change is practically noise, but mallocs/op goes down significantly, which is always a good sign.
3 years ago · d2622e8223
parent 21bd89ff73
commit d2622e8223
1 changed files with 71 additions and 85 deletions
--- a/position.go
+++ b/position.go
@@ -7,12 +7,11 @@ import (
 	"bytes"
 	"fmt"
 	"go/ast"
-	"go/parser"
 	"go/printer"
+	"go/scanner"
+	"go/token"
 	"path/filepath"
 	"strings"
-
-	"golang.org/x/exp/slices"
 )

 var printBuf1, printBuf2 bytes.Buffer
@@ -20,11 +19,11 @@ var printBuf1, printBuf2 bytes.Buffer
 // printFile prints a Go file to a buffer, while also removing non-directive
 // comments and adding extra compiler directives to obfuscate position
 // information.
-func printFile(file1 *ast.File) ([]byte, error) {
+func printFile(file *ast.File) ([]byte, error) {
 	printConfig := printer.Config{Mode: printer.RawFormat}

 	printBuf1.Reset()
-	if err := printConfig.Fprint(&printBuf1, fset, file1); err != nil {
+	if err := printConfig.Fprint(&printBuf1, fset, file); err != nil {
 		return nil, err
 	}
 	src := printBuf1.Bytes()
@@ -36,7 +35,8 @@ func printFile(file1 *ast.File) ([]byte, error) {
 		return src, nil
 	}

-	filename := filepath.Base(fset.Position(file1.Pos()).Filename)
+	fsetFile := fset.File(file.Pos())
+	filename := filepath.Base(fsetFile.Name())
 	if strings.HasPrefix(filename, "_cgo_") {
 		// cgo-generated files don't need changed line numbers.
 		// Plus, the compiler can complain rather easily.
@@ -47,78 +47,25 @@ func printFile(file1 *ast.File) ([]byte, error) {
 	// Unfortunately, comments are free-floating in File.Comments,
 	// and those are the only source of truth that go/printer uses.
 	// So the positions of the comments in the given file are wrong.
-	// The only way we can get the final ones is to parse again.
-	//
-	// We use an empty filename here.
-	// Syntax errors should be rare, and when they do happen,
-	// we don't want to point to the original source file on disk.
-	// That would be confusing, as we've changed the source in memory.
-	file2, err := parser.ParseFile(fset, "", src, parser.SkipObjectResolution|parser.ParseComments)
-	if err != nil {
-		return nil, fmt.Errorf("re-parse error: %w", err)
-	}
-
-	// Remove any comments by making them whitespace.
-	// Keep directives, as they affect the build.
-	// This is superior to removing the comments before printing,
-	// as otherwise 'garble reverse' would show different line numbers.
-	for _, group := range file2.Comments {
-		for _, comment := range group.List {
-			if strings.HasPrefix(comment.Text, "//go:") {
-				continue
-			}
-			start := fset.Position(comment.Pos()).Offset
-			end := fset.Position(comment.End()).Offset
-			for i := start; i < end; i++ {
-				src[i] = ' '
-			}
-		}
-	}
+	// The only way we can get the final ones is to tokenize again.
+	// Using go/scanner is slightly awkward, but cheaper than parsing again.

 	// We want to use the original positions for the hashed positions.
-	var origCallExprs []*ast.CallExpr
-	ast.Inspect(file1, func(node ast.Node) bool {
-		if node, ok := node.(*ast.CallExpr); ok {
-			origCallExprs = append(origCallExprs, node)
+	// Since later we'll iterate on tokens rather than walking an AST,
+	// we use a list of offsets indexed by identifiers in source order.
+	var origCallOffsets []int
+	nextOffset := -1
+	ast.Inspect(file, func(node ast.Node) bool {
+		switch node := node.(type) {
+		case *ast.CallExpr:
+			nextOffset = fsetFile.Position(node.Pos()).Offset
+		case *ast.Ident:
+			origCallOffsets = append(origCallOffsets, nextOffset)
+			nextOffset = -1
 		}
 		return true
 	})

-	type commentToAdd struct {
-		offset int
-		text   string
-	}
-	var toAdd []commentToAdd
-	i := 0
-	ast.Inspect(file2, func(node ast.Node) bool {
-		node, ok := node.(*ast.CallExpr)
-		if !ok {
-			return true
-		}
-		origNode := origCallExprs[i]
-		i++
-		newName := ""
-		if !flagTiny {
-			origPos := fmt.Sprintf("%s:%d", filename, fset.Position(origNode.Pos()).Offset)
-			newName = hashWithPackage(curPkg, origPos) + ".go"
-			// log.Printf("%q hashed with %x to %q", origPos, curPkg.GarbleActionID, newName)
-		}
-		pos := fset.Position(node.Pos())
-
-		// We use the "/*text*/" form, since we can use multiple of them
-		// on a single line, and they don't require extra newlines.
-		toAdd = append(toAdd, commentToAdd{
-			offset: pos.Offset,
-			text:   fmt.Sprintf("/*line %s:1*/", newName),
-		})
-		return true
-	})
-
-	// We add comments in order.
-	slices.SortFunc(toAdd, func(a, b commentToAdd) bool {
-		return a.offset < b.offset
-	})
-
 	copied := 0
 	printBuf2.Reset()

@@ -128,19 +75,58 @@ func printFile(file1 *ast.File) ([]byte, error) {
 	// toAdd is for /*-style comments, so add it to printBuf2 directly.
 	printBuf2.WriteString("//line :1\n")

-	for _, comment := range toAdd {
-		printBuf2.Write(src[copied:comment.offset])
-		copied = comment.offset
+	// We use an empty filename when tokenizing below.
+	// We use a nil go/scanner.ErrorHandler because src comes from go/printer.
+	// Syntax errors should be rare, and when they do happen,
+	// we don't want to point to the original source file on disk.
+	// That would be confusing, as we've changed the source in memory.
+	var s scanner.Scanner
+	fsetFile = fset.AddFile("", fset.Base(), len(src))
+	s.Init(fsetFile, src, nil, scanner.ScanComments)
+
+	identIndex := 0
+	for {
+		pos, tok, lit := s.Scan()
+		switch tok {
+		case token.EOF:
+			// Copy the rest and return.
+			printBuf2.Write(src[copied:])
+			return printBuf2.Bytes(), nil
+		case token.COMMENT:
+			// Omit comments from the final Go code.
+			// Keep directives, as they affect the build.
+			// This is superior to removing the comments before printing,
+			// because then the final source would have different line numbers.
+			if strings.HasPrefix(lit, "//go:") {
+				continue // directives are kept
+			}
+			offset := fsetFile.Position(pos).Offset
+			printBuf2.Write(src[copied:offset])
+			copied = offset + len(lit)
+		case token.IDENT:
+			origOffset := origCallOffsets[identIndex]
+			identIndex++
+			if origOffset == -1 {
+				continue // identifiers which don't start func calls are left untouched
+			}
+			newName := ""
+			if !flagTiny {
+				origPos := fmt.Sprintf("%s:%d", filename, origOffset)
+				newName = hashWithPackage(curPkg, origPos) + ".go"
+				// log.Printf("%q hashed with %x to %q", origPos, curPkg.GarbleActionID, newName)
+			}
+
+			offset := fsetFile.Position(pos).Offset
+			printBuf2.Write(src[copied:offset])
+			copied = offset

-		// We assume that all comments are of the form "/*text*/".
+			// We use the "/*text*/" form, since we can use multiple of them
+			// on a single line, and they don't require extra newlines.
 			// Make sure there is whitespace at either side of a comment.
 			// Otherwise, we could change the syntax of the program.
 			// Inserting "/*text*/" in "a/b" // must be "a/ /*text*/ b",
 			// as "a//*text*/b" is tokenized as a "//" comment.
-		printBuf2.WriteByte(' ')
-		printBuf2.WriteString(comment.text)
-		printBuf2.WriteByte(' ')
+			fmt.Fprintf(&printBuf2, " /*line %s:1*/ ", newName)
+		}
 	}
-	printBuf2.Write(src[copied:])
-	return printBuf2.Bytes(), nil
 }