cmd/digraph: support Go-style double-quotes in input data.

+ test. LGTM=sameer R=sameer CC=golang-codereviews, gri https://golang.org/cl/170090043
2014-11-12 13:37:06 -05:00 · 2014-11-12 13:37:06 -05:00 · c097262a24
parent fb44a24d4c
commit c097262a24
2 changed files with 192 additions and 4 deletions
--- a/cmd/digraph/digraph.go
+++ b/cmd/digraph/digraph.go
@ -16,19 +16,26 @@ package main
 import (
 	"bufio"
 	"bytes"
 	"errors"
 	"flag"
 	"fmt"
 	"io"
 	"os"
 	"sort"
-	"strings"
+	"strconv"
 	"unicode"
 	"unicode/utf8"
 )
 const Usage = `digraph: queries over directed graphs in text form.
 Graph format:
-  Each line contains zero or more whitespace-separated fields.
+  Each line contains zero or more words.  Words are separated by
  unquoted whitespace; words may contain Go-style double-quoted portions,
  allowing spaces and other characters to be expressed.
  Each field declares a node, and if there are more than one,
  an edge from the first to each subsequent one.
  The graph is provided on the standard input.
@ -38,7 +45,7 @@ Graph format:
 	% cat clothes.txt
 	socks shoes
-	shorts pants
+	"boxer shorts" pants
 	pants belt shoes
 	shirt tie sweater
 	sweater jacket
@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset {
 func parse(rd io.Reader) (graph, error) {
 	g := make(graph)
 	var linenum int
 	in := bufio.NewScanner(rd)
 	for in.Scan() {
-		words := strings.Fields(in.Text())
+		linenum++
 		// Split into words, honoring double-quotes per Go spec.
 		words, err := split(in.Text())
 		if err != nil {
 			return nil, fmt.Errorf("at line %d: %v", linenum, err)
 		}
 		if len(words) > 0 {
 			g.addEdges(words[0], words[1:]...)
 		}
@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error {
 	return nil
 }
 // -- Utilities --------------------------------------------------------
 // split breaks a line into words.  Words are separated by unquoted
 // whitespace; a word may contain any number of Go-style double-quoted
 // string literals, whose unquoted values are spliced into the word.
 // (This approximates the behaviour of the Bourne shell.)
 //
 //   `one "two three"` -> ["one" "two three"]
 //   `a"\n"b` -> ["a\nb"]
 //
 // Bytes outside quotations are copied to the result unchanged, so two
 // words that differ only in bytes that are not valid UTF-8 remain
 // distinct.
 //
 // It returns an error if a double-quote does not begin a well-formed
 // string literal; in that case the returned slice is nil.
 //
 func split(line string) ([]string, error) {
 	var (
 		words   []string
 		inWord  bool         // whether a word is in progress (it may be empty, e.g. `""`)
 		current bytes.Buffer // the word accumulated so far
 	)
 	for len(line) > 0 {
 		r, size := utf8.DecodeRuneInString(line)
 		switch {
 		case unicode.IsSpace(r):
 			// Unquoted whitespace terminates the current word, if any.
 			if inWord {
 				words = append(words, current.String())
 				current.Reset()
 				inWord = false
 			}
 		case r == '"':
 			// Consume the whole literal in one step and append its value.
 			var ok bool
 			size, ok = quotedLength(line)
 			if !ok {
 				return nil, errors.New("invalid quotation")
 			}
 			s, err := strconv.Unquote(line[:size])
 			if err != nil {
 				return nil, err
 			}
 			current.WriteString(s)
 			inWord = true
 		default:
 			// Copy the input bytes rather than re-encoding r: for a byte
 			// that is not valid UTF-8, r is utf8.RuneError, and writing
 			// that rune would replace the byte with U+FFFD and so merge
 			// distinct words.
 			current.WriteString(line[:size])
 			inWord = true
 		}
 		line = line[size:]
 	}
 	if inWord {
 		words = append(words, current.String())
 	}
 	return words, nil
 }
 // quotedLength reports how many bytes at the start of input make up a
 // possibly-valid double-quoted Go string literal.
 //
 // When ok is true, n is at least two (the empty literal ""),
 // input[:n] is suitable for strconv.Unquote, and input[n:] is whatever
 // follows the literal.
 //
 // When ok is false, n is zero.  Callers wanting a descriptive error
 // message can hand the entire input to strconv.Unquote.
 //
 // Escape sequences are only skipped over, not validated: malformed
 // hex or octal escapes, for example, are left for strconv.Unquote to
 // reject.  The sole guarantee is that if some prefix of input is a
 // valid string literal, the length of that prefix is returned.
 //
 // TODO(adonovan): move this into a strconv-like utility package.
 //
 func quotedLength(input string) (n int, ok bool) {
 	pos := 0
 	// read decodes the rune at pos and steps past it.
 	// At end of input it yields -1 and leaves pos alone.
 	read := func() rune {
 		if pos >= len(input) {
 			return -1
 		}
 		r, width := utf8.DecodeRuneInString(input[pos:])
 		pos += width
 		return r
 	}
 	if read() != '"' {
 		return 0, false // not a quotation
 	}
 	for {
 		switch r := read(); {
 		case r < 0, r == '\n':
 			return 0, false // literal is not terminated on this line
 		case r == '"':
 			return pos, true // closing quote found
 		case r == '\\':
 			// extra is the number of runes that follow the escape letter
 			// (or the first octal digit) and belong to the escape.
 			extra := 0
 			switch read() {
 			case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
 				// single-character escape: nothing more to skip
 			case '0', '1', '2', '3', '4', '5', '6', '7', 'x':
 				extra = 2
 			case 'u':
 				extra = 4
 			case 'U':
 				extra = 8
 			default:
 				return 0, false // unknown escape
 			}
 			for ; extra > 0; extra-- {
 				read()
 			}
 		}
 	}
 }
--- a/cmd/digraph/digraph_test.go
+++ b/cmd/digraph/digraph_test.go
@ -3,6 +3,7 @@ package main
 import (
 	"bytes"
 	"fmt"
 	"reflect"
 	"strings"
 	"testing"
 )
@ -60,3 +61,61 @@ d c
 	// - test somepath (it's nondeterministic).
 	// - test errors
 }
 // TestSplit checks that split separates words on unquoted whitespace
 // and splices the values of double-quoted literals into the
 // surrounding word.
 func TestSplit(t *testing.T) {
 	for _, test := range []struct {
 		line string
 		want []string
 	}{
 		{`one "2a 2b" three`, []string{"one", "2a 2b", "three"}},
 		{`one tw"\n\x0a\u000a\012"o three`, []string{"one", "tw\n\n\n\no", "three"}},
 	} {
 		got, err := split(test.line)
 		if err != nil {
 			t.Errorf("split(%s) failed: %v", test.line, err)
 			// The result is meaningless after an error; comparing it
 			// would only report the same case a second time.
 			continue
 		}
 		if !reflect.DeepEqual(got, test.want) {
 			t.Errorf("split(%s) = %v, want %v", test.line, got, test.want)
 		}
 	}
 }
 // TestQuotedLength exercises quotedLength on inputs that start with a
 // string literal (expecting the literal's length in bytes) and on
 // inputs that must be rejected.
 func TestQuotedLength(t *testing.T) {
 	// Inputs beginning with an acceptable literal, and its length.
 	accepted := []struct {
 		input string
 		want  int
 	}{
 		{`"abc"`, 5},
 		{`"abc"def`, 5},
 		{`"abc\"d"ef`, 8}, // "abc\"d" is consumed, ef is residue
 		{`"\012\n\x0a\u000a\U0000000a"`, 28},
 		{"\"\xff\"", 3}, // bad UTF-8 is ok
 		{`"\xff"`, 6},   // hex escape for bad UTF-8 is ok
 	}
 	for _, tc := range accepted {
 		n, ok := quotedLength(tc.input)
 		if !ok {
 			// Treat a rejection as length zero so it shows up as a mismatch.
 			n = 0
 		}
 		if n != tc.want {
 			t.Errorf("quotedLength(%s) = %d, want %d", tc.input, n, tc.want)
 		}
 	}
 	// Inputs for which quotedLength must report failure.
 	rejected := []string{
 		``,            // not a quotation
 		`a`,           // not a quotation
 		`'a'`,         // not a quotation
 		`"a`,          // not terminated
 		`"\0"`,        // short octal escape
 		`"\x1"`,       // short hex escape
 		`"\u000"`,     // short \u escape
 		`"\U0000000"`, // short \U escape
 		`"\k"`,        // invalid escape
 		"\"ab\nc\"",   // newline
 	}
 	for _, in := range rejected {
 		n, ok := quotedLength(in)
 		if ok {
 			t.Errorf("quotedLength(%s) = %d, want !ok", in, n)
 		}
 	}
 }