cmd/digraph: support Go-style double-quotes in input data.

+ test.

LGTM=sameer
R=sameer
CC=golang-codereviews, gri
https://golang.org/cl/170090043
This commit is contained in:
Alan Donovan 2014-11-12 13:37:06 -05:00
parent fb44a24d4c
commit c097262a24
2 changed files with 192 additions and 4 deletions

View File

@ -16,19 +16,26 @@ package main
import ( import (
"bufio" "bufio"
"bytes"
"errors"
"flag" "flag"
"fmt" "fmt"
"io" "io"
"os" "os"
"sort" "sort"
"strings" "strconv"
"unicode"
"unicode/utf8"
) )
const Usage = `digraph: queries over directed graphs in text form. const Usage = `digraph: queries over directed graphs in text form.
Graph format: Graph format:
Each line contains zero or more whitespace-separated fields. Each line contains zero or more words. Words are separated by
unquoted whitespace; words may contain Go-style double-quoted portions,
allowing spaces and other characters to be expressed.
Each field declares a node, and if there are more than one, Each field declares a node, and if there are more than one,
an edge from the first to each subsequent one. an edge from the first to each subsequent one.
The graph is provided on the standard input. The graph is provided on the standard input.
@ -38,7 +45,7 @@ Graph format:
% cat clothes.txt % cat clothes.txt
socks shoes socks shoes
shorts pants "boxer shorts" pants
pants belt shoes pants belt shoes
shirt tie sweater shirt tie sweater
sweater jacket sweater jacket
@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset {
func parse(rd io.Reader) (graph, error) { func parse(rd io.Reader) (graph, error) {
g := make(graph) g := make(graph)
var linenum int
in := bufio.NewScanner(rd) in := bufio.NewScanner(rd)
for in.Scan() { for in.Scan() {
words := strings.Fields(in.Text()) linenum++
// Split into words, honoring double-quotes per Go spec.
words, err := split(in.Text())
if err != nil {
return nil, fmt.Errorf("at line %d: %v", linenum, err)
}
if len(words) > 0 { if len(words) > 0 {
g.addEdges(words[0], words[1:]...) g.addEdges(words[0], words[1:]...)
} }
@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error {
return nil return nil
} }
// -- Utilities --------------------------------------------------------
// split splits a line into words, which are generally separated by
// spaces, but Go-style double-quoted string literals are also supported.
// (This approximates the behaviour of the Bourne shell.)
//
//	`one "two three"` -> ["one" "two three"]
//	`a"\n"b` -> ["a\nb"]
//
// Unquoted text is copied into the current word byte-for-byte, so bytes
// that are not valid UTF-8 survive unchanged instead of being replaced
// by U+FFFD. A quoted portion is decoded with strconv.Unquote and
// appended to the current word; an empty quotation ("") still starts a
// word.
//
// split returns an error if a double quote does not begin a well-formed
// Go string literal.
func split(line string) ([]string, error) {
	var (
		words   []string
		inWord  bool
		current bytes.Buffer
	)
	for len(line) > 0 {
		r, size := utf8.DecodeRuneInString(line)
		switch {
		case unicode.IsSpace(r):
			// Whitespace terminates the word in progress, if any.
			if inWord {
				words = append(words, current.String())
				current.Reset()
				inWord = false
			}

		case r == '"':
			// Consume the whole quoted literal, not just the quote rune.
			n, ok := quotedLength(line)
			if !ok {
				return nil, errors.New("invalid quotation")
			}
			s, err := strconv.Unquote(line[:n])
			if err != nil {
				return nil, err
			}
			current.WriteString(s)
			inWord = true
			size = n

		default:
			// Copy the raw bytes rather than re-encoding r: for invalid
			// UTF-8, DecodeRuneInString yields (utf8.RuneError, 1), and
			// writing that rune would turn the byte into U+FFFD.
			current.WriteString(line[:size])
			inWord = true
		}
		line = line[size:]
	}
	if inWord {
		words = append(words, current.String())
	}
	return words, nil
}
// quotedLength returns the length in bytes of the prefix of input that
// contain a possibly-valid double-quoted Go string literal.
//
// On success, n is at least two (""); input[:n] may be passed to
// strconv.Unquote to interpret its value, and input[n:] contains the
// rest of the input.
//
// On failure, quotedLength returns false, and the entire input can be
// passed to strconv.Unquote if an informative error message is desired.
//
// quotedLength does not and need not detect all errors, such as
// invalid hex or octal escape sequences, since it assumes
// strconv.Unquote will be applied to the prefix. It guarantees only
// that if there is a prefix of input containing a valid string literal,
// its length is returned.
//
// TODO(adonovan): move this into a strconv-like utility package.
//
func quotedLength(input string) (n int, ok bool) {
var offset int
// next returns the rune at offset, or -1 on EOF.
// offset advances to just after that rune.
next := func() rune {
if offset < len(input) {
r, size := utf8.DecodeRuneInString(input[offset:])
offset += size
return r
}
return -1
}
if next() != '"' {
return // error: not a quotation
}
for {
r := next()
if r == '\n' || r < 0 {
return // error: string literal not terminated
}
if r == '"' {
return offset, true // success
}
if r == '\\' {
var skip int
switch next() {
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
skip = 0
case '0', '1', '2', '3', '4', '5', '6', '7':
skip = 2
case 'x':
skip = 2
case 'u':
skip = 4
case 'U':
skip = 8
default:
return // error: invalid escape
}
for i := 0; i < skip; i++ {
next()
}
}
}
}

View File

@ -3,6 +3,7 @@ package main
import ( import (
"bytes" "bytes"
"fmt" "fmt"
"reflect"
"strings" "strings"
"testing" "testing"
) )
@ -60,3 +61,61 @@ d c
// - test somepath (it's nondeterministic). // - test somepath (it's nondeterministic).
// - test errors // - test errors
} }
// TestSplit checks that split separates a line into words and decodes
// Go-style double-quoted portions, both as whole words and embedded in
// the middle of a word.
func TestSplit(t *testing.T) {
	type splitCase struct {
		line string
		want []string
	}
	cases := []splitCase{
		{
			line: `one "2a 2b" three`,
			want: []string{"one", "2a 2b", "three"},
		},
		{
			line: `one tw"\n\x0a\u000a\012"o three`,
			want: []string{"one", "tw\n\n\n\no", "three"},
		},
	}
	for _, tc := range cases {
		words, err := split(tc.line)
		if err != nil {
			t.Errorf("split(%s) failed: %v", tc.line, err)
		}
		// The result is still compared after an error, so a failing
		// case reports both problems.
		if reflect.DeepEqual(words, tc.want) {
			continue
		}
		t.Errorf("split(%s) = %v, want %v", tc.line, words, tc.want)
	}
}
// TestQuotedLength checks the prefix length that quotedLength reports
// for well-formed quotations, and that it reports failure for inputs
// that are not quotations, are unterminated, or contain malformed
// escapes.
func TestQuotedLength(t *testing.T) {
	// Inputs that begin with a quotation; want is the expected length
	// of that quotation in bytes.
	accepted := []struct {
		input string
		want  int
	}{
		{input: `"abc"`, want: 5},
		{input: `"abc"def`, want: 5},
		{input: `"abc\"d"ef`, want: 8}, // "abc\"d" is consumed, ef is residue
		{input: `"\012\n\x0a\u000a\U0000000a"`, want: 28},
		{input: "\"\xff\"", want: 3}, // bad UTF-8 is ok
		{input: `"\xff"`, want: 6},   // hex escape for bad UTF-8 is ok
	}
	for _, tc := range accepted {
		length, ok := quotedLength(tc.input)
		if !ok {
			// Treat failure as length zero so it shows up as a mismatch.
			length = 0
		}
		if length == tc.want {
			continue
		}
		t.Errorf("quotedLength(%s) = %d, want %d", tc.input, length, tc.want)
	}

	// Inputs for which quotedLength must report failure.
	rejected := []string{
		``,            // not a quotation
		`a`,           // not a quotation
		`'a'`,         // not a quotation
		`"a`,          // not terminated
		`"\0"`,        // short octal escape
		`"\x1"`,       // short hex escape
		`"\u000"`,     // short \u escape
		`"\U0000000"`, // short \U escape
		`"\k"`,        // invalid escape
		"\"ab\nc\"",   // newline
	}
	for _, input := range rejected {
		n, ok := quotedLength(input)
		if !ok {
			continue
		}
		t.Errorf("quotedLength(%s) = %d, want !ok", input, n)
	}
}