cmd/digraph: support Go-style double-quotes in input data.
+ test. LGTM=sameer R=sameer CC=golang-codereviews, gri https://golang.org/cl/170090043
This commit is contained in:
parent
fb44a24d4c
commit
c097262a24
|
@ -16,19 +16,26 @@ package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
"bytes"
|
||||||
|
"errors"
|
||||||
"flag"
|
"flag"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"sort"
|
"sort"
|
||||||
"strings"
|
"strconv"
|
||||||
|
"unicode"
|
||||||
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
const Usage = `digraph: queries over directed graphs in text form.
|
const Usage = `digraph: queries over directed graphs in text form.
|
||||||
|
|
||||||
Graph format:
|
Graph format:
|
||||||
|
|
||||||
Each line contains zero or more whitespace-separated fields.
|
Each line contains zero or more words. Words are separated by
|
||||||
|
unquoted whitespace; words may contain Go-style double-quoted portions,
|
||||||
|
allowing spaces and other characters to be expressed.
|
||||||
|
|
||||||
Each field declares a node, and if there are more than one,
|
Each field declares a node, and if there are more than one,
|
||||||
an edge from the first to each subsequent one.
|
an edge from the first to each subsequent one.
|
||||||
The graph is provided on the standard input.
|
The graph is provided on the standard input.
|
||||||
|
@ -38,7 +45,7 @@ Graph format:
|
||||||
|
|
||||||
% cat clothes.txt
|
% cat clothes.txt
|
||||||
socks shoes
|
socks shoes
|
||||||
shorts pants
|
"boxer shorts" pants
|
||||||
pants belt shoes
|
pants belt shoes
|
||||||
shirt tie sweater
|
shirt tie sweater
|
||||||
sweater jacket
|
sweater jacket
|
||||||
|
@ -225,9 +232,15 @@ func (g graph) sccs() []nodeset {
|
||||||
func parse(rd io.Reader) (graph, error) {
|
func parse(rd io.Reader) (graph, error) {
|
||||||
g := make(graph)
|
g := make(graph)
|
||||||
|
|
||||||
|
var linenum int
|
||||||
in := bufio.NewScanner(rd)
|
in := bufio.NewScanner(rd)
|
||||||
for in.Scan() {
|
for in.Scan() {
|
||||||
words := strings.Fields(in.Text())
|
linenum++
|
||||||
|
// Split into words, honoring double-quotes per Go spec.
|
||||||
|
words, err := split(in.Text())
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("at line %d: %v", linenum, err)
|
||||||
|
}
|
||||||
if len(words) > 0 {
|
if len(words) > 0 {
|
||||||
g.addEdges(words[0], words[1:]...)
|
g.addEdges(words[0], words[1:]...)
|
||||||
}
|
}
|
||||||
|
@ -409,3 +422,119 @@ func digraph(cmd string, args []string) error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- Utilities --------------------------------------------------------
|
||||||
|
|
||||||
|
// split splits a line into words, which are generally separated by
|
||||||
|
// spaces, but Go-style double-quoted string literals are also supported.
|
||||||
|
// (This approximates the behaviour of the Bourne shell.)
|
||||||
|
//
|
||||||
|
// `one "two three"` -> ["one" "two three"]
|
||||||
|
// `a"\n"b` -> ["a\nb"]
|
||||||
|
//
|
||||||
|
func split(line string) ([]string, error) {
|
||||||
|
var (
|
||||||
|
words []string
|
||||||
|
inWord bool
|
||||||
|
current bytes.Buffer
|
||||||
|
)
|
||||||
|
|
||||||
|
for len(line) > 0 {
|
||||||
|
r, size := utf8.DecodeRuneInString(line)
|
||||||
|
if unicode.IsSpace(r) {
|
||||||
|
if inWord {
|
||||||
|
words = append(words, current.String())
|
||||||
|
current.Reset()
|
||||||
|
inWord = false
|
||||||
|
}
|
||||||
|
} else if r == '"' {
|
||||||
|
var ok bool
|
||||||
|
size, ok = quotedLength(line)
|
||||||
|
if !ok {
|
||||||
|
return nil, errors.New("invalid quotation")
|
||||||
|
}
|
||||||
|
s, err := strconv.Unquote(line[:size])
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
current.WriteString(s)
|
||||||
|
inWord = true
|
||||||
|
} else {
|
||||||
|
current.WriteRune(r)
|
||||||
|
inWord = true
|
||||||
|
}
|
||||||
|
line = line[size:]
|
||||||
|
}
|
||||||
|
if inWord {
|
||||||
|
words = append(words, current.String())
|
||||||
|
}
|
||||||
|
return words, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// quotedLength reports the length in bytes of the prefix of input that
// contains a possibly-valid double-quoted Go string literal.
//
// On success ok is true and n is at least two (the literal ""):
// input[:n] may be handed to strconv.Unquote to obtain its value, and
// input[n:] is whatever follows the literal.
//
// On failure it returns (0, false); the entire input can then be passed
// to strconv.Unquote if an informative error message is desired.
//
// quotedLength is deliberately lenient: it does not check that the
// characters following \x, \u, \U or an octal digit are valid digits,
// because it assumes strconv.Unquote will be applied to the prefix.
// Its only guarantee is that if input begins with a valid string
// literal, the length of that literal is returned.
//
// TODO(adonovan): move this into a strconv-like utility package.
func quotedLength(input string) (n int, ok bool) {
	pos := 0 // byte offset of the next unread rune

	// read returns the rune at pos and steps past it.
	// At end of input it returns -1 and leaves pos alone.
	read := func() rune {
		if pos >= len(input) {
			return -1
		}
		r, width := utf8.DecodeRuneInString(input[pos:])
		pos += width
		return r
	}

	if read() != '"' {
		return 0, false // not a quotation
	}

	for {
		switch r := read(); {
		case r < 0 || r == '\n':
			return 0, false // string literal not terminated

		case r == '"':
			return pos, true // closing quote consumed

		case r == '\\':
			// Number of runes that follow the escape selector.
			var tail int
			switch read() {
			case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
				// single-character escape: nothing more to consume
			case '0', '1', '2', '3', '4', '5', '6', '7', 'x':
				tail = 2 // \ooo (first digit already read) or \xhh
			case 'u':
				tail = 4
			case 'U':
				tail = 8
			default:
				return 0, false // invalid escape (or end of input)
			}
			// Running off the end here is harmless: the next read in
			// the outer loop reports the unterminated literal.
			for ; tail > 0; tail-- {
				read()
			}
		}
	}
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package main
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
@ -60,3 +61,61 @@ d c
|
||||||
// - test somepath (it's nondeterministic).
|
// - test somepath (it's nondeterministic).
|
||||||
// - test errors
|
// - test errors
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSplit(t *testing.T) {
|
||||||
|
for _, test := range []struct {
|
||||||
|
line string
|
||||||
|
want []string
|
||||||
|
}{
|
||||||
|
{`one "2a 2b" three`, []string{"one", "2a 2b", "three"}},
|
||||||
|
{`one tw"\n\x0a\u000a\012"o three`, []string{"one", "tw\n\n\n\no", "three"}},
|
||||||
|
} {
|
||||||
|
got, err := split(test.line)
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("split(%s) failed: %v", test.line, err)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(got, test.want) {
|
||||||
|
t.Errorf("split(%s) = %v, want %v", test.line, got, test.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQuotedLength(t *testing.T) {
|
||||||
|
for _, test := range []struct {
|
||||||
|
input string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{`"abc"`, 5},
|
||||||
|
{`"abc"def`, 5},
|
||||||
|
{`"abc\"d"ef`, 8}, // "abc\"d" is consumed, ef is residue
|
||||||
|
{`"\012\n\x0a\u000a\U0000000a"`, 28},
|
||||||
|
{"\"\xff\"", 3}, // bad UTF-8 is ok
|
||||||
|
{`"\xff"`, 6}, // hex escape for bad UTF-8 is ok
|
||||||
|
} {
|
||||||
|
got, ok := quotedLength(test.input)
|
||||||
|
if !ok {
|
||||||
|
got = 0
|
||||||
|
}
|
||||||
|
if got != test.want {
|
||||||
|
t.Errorf("quotedLength(%s) = %d, want %d", test.input, got, test.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// errors
|
||||||
|
for _, input := range []string{
|
||||||
|
``, // not a quotation
|
||||||
|
`a`, // not a quotation
|
||||||
|
`'a'`, // not a quotation
|
||||||
|
`"a`, // not terminated
|
||||||
|
`"\0"`, // short octal escape
|
||||||
|
`"\x1"`, // short hex escape
|
||||||
|
`"\u000"`, // short \u escape
|
||||||
|
`"\U0000000"`, // short \U escape
|
||||||
|
`"\k"`, // invalid escape
|
||||||
|
"\"ab\nc\"", // newline
|
||||||
|
} {
|
||||||
|
if n, ok := quotedLength(input); ok {
|
||||||
|
t.Errorf("quotedLength(%s) = %d, want !ok", input, n)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue