html2article: Ignore empty anchor elements and obtain the original url from Google Docs

LGTM=adg
R=adg, campoy
CC=golang-codereviews
https://golang.org/cl/127560043
This commit is contained in:
Francesc Campoy 2014-08-19 17:55:46 -07:00
parent f579fb3656
commit e8a1924bfb
1 changed file with 19 additions and 1 deletion

View File

@@ -14,6 +14,7 @@ import (
"fmt" "fmt"
"io" "io"
"log" "log"
"net/url"
"os" "os"
"regexp" "regexp"
"strings" "strings"
@@ -210,7 +211,23 @@ func text(n *html.Node) string {
indent(&buf, childText(n)) indent(&buf, childText(n))
buf.WriteByte('\n') buf.WriteByte('\n')
case atom.A: case atom.A:
fmt.Fprintf(&buf, "[[%s][%s]]", attr(n, "href"), childText(n)) href, text := attr(n, "href"), childText(n)
// Skip links with no text.
if strings.TrimSpace(text) == "" {
break
}
// Don't emit empty links.
if strings.TrimSpace(href) == "" {
buf.WriteString(text)
break
}
// Use original url for Google Docs redirections.
if u, err := url.Parse(href); err != nil {
log.Println("parsing url %q: %v", href, err)
} else if u.Host == "www.google.com" && u.Path == "/url" {
href = u.Query().Get("q")
}
fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
case atom.Code: case atom.Code:
buf.WriteString(highlight(n, "`")) buf.WriteString(highlight(n, "`"))
case atom.B: case atom.B:
@@ -233,6 +250,7 @@ func text(n *html.Node) string {
} }
fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u) fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
} }
case atom.Title:
default: default:
return true return true
} }