From 58c8b8a73844ec94edcf645c1a8f82697292478c Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Fri, 2 Jan 2015 17:16:44 -0800 Subject: [PATCH] dashboard/coordinator: use VMs to run non-Linux builders Currently this is running an OpenBSD amd64 builder, but other operating systems are now easy. Updates golang/go#8642 Updates golang/go#9492 Change-Id: I4e1026c49ee4d253dfcf78c7606767294784f02e Reviewed-on: https://go-review.googlesource.com/2264 Reviewed-by: Andrew Gerrand --- dashboard/coordinator/main.go | 653 ++++++++++++++++++++++++++++++---- 1 file changed, 586 insertions(+), 67 deletions(-) diff --git a/dashboard/coordinator/main.go b/dashboard/coordinator/main.go index 40eb9058..3a883004 100644 --- a/dashboard/coordinator/main.go +++ b/dashboard/coordinator/main.go @@ -8,10 +8,13 @@ package main // import "golang.org/x/tools/dashboard/coordinator" import ( + "archive/tar" "bytes" + "compress/gzip" "crypto/hmac" "crypto/md5" "encoding/json" + "errors" "flag" "fmt" "html" @@ -19,8 +22,11 @@ import ( "io/ioutil" "log" "net/http" + "net/url" "os" "os/exec" + "path" + "regexp" "sort" "strconv" "strings" @@ -34,8 +40,8 @@ import ( ) var ( - masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.") - maxBuilds = flag.Int("maxbuilds", 6, "Max concurrent builds") + masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.") + maxLocalBuilds = flag.Int("maxbuilds", 6, "Max concurrent Docker builds (VM builds don't count)") cleanZones = flag.String("zones", "us-central1-a,us-central1-b,us-central1-f", "Comma-separated list of zones to periodically clean of stale build VMs (ones that failed to shut themselves down)") @@ -47,8 +53,8 @@ var ( var ( startTime = time.Now() - builders = map[string]buildConfig{} // populated once at startup - watchers = map[string]watchConfig{} // populated once at startup + builders = map[string]buildConfig{} // populated at startup, keys like "openbsd-amd64-56" + watchers = map[string]watchConfig{} // populated at startup, keyed by repo, e.g. "https://go.googlesource.com/go" donec = make(chan builderRev) // reports of finished builders statusMu sync.Mutex // guards both status (ongoing ones) and statusDone (just finished) @@ -58,6 +64,37 @@ var ( const maxStatusDone = 30 +// Initialized by initGCE: +var ( + projectID string + projectZone string + computeService *compute.Service +) + +func initGCE() error { + if !metadata.OnGCE() { + return errors.New("not running on GCE; VM support disabled") + } + var err error + projectID, err = metadata.ProjectID() + if err != nil { + return fmt.Errorf("failed to get current GCE ProjectID: %v", err) + } + projectZone, err = metadata.Get("instance/zone") + if err != nil || projectZone == "" { + return fmt.Errorf("failed to get current GCE zone: %v", err) + } + // Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a". + projectZone = path.Base(projectZone) + if !hasComputeScope() { + return errors.New("The coordinator is not running with access to read and write Compute resources. 
VM support disabled.") + + } + ts := google.ComputeTokenSource("default") + computeService, _ = compute.New(oauth2.NewClient(oauth2.NoContext, ts)) + return nil +} + type imageInfo struct { url string // of tar file @@ -74,8 +111,15 @@ var images = map[string]*imageInfo{ "gobuilders/linux-x86-sid": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.sid.tar.gz"}, } +// A buildConfig describes how to run either a Docker-based or VM-based build. type buildConfig struct { - name string // "linux-amd64-race" + name string // "linux-amd64-race" + + // VM-specific settings: (used if vmImage != "") + vmImage string // e.g. "openbsd-amd64-56" + machineType string // optional GCE instance type + + // Docker-specific settings: (used if vmImage == "") image string // Docker image to use to build cmd string // optional -cmd flag (relative to go/src/) env []string // extra environment ("key=value") pairs @@ -83,6 +127,31 @@ type buildConfig struct { tool string // the tool this configuration is for } +func (c *buildConfig) usesDocker() bool { return c.vmImage == "" } +func (c *buildConfig) usesVM() bool { return c.vmImage != "" } + +func (c *buildConfig) MachineType() string { + if v := c.machineType; v != "" { + return v + } + return "n1-highcpu-4" +} + +// recordResult sends build results to the dashboard +func (b *buildConfig) recordResult(ok bool, hash, buildLog string, runTime time.Duration) error { + req := map[string]interface{}{ + "Builder": b.name, + "PackagePath": "", + "Hash": hash, + "GoHash": "", + "OK": ok, + "Log": buildLog, + "RunTime": runTime, + } + args := url.Values{"key": {builderKey(b.name)}, "builder": {b.name}} + return dash("POST", "result", args, req, nil) +} + type watchConfig struct { repo string // "https://go.googlesource.com/go" dash string // "https://build.golang.org/" (must end in /) @@ -91,6 +160,11 @@ type watchConfig struct { func main() { flag.Parse() + + if err := initGCE(); err != nil { + log.Printf("VM support disabled due to error initializing GCE: %v", err) + } + addBuilder(buildConfig{name: "linux-386"}) addBuilder(buildConfig{name: "linux-386-387", env: []string{"GO386=387"}}) addBuilder(buildConfig{name: "linux-amd64"}) @@ -118,6 +192,10 @@ func main() { addBuilder(buildConfig{name: "linux-386-clang", image: "gobuilders/linux-x86-clang"}) addBuilder(buildConfig{name: "linux-amd64-clang", image: "gobuilders/linux-x86-clang"}) + // VMs: + // addBuilder(buildConfig{name: "openbsd-amd64-gce56", vmImage: "openbsd-amd64-56"}) + // addBuilder(buildConfig{name: "plan9-386-gce", vmImage: "plan9-386"}) + addWatcher(watchConfig{repo: "https://go.googlesource.com/go", dash: "https://build.golang.org/"}) // TODO(adg,cmang): fix gccgo watcher // addWatcher(watchConfig{repo: "https://code.google.com/p/gofrontend", dash: "https://build.golang.org/gccgo/"}) @@ -146,6 +224,7 @@ func main() { go cleanUpOldContainers() go cleanUpOldVMs() + stopWatchers() // clean up before we start new ones for _, watcher := range watchers { if err := startWatching(watchers[watcher.repo]); err != nil { log.Printf("Error starting watcher for %s: %v", watcher.repo, err) @@ -161,17 +240,11 @@ func main() { for { select { case work := <-workc: - log.Printf("workc received %+v; len(status) = %v, maxBuilds = %v; cur = %p", work, len(status), *maxBuilds, status[work]) - mayBuild := mayBuildRev(work) - if mayBuild { - if numBuilds() > *maxBuilds { - mayBuild = false - } - } - if mayBuild { - if st, err := startBuilding(builders[work.name], work.rev); err == nil { + log.Printf("workc 
received %+v; len(status) = %v, maxLocalBuilds = %v; cur = %p", work, len(status), *maxLocalBuilds, status[work]) + if mayBuildRev(work) { + conf := builders[work.name] + if st, err := startBuilding(conf, work.rev); err == nil { setStatus(work, st) - log.Printf("%v now building in %v", work, st.container) } else { log.Printf("Error starting to build %v: %v", work, err) } @@ -193,10 +266,28 @@ func numCurrentBuilds() int { return len(status) } +// mayBuildRev reports whether the build type & revision should be started. +// It returns true if it's not already building, and there is capacity. func mayBuildRev(work builderRev) bool { + conf := builders[work.name] + statusMu.Lock() - defer statusMu.Unlock() - return len(status) < *maxBuilds && status[work] == nil + _, building := status[work] + statusMu.Unlock() + + if building { + return false + } + if conf.usesVM() { + // These don't count towards *maxLocalBuilds. + return true + } + numDocker, err := numDockerBuilds() + if err != nil { + log.Printf("not starting %v due to docker ps failure: %v", work, err) + return false + } + return numDocker < *maxLocalBuilds } func setStatus(work builderRev, st *buildStatus) { @@ -220,14 +311,20 @@ func markDone(work builderRev) { statusDone = append(statusDone, st) } -func getStatus(work builderRev) *buildStatus { +// statusPtrStr disambiguates which status to return if there are +// multiple in the history (e.g. recent failures where the build +// didn't finish for reasons outside of all.bash failing) +func getStatus(work builderRev, statusPtrStr string) *buildStatus { statusMu.Lock() defer statusMu.Unlock() - if st, ok := status[work]; ok { + match := func(st *buildStatus) bool { + return statusPtrStr == "" || fmt.Sprintf("%p", st) == statusPtrStr + } + if st, ok := status[work]; ok && match(st) { return st } for _, st := range statusDone { - if st.builderRev == work { + if st.builderRev == work && match(st) { return st } } @@ -248,14 +345,23 @@ func handleStatus(w http.ResponseWriter, r *http.Request) { active = append(active, st) } recent = append(recent, statusDone...) + numTotal := len(status) + numDocker, err := numDockerBuilds() statusMu.Unlock() sort.Sort(byAge(active)) - sort.Sort(byAge(recent)) + sort.Sort(sort.Reverse(byAge(recent))) io.WriteString(w, "
<html><body><h1>Go build coordinator</h1>")
-	fmt.Fprintf(w, "<h2>running</h2><p>%d of max %d builds running:</p><pre>", len(status), *maxBuilds)
+	if err != nil {
+		fmt.Fprintf(w, "<h2>Error</h2>Error fetching Docker build count: %s\n", html.EscapeString(err.Error()))
+	}
+
+	fmt.Fprintf(w, "<h2>running</h2><p>%d total builds active (Docker: %d/%d; VMs: %d/∞):",
+		numTotal, numDocker, *maxLocalBuilds, numTotal-numDocker)
+
+	io.WriteString(w, "<pre>")
 	for _, st := range active {
 		io.WriteString(w, st.htmlStatusLine())
 	}
@@ -276,7 +382,7 @@ func diskFree() string {
 }
 
 func handleLogs(w http.ResponseWriter, r *http.Request) {
-	st := getStatus(builderRev{r.FormValue("name"), r.FormValue("rev")})
+	st := getStatus(builderRev{r.FormValue("name"), r.FormValue("rev")}, r.FormValue("st"))
 	if st == nil {
 		http.NotFound(w, r)
 		return
@@ -367,6 +473,10 @@ func (conf buildConfig) dockerRunArgs(rev string) (args []string) {
 }
 
 func addBuilder(c buildConfig) {
+	if c.tool == "gccgo" {
+		// TODO(cmang,bradfitz,adg): fix gccgo
+		return
+	}
 	if c.name == "" {
 		panic("empty name")
 	}
@@ -394,8 +504,11 @@ func addBuilder(c buildConfig) {
 	if strings.HasPrefix(c.name, "linux-") && c.image == "" {
 		c.image = "gobuilders/linux-x86-base"
 	}
-	if c.image == "" {
-		panic("empty image")
+	if c.image == "" && c.vmImage == "" {
+		panic("empty image and vmImage")
+	}
+	if c.image != "" && c.vmImage != "" {
+		panic("can't specify both image and vmImage")
 	}
 	builders[c.name] = c
 }
@@ -478,21 +591,29 @@ func condUpdateImage(img string) error {
 	return nil
 }
 
-// numBuilds finds the number of go builder instances currently running.
-func numBuilds() int {
-	out, _ := exec.Command("docker", "ps").Output()
-	numBuilds := 0
-	ps := bytes.Split(out, []byte("\n"))
-	for _, p := range ps {
-		if bytes.HasPrefix(p, []byte("gobuilders/")) {
-			numBuilds++
+// numDockerBuilds finds the number of go builder instances currently running.
+func numDockerBuilds() (n int, err error) {
+	out, err := exec.Command("docker", "ps").Output()
+	if err != nil {
+		return 0, err
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if strings.Contains(line, "gobuilders/") {
+			n++
 		}
 	}
-	log.Printf("num current docker builds: %d", numBuilds)
-	return numBuilds
+	return n, nil
 }
 
 func startBuilding(conf buildConfig, rev string) (*buildStatus, error) {
+	if conf.usesVM() {
+		return startBuildingInVM(conf, rev)
+	} else {
+		return startBuildingInDocker(conf, rev)
+	}
+}
+
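+// startBuildingInDocker starts a Docker container building rev and returns
+// its buildStatus. The build runs asynchronously: one goroutine waits for
+// the container to exit and reports to donec, another streams the
+// container's logs into the buildStatus.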
+func startBuildingInDocker(conf buildConfig, rev string) (*buildStatus, error) {
 	if err := condUpdateImage(conf.image); err != nil {
 		log.Printf("Failed to setup container for %v %v: %v", conf.name, rev, err)
 		return nil, err
@@ -505,14 +626,16 @@ func startBuilding(conf buildConfig, rev string) (*buildStatus, error) {
 		return nil, err
 	}
 	container := strings.TrimSpace(string(all))
-	bs := &buildStatus{
-		builderRev: builderRev{
-			name: conf.name,
-			rev:  rev,
-		},
-		container: container,
-		start:     time.Now(),
+	brev := builderRev{
+		name: conf.name,
+		rev:  rev,
 	}
+	st := &buildStatus{
+		builderRev: brev,
+		container:  container,
+		start:      time.Now(),
+	}
+	log.Printf("%v now building in Docker container %v", brev, st.container)
 	go func() {
 		all, err := exec.Command("docker", "wait", container).CombinedOutput()
 		output := strings.TrimSpace(string(all))
@@ -521,15 +644,15 @@ func startBuilding(conf buildConfig, rev string) (*buildStatus, error) {
 			exit, err := strconv.Atoi(output)
 			ok = (err == nil && exit == 0)
 		}
-		bs.setDone(ok)
+		st.setDone(ok)
 		log.Printf("docker wait %s/%s: %v, %s", container, rev, err, output)
 		donec <- builderRev{conf.name, rev}
 		exec.Command("docker", "rm", container).Run()
 	}()
 	go func() {
 		cmd := exec.Command("docker", "logs", "-f", container)
-		cmd.Stdout = bs
-		cmd.Stderr = bs
+		cmd.Stdout = st
+		cmd.Stderr = st
 		if err := cmd.Run(); err != nil {
 			// The docker logs subcommand always returns
 			// success, even if the underlying process
@@ -537,7 +660,273 @@ func startBuilding(conf buildConfig, rev string) (*buildStatus, error) {
 			log.Printf("failed to follow docker logs of %s: %v", container, err)
 		}
 	}()
-	return bs, nil
+	return st, nil
+}
+
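+// osArchRx matches the "GOOS-GOARCH" prefix of a builder name
+// such as "openbsd-amd64-gce56".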
+var osArchRx = regexp.MustCompile(`^(\w+-\w+)`)
+
+// startBuildingInVM starts a VM on GCE running the buildlet binary to build rev.
+func startBuildingInVM(conf buildConfig, rev string) (*buildStatus, error) {
+	brev := builderRev{
+		name: conf.name,
+		rev:  rev,
+	}
+	st := &buildStatus{
+		builderRev: brev,
+		start:      time.Now(),
+	}
+
+	// name is the project-wide unique name of the GCE instance. It can't be longer
+	// than 61 bytes, so we only use the first 8 bytes of the rev.
+	name := "buildlet-" + conf.name + "-" + rev[:8]
+
+	// buildletURL is the URL of the buildlet binary which the VMs
+	// are configured to download at boot and run. This lets us
+	// update the buildlet more easily than rebuilding the whole
+	// VM image. We put this URL in a well-known GCE metadata attribute.
+	// The value will be of the form:
+	//  http://storage.googleapis.com/go-builder-data/buildlet.GOOS-GOARCH
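+	// For example, a builder named "openbsd-amd64-gce56" downloads
+	//  http://storage.googleapis.com/go-builder-data/buildlet.openbsd-amd64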
+	m := osArchRx.FindStringSubmatch(conf.name)
+	if m == nil {
+		return nil, fmt.Errorf("invalid builder name %q", conf.name)
+	}
+	buildletURL := "http://storage.googleapis.com/go-builder-data/buildlet." + m[1]
+
+	prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID
+	machType := prefix + "/zones/" + projectZone + "/machineTypes/" + conf.MachineType()
+
+	instance := &compute.Instance{
+		Name:        name,
+		Description: fmt.Sprintf("Go Builder building %s %s", conf.name, rev),
+		MachineType: machType,
+		Disks: []*compute.AttachedDisk{
+			{
+				AutoDelete: true,
+				Boot:       true,
+				Type:       "PERSISTENT",
+				InitializeParams: &compute.AttachedDiskInitializeParams{
+					DiskName:    name,
+					SourceImage: "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + conf.vmImage,
+					DiskType:    "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + projectZone + "/diskTypes/pd-ssd",
+				},
+			},
+		},
+		Tags: &compute.Tags{
+			// Warning: do NOT list "http-server" or "allow-ssh" (our
+			// project's custom tag to allow ssh access) here; the
+			// buildlet provides full remote code execution.
+			Items: []string{},
+		},
+		Metadata: &compute.Metadata{
+			Items: []*compute.MetadataItems{
+				{
+					Key:   "buildlet-binary-url",
+					Value: buildletURL,
+				},
+				// In case the VM gets away from us (generally: if the
+				// coordinator dies while a build is running), we set
+				// this attribute to the time at which the VM should be
+				// killed, so we can kill it later when the coordinator
+				// is restarted. The cleanUpOldVMs goroutine loop
+				// handles that killing.
+				{
+					Key:   "delete-at",
+					Value: fmt.Sprint(time.Now().Add(30 * time.Minute).Unix()),
+				},
+			},
+		},
+		NetworkInterfaces: []*compute.NetworkInterface{
+			&compute.NetworkInterface{
+				AccessConfigs: []*compute.AccessConfig{
+					&compute.AccessConfig{
+						Type: "ONE_TO_ONE_NAT",
+						Name: "External NAT",
+					},
+				},
+				Network: prefix + "/global/networks/default",
+			},
+		},
+	}
+	op, err := computeService.Instances.Insert(projectID, projectZone, instance).Do()
+	if err != nil {
+		return nil, fmt.Errorf("Failed to create instance: %v", err)
+	}
+	st.createOp = op.Name
+	st.instName = name
+	log.Printf("%v now building in VM %v", brev, st.instName)
+	// Start the goroutine to monitor the VM now that it's booting. This might
+	// take minutes for it to come up, and then even more time to do the build.
+	go func() {
+		err := watchVM(st)
+		deleteVM(projectZone, st.instName)
+		st.setDone(err == nil)
+		if err != nil {
+			fmt.Fprintf(st, "\n\nError: %v\n", err)
+		}
+		donec <- builderRev{conf.name, rev}
+	}()
+	return st, nil
+}
+
+// watchVM monitors a VM doing a build.
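+// It waits for the instance-creation operation to complete, polls the
+// buildlet's HTTP server until it responds, pushes a VERSION file and a
+// source tarball fetched from Gerrit, runs all.bash (or the OS-specific
+// equivalent) via the buildlet's /exec handler, and streams the build
+// output into st. A non-nil error means the build did not succeed.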
+func watchVM(st *buildStatus) (err error) {
+	// goodRes reports whether the HTTP response is usable. On failure it
+	// assigns to watchVM's named return value err (the parameter is named
+	// resErr so that it does not shadow err), which the bare returns at
+	// the call sites below then propagate.
+	goodRes := func(res *http.Response, resErr error, what string) bool {
+		if resErr != nil {
+			err = fmt.Errorf("%s: %v", what, resErr)
+			return false
+		}
+		if res.StatusCode/100 != 2 {
+			err = fmt.Errorf("%s: %v", what, res.Status)
+			return false
+		}
+		return true
+	}
+	st.logEventTime("instance_create_requested")
+	// Wait for instance create operation to succeed.
+OpLoop:
+	for {
+		time.Sleep(2 * time.Second)
+		op, err := computeService.ZoneOperations.Get(projectID, projectZone, st.createOp).Do()
+		if err != nil {
+			return fmt.Errorf("Failed to get op %s: %v", st.createOp, err)
+		}
+		switch op.Status {
+		case "PENDING", "RUNNING":
+			continue
+		case "DONE":
+			if op.Error != nil {
+				for _, operr := range op.Error.Errors {
+					return fmt.Errorf("Error creating instance: %+v", operr)
+				}
+				return errors.New("Failed to start.")
+			}
+			break OpLoop
+		default:
+			log.Fatalf("Unknown status %q: %+v", op.Status, op)
+		}
+	}
+	st.logEventTime("instance_created")
+
+	inst, err := computeService.Instances.Get(projectID, projectZone, st.instName).Do()
+	if err != nil {
+		return fmt.Errorf("Error getting instance %s details after creation: %v", st.instName, err)
+	}
+	st.logEventTime("got_instance_info")
+
+	// Find its internal IP.
+	var ip string
+	for _, iface := range inst.NetworkInterfaces {
+		if strings.HasPrefix(iface.NetworkIP, "10.") {
+			ip = iface.NetworkIP
+		}
+	}
+	if ip == "" {
+		return errors.New("didn't find its internal IP address")
+	}
+
+	// Wait for it to boot and its buildlet to come up on port 80.
+	st.logEventTime("waiting_for_buildlet")
+	buildletURL := "http://" + ip
+	const numTries = 60
+	var alive bool
+	for i := 1; i <= numTries; i++ {
+		res, err := http.Get(buildletURL)
+		if err != nil {
+			time.Sleep(1 * time.Second)
+			continue
+		}
+		res.Body.Close()
+		if res.StatusCode != 200 {
+			return fmt.Errorf("buildlet returned HTTP status code %d on try number %d", res.StatusCode, i)
+		}
+		st.logEventTime("buildlet_up")
+		alive = true
+		break
+	}
+	if !alive {
+		return fmt.Errorf("buildlet didn't come up in %d seconds", numTries)
+	}
+
+	// Write the VERSION file.
+	st.logEventTime("start_write_version_tar")
+	verReq, err := http.NewRequest("PUT", buildletURL+"/writetgz", versionTgz(st.rev))
+	if err != nil {
+		return err
+	}
+	verRes, err := http.DefaultClient.Do(verReq)
+	if !goodRes(verRes, err, "writing VERSION tgz") {
+		return
+	}
+
+	// Feed the buildlet a tar file for it to extract.
+	// TODO: cache these.
+	st.logEventTime("start_fetch_gerrit_tgz")
+	tarRes, err := http.Get("https://go.googlesource.com/go/+archive/" + st.rev + ".tar.gz")
+	if !goodRes(tarRes, err, "fetching tarball from Gerrit") {
+		return
+	}
+
+	st.logEventTime("start_write_tar")
+	putReq, err := http.NewRequest("PUT", buildletURL+"/writetgz", tarRes.Body)
+	if err != nil {
+		tarRes.Body.Close()
+		return err
+	}
+	putRes, err := http.DefaultClient.Do(putReq)
+	st.logEventTime("end_write_tar")
+	tarRes.Body.Close()
+	if !goodRes(putRes, err, "writing tarball to buildlet") {
+		return
+	}
+
+	// Run the builder
+	cmd := "all.bash"
+	if strings.HasPrefix(st.name, "windows-") {
+		cmd = "all.bat"
+	} else if strings.HasPrefix(st.name, "plan9-") {
+		cmd = "all.rc"
+	}
+	execStartTime := time.Now()
+	st.logEventTime("start_exec")
+	res, err := http.PostForm(buildletURL+"/exec", url.Values{"cmd": {"src/" + cmd}})
+	if !goodRes(res, err, "running "+cmd) {
+		return
+	}
+	defer res.Body.Close()
+	st.logEventTime("running_exec")
+	// Stream the output:
+	if _, err := io.Copy(st, res.Body); err != nil {
+		return fmt.Errorf("error copying response: %v", err)
+	}
+	st.logEventTime("done")
+	state := res.Trailer.Get("Process-State")
+
+	// Don't record the result on the dashboard unless we heard the
+	// trailer from the buildlet; otherwise the failure was probably
+	// some unrelated error (like the VM being killed, or the buildlet
+	// crashing due to e.g. https://golang.org/issue/9309, since we
+	// require a tip build of the buildlet to get Trailer support).
+	if state != "" {
+		conf := builders[st.name]
+		var log string
+		if state != "ok" {
+			log = st.logs()
+		}
+		if err := conf.recordResult(state == "ok", st.rev, log, time.Since(execStartTime)); err != nil {
+			return fmt.Errorf("Status was %q but failed to report it to the dashboard: %v", state, err)
+		}
+	}
+	if state != "ok" {
+		return fmt.Errorf("got Trailer process state %q", state)
+	}
+	return nil
+}
+
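+// eventAndTime is a named build event and the time at which it
+// occurred, shown on the status page.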
+type eventAndTime struct {
+	evt string
+	t   time.Time
 }
 
 // buildStatus is the status of a build.
@@ -547,10 +936,15 @@ type buildStatus struct {
 	start     time.Time
 	container string // container ID for docker, else it's a VM
 
+	// Immutable, used by VM only:
+	createOp string // Instances.Insert operation name
+	instName string
+
 	mu        sync.Mutex   // guards following
 	done      time.Time    // finished running
 	succeeded bool         // set when done
 	output    bytes.Buffer // stdout and stderr
+	events    []eventAndTime
 }
 
 func (st *buildStatus) setDone(succeeded bool) {
@@ -560,6 +954,12 @@ func (st *buildStatus) setDone(succeeded bool) {
 	st.done = time.Now()
 }
 
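+// logEventTime records that the named event (e.g. "buildlet_up")
+// happened now, for display on the status page.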
+func (st *buildStatus) logEventTime(event string) {
+	st.mu.Lock()
+	defer st.mu.Unlock()
+	st.events = append(st.events, eventAndTime{event, time.Now()})
+}
+
 // htmlStatusLine returns the HTML to show within the <pre> block on
 // the main page's list of active builds.
 func (st *buildStatus) htmlStatusLine() string {
@@ -583,8 +983,11 @@ func (st *buildStatus) htmlStatusLine() string {
 		buf.WriteString(", failed")
 	}
 
+	logsURL := fmt.Sprintf("/logs?name=%s&rev=%s&st=%p", st.name, st.rev, st)
 	if st.container != "" {
-		fmt.Fprintf(&buf, " in container <a href='/logs?name=%s&rev=%s'>%s</a>", st.name, st.rev, st.container)
+		fmt.Fprintf(&buf, " in container <a href='%s'>%s</a>", logsURL, st.container)
+	} else {
+		fmt.Fprintf(&buf, " in VM <a href='%s'>%s</a>", logsURL, st.instName)
 	}
 
 	t := st.done
@@ -592,6 +995,17 @@ func (st *buildStatus) htmlStatusLine() string {
 		t = st.start
 	}
 	fmt.Fprintf(&buf, ", %v ago\n", time.Since(t))
+	for i, evt := range st.events {
+		var elapsed string
+		if i != 0 {
+			elapsed = fmt.Sprintf("+%0.1fs", evt.t.Sub(st.events[i-1].t).Seconds())
+		}
+		msg := evt.evt
+		if msg == "running_exec" {
+			msg = fmt.Sprintf("<a href='%s'>%s</a>", logsURL, msg)
+		}
+		fmt.Fprintf(&buf, " %7s %v %s\n", elapsed, evt.t.Format(time.RFC3339), msg)
+	}
 	return buf.String()
 }
 
@@ -607,10 +1021,28 @@ func (st *buildStatus) Write(p []byte) (n int, err error) {
 	st.mu.Lock()
 	defer st.mu.Unlock()
 	const maxBufferSize = 2 << 20 // 2MB of output is way more than we expect.
+	plen := len(p)
 	if st.output.Len()+len(p) > maxBufferSize {
 		p = p[:maxBufferSize-st.output.Len()]
 	}
-	return st.output.Write(p)
+	st.output.Write(p) // bytes.Buffer can't fail
+	return plen, nil
+}
+
+// Stop any previous go-commit-watcher Docker tasks, so they don't
+// pile up upon restarts of the coordinator.
+func stopWatchers() {
+	out, err := exec.Command("docker", "ps").Output()
+	if err != nil {
+		return
+	}
+	for _, line := range strings.Split(string(out), "\n") {
+		if !strings.Contains(line, "go-commit-watcher:") {
+			continue
+		}
+		f := strings.Fields(line)
+		exec.Command("docker", "rm", "-f", "-v", f[0]).Run()
+	}
 }
 
 func startWatching(conf watchConfig) (err error) {
@@ -723,16 +1155,13 @@ func oldContainers() []string {
 // "delete-at" metadata attribute on them when created to some time
 // that's well beyond their expected lifetime.
 func cleanUpOldVMs() {
-	if !hasComputeScope() {
-		log.Printf("The coordinator is not running with access to read and write Compute resources. Background VM cleaning disabled.")
+	if computeService == nil {
 		return
 	}
-	ts := google.ComputeTokenSource("default")
-	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
 	for {
 		for _, zone := range strings.Split(*cleanZones, ",") {
 			zone = strings.TrimSpace(zone)
-			if err := cleanZoneVMs(computeService, zone); err != nil {
+			if err := cleanZoneVMs(zone); err != nil {
 				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
 			}
 		}
@@ -741,11 +1170,7 @@ func cleanUpOldVMs() {
 }
 
 // cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
-func cleanZoneVMs(svc *compute.Service, zone string) error {
-	proj, err := metadata.ProjectID()
-	if err != nil {
-		return fmt.Errorf("failed to get current GCE ProjectID: %v", err)
-	}
+func cleanZoneVMs(zone string) error {
 	// Fetch the first 500 (default) running instances and clean
 	// those. We expect that we'll be running many fewer than
 	// that. Even if we have more, eventually the first 500 will
@@ -753,7 +1178,7 @@ func cleanZoneVMs(svc *compute.Service, zone string) error {
 	// partially-different 500.
 	// TODO(bradfitz): revisit this code if we ever start running
 	// thousands of VMs.
-	list, err := svc.Instances.List(proj, zone).Do()
+	list, err := computeService.Instances.List(projectID, zone).Do()
 	if err != nil {
 		return fmt.Errorf("listing instances: %v", err)
 	}
@@ -770,7 +1195,7 @@ func cleanZoneVMs(svc *compute.Service, zone string) error {
 				}
 				if err == nil && time.Now().Unix() > unixDeadline {
 					log.Printf("Deleting expired VM %q in zone %q ...", inst.Name, zone)
-					deleteVM(svc, zone, inst.Name)
+					deleteVM(zone, inst.Name)
 				}
 			}
 		}
@@ -778,13 +1203,8 @@ func cleanZoneVMs(svc *compute.Service, zone string) error {
 	return nil
 }
 
-func deleteVM(svc *compute.Service, zone, instName string) {
-	proj, err := metadata.ProjectID()
-	if err != nil {
-		log.Printf("failed to get project id to delete instace: %v", err)
-		return
-	}
-	op, err := svc.Instances.Delete(proj, zone, instName).Do()
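+// deleteVM requests deletion of the named GCE instance in the given
+// zone. Errors are logged but otherwise ignored.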
+func deleteVM(zone, instName string) {
+	op, err := computeService.Instances.Delete(projectID, zone, instName).Do()
 	if err != nil {
 		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
 		return
@@ -808,3 +1228,102 @@ func hasComputeScope() bool {
 	}
 	return false
 }
+
+// dash is copied from the builder binary. It runs the given method and command on the dashboard.
+//
+// TODO(bradfitz,adg): unify this somewhere?
+//
+// If args is non-nil it is encoded as the URL query string.
+// If req is non-nil it is JSON-encoded and passed as the body of the HTTP POST.
+// If resp is non-nil the server's response is decoded into the value pointed
+// to by resp (resp must be a pointer).
+func dash(meth, cmd string, args url.Values, req, resp interface{}) error {
+	const builderVersion = 1 // keep in sync with dashboard/app/build/handler.go
+	argsCopy := url.Values{"version": {fmt.Sprint(builderVersion)}}
+	for k, v := range args {
+		if k == "version" {
+			panic(`dash: reserved args key: "version"`)
+		}
+		argsCopy[k] = v
+	}
+	var r *http.Response
+	var err error
+	cmd = "https://build.golang.org/" + cmd + "?" + argsCopy.Encode()
+	switch meth {
+	case "GET":
+		if req != nil {
+			log.Panicf("%s to %s with req", meth, cmd)
+		}
+		r, err = http.Get(cmd)
+	case "POST":
+		var body io.Reader
+		if req != nil {
+			b, err := json.Marshal(req)
+			if err != nil {
+				return err
+			}
+			body = bytes.NewBuffer(b)
+		}
+		r, err = http.Post(cmd, "text/json", body)
+	default:
+		log.Panicf("%s: invalid method %q", cmd, meth)
+		panic("invalid method: " + meth)
+	}
+	if err != nil {
+		return err
+	}
+	defer r.Body.Close()
+	if r.StatusCode != http.StatusOK {
+		return fmt.Errorf("bad http response: %v", r.Status)
+	}
+	body := new(bytes.Buffer)
+	if _, err := body.ReadFrom(r.Body); err != nil {
+		return err
+	}
+
+	// Read JSON-encoded Response into provided resp
+	// and return an error if present.
+	var result = struct {
+		Response interface{}
+		Error    string
+	}{
+		// Put the provided resp in here as it can be a pointer to
+		// some value we should unmarshal into.
+		Response: resp,
+	}
+	if err = json.Unmarshal(body.Bytes(), &result); err != nil {
+		log.Printf("json unmarshal %#q: %s\n", body.Bytes(), err)
+		return err
+	}
+	if result.Error != "" {
+		return errors.New(result.Error)
+	}
+
+	return nil
+}
+
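+// versionTgz returns a gzip-compressed tar archive containing a single
+// VERSION file with contents "devel <rev>". watchVM sends it to the
+// buildlet before the source tarball so make.bash can determine the Go
+// version without needing the git history.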
+func versionTgz(rev string) io.Reader {
+	var buf bytes.Buffer
+	zw := gzip.NewWriter(&buf)
+	tw := tar.NewWriter(zw)
+
+	contents := "devel " + rev
+	check(tw.WriteHeader(&tar.Header{
+		Name: "VERSION",
+		Mode: 0644,
+		Size: int64(len(contents)),
+	}))
+	_, err := io.WriteString(tw, contents)
+	check(err)
+	check(tw.Close())
+	check(zw.Close())
+	return bytes.NewReader(buf.Bytes())
+}
+
+// check is only for things which should be impossible (not even rare)
+// to fail.
+func check(err error) {
+	if err != nil {
+		panic("previously assumed to never fail: " + err.Error())
+	}
+}