diff --git a/dashboard/coordinator/Makefile b/dashboard/coordinator/Makefile
index ec1d88b5..7bbcaf0f 100644
--- a/dashboard/coordinator/Makefile
+++ b/dashboard/coordinator/Makefile
@@ -1,6 +1,9 @@
 coordinator: main.go
-	GOOS=linux go build -o coordinator .
+	GOOS=linux go build --tags=build_coordinator -o coordinator .
 
+# After "make upload", either reboot the machine, or ssh to it and:
+# sudo systemctl restart gobuild.service
+# And watch its logs with:
+# sudo journalctl -f -u gobuild.service
 upload: coordinator
 	cat coordinator | (cd buildongce && go run create.go --write_object=go-builder-data/coordinator)
-
diff --git a/dashboard/coordinator/main.go b/dashboard/coordinator/main.go
index 3820668a..5a91d647 100644
--- a/dashboard/coordinator/main.go
+++ b/dashboard/coordinator/main.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build build_coordinator
+
 // The coordinator runs on GCE and coordinates builds in Docker containers.
 package main // import "golang.org/x/tools/dashboard/coordinator"
 
@@ -20,15 +22,23 @@ import (
 	"os"
 	"os/exec"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
+
+	"golang.org/x/oauth2"
+	"golang.org/x/oauth2/google"
+	"google.golang.org/api/compute/v1"
+	"google.golang.org/cloud/compute/metadata"
 )
 
 var (
 	masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.")
 	maxBuilds     = flag.Int("maxbuilds", 6, "Max concurrent builds")
 
+	cleanZones = flag.String("zones", "us-central1-a,us-central1-b,us-central1-f", "Comma-separated list of zones to periodically clean of stale build VMs (ones that failed to shut themselves down)")
+
 	// Debug flags:
 	addTemp = flag.Bool("temp", false, "Append -temp to all builders.")
 	just    = flag.String("just", "", "If non-empty, run single build in the foreground. Requires rev.")
@@ -131,6 +141,7 @@ func main() {
 
 	go http.ListenAndServe(":80", nil)
 	go cleanUpOldContainers()
+	go cleanUpOldVMs()
 
 	for _, watcher := range watchers {
 		if err := startWatching(watchers[watcher.repo]); err != nil {
@@ -581,3 +592,103 @@ func oldContainers() []string {
 	out, _ := exec.Command("docker", "ps", "-a", "--filter=status=exited", "--no-trunc", "-q").Output()
 	return strings.Fields(string(out))
 }
+
+// cleanUpOldVMs loops forever and periodically enumerates virtual
+// machines and deletes those which have expired.
+//
+// A VM is considered expired if it has a "delete-at" metadata
+// attribute having a unix timestamp before the current time.
+//
+// This is the safety mechanism to delete VMs which stray from the
+// normal deleting process. VMs are created to run a single build and
+// should be shut down by a controlling process. Due to various types
+// of failures, they might get stranded. To prevent them from getting
+// stranded and wasting resources forever, we instead set the
+// "delete-at" metadata attribute on them when created to some time
+// that's well beyond their expected lifetime.
+func cleanUpOldVMs() {
+	if !hasComputeScope() {
+		log.Printf("The coordinator is not running with access to read and write Compute resources. Background VM cleaning disabled.")
+		return
+	}
+	ts := google.ComputeTokenSource("default")
+	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
+	for {
+		for _, zone := range strings.Split(*cleanZones, ",") {
+			zone = strings.TrimSpace(zone)
+			if err := cleanZoneVMs(computeService, zone); err != nil {
+				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
+			}
+		}
+		time.Sleep(time.Minute)
+	}
+}
+
+// cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
+func cleanZoneVMs(svc *compute.Service, zone string) error {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		return fmt.Errorf("failed to get current GCE ProjectID: %v", err)
+	}
+	// Fetch the first 500 (default) running instances and clean
+	// those. We expect that we'll be running many fewer than
+	// that. Even if we have more, eventually the first 500 will
+	// either end or be cleaned, and then the next call will get a
+	// partially-different 500.
+	// TODO(bradfitz): revisit this code if we ever start running
+	// thousands of VMs.
+	list, err := svc.Instances.List(proj, zone).Do()
+	if err != nil {
+		return fmt.Errorf("listing instances: %v", err)
+	}
+	for _, inst := range list.Items {
+		if inst.Metadata == nil {
+			// Defensive. Not seen in practice.
+			continue
+		}
+		for _, it := range inst.Metadata.Items {
+			if it.Key == "delete-at" {
+				unixDeadline, err := strconv.ParseInt(it.Value, 10, 64)
+				if err != nil {
+					log.Printf("invalid delete-at value %q seen; ignoring", it.Value)
+				}
+				if err == nil && time.Now().Unix() > unixDeadline {
+					log.Printf("Deleting expired VM %q in zone %q ...", inst.Name, zone)
+					deleteVM(svc, zone, inst.Name)
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func deleteVM(svc *compute.Service, zone, instName string) {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		log.Printf("failed to get project id to delete instance: %v", err)
+		return
+	}
+	op, err := svc.Instances.Delete(proj, zone, instName).Do()
+	if err != nil {
+		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
+		return
+	}
+	log.Printf("Sent request to delete instance %q in zone %q. Operation ID == %v", instName, zone, op.Id)
+}
+
+func hasComputeScope() bool {
+	if !metadata.OnGCE() {
+		return false
+	}
+	scopes, err := metadata.Scopes("default")
+	if err != nil {
+		log.Printf("failed to query metadata default scopes: %v", err)
+		return false
+	}
+	for _, v := range scopes {
+		if v == compute.DevstorageFull_controlScope {
+			return true
+		}
+	}
+	return false
+}
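
Note: the cleaner above only reaps VMs whose creator stamped them with a "delete-at" unix timestamp. The sketch below is illustrative and not part of this change; the package name, helper name, and 45-minute deadline are assumptions. It shows how a VM-creation path might attach that attribute with the same compute/v1 client so the background sweep can reap the instance if it is ever stranded.

// A sketch only, not part of this change: stamping the "delete-at"
// attribute that cleanZoneVMs later checks. Names and the 45-minute
// deadline are assumptions.
package sketch

import (
	"strconv"
	"time"

	"google.golang.org/api/compute/v1"
)

// vmDeleteTimeout is an assumed upper bound on one build's lifetime.
const vmDeleteTimeout = 45 * time.Minute

// instanceWithDeleteAt returns an instance definition whose "delete-at"
// metadata is a unix timestamp well past the expected build duration, so
// the coordinator's background sweep can reap the VM if it gets stranded.
func instanceWithDeleteAt(name, machineType string) *compute.Instance {
	deleteAt := strconv.FormatInt(time.Now().Add(vmDeleteTimeout).Unix(), 10)
	return &compute.Instance{
		Name:        name,
		MachineType: machineType,
		Metadata: &compute.Metadata{
			Items: []*compute.MetadataItems{
				// Written as a decimal string; cleanZoneVMs parses it with
				// strconv.ParseInt. Value is a plain string in the
				// compute/v1 vintage this diff builds against.
				{Key: "delete-at", Value: deleteAt},
			},
		},
	}
}

A real Instances.Insert call would also need disks and network interfaces; they are omitted here to keep the focus on the metadata attribute.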
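
deleteVM fires the delete request, logs the returned operation ID, and relies on the next minute's sweep to retry if the instance survives. If a caller wanted to block until the delete completes, one option is to poll the zone operation by name. This is another illustrative sketch, not part of this change; the package name, helper name, and poll interval are assumptions.

// A sketch only, not part of this change: waiting for a zone operation
// (such as the one returned by Instances.Delete) to finish.
package sketch

import (
	"fmt"
	"time"

	"google.golang.org/api/compute/v1"
)

// waitForZoneOp polls a zone operation until it reports DONE or the
// timeout passes.
func waitForZoneOp(svc *compute.Service, proj, zone, opName string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		op, err := svc.ZoneOperations.Get(proj, zone, opName).Do()
		if err != nil {
			return fmt.Errorf("polling operation %q: %v", opName, err)
		}
		if op.Status == "DONE" {
			if op.Error != nil && len(op.Error.Errors) > 0 {
				return fmt.Errorf("operation %q failed: %v", opName, op.Error.Errors[0].Message)
			}
			return nil
		}
		time.Sleep(2 * time.Second)
	}
	return fmt.Errorf("timed out waiting for operation %q", opName)
}

A caller would pass op.Name from the Operation returned by Instances.Delete.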