dashboard/coordinator: periodically clean stale VMs
This isn't used yet, but will be for the new-style builders (VMs on GCE running the buildlet, started by the coordinator).

From the code's comments: cleanUpOldVMs periodically enumerates virtual machines and deletes any which have a "delete-at" metadata attribute with a unix timestamp before the current time. These VMs are created to run a single build and should be shut down by a controlling process. Due to various types of failures, they might get stranded. To prevent them from getting stranded and wasting resources forever, the "delete-at" attribute is set on them at creation to a time well beyond their expected lifetime; this cleaner is the backup mechanism that deletes them if they get away.

Update golang/go#8639
Update golang/go#8640
Update golang/go#8642

Change-Id: I489e97926e7ab56487571c2bf0bd255cdf49570d
Reviewed-on: https://go-review.googlesource.com/2199
Reviewed-by: Burcu Dogan <jbd@google.com>
This commit is contained in:
parent faf0ad1ad3
commit 005d2be0ba
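The mechanism described above has two halves: the VM-creation side stamps each build VM with a "delete-at" unix timestamp, and the cleaner added in this diff reaps anything whose timestamp has passed. The creation side is not part of this change; what follows is only a minimal sketch of what it might look like, using the same google.golang.org/api/compute/v1 client as the coordinator (whose MetadataItems.Value is a plain string, as the diff's ParseInt call shows). The function name, machine type, 45-minute deadline, and the omitted disk/network configuration are illustrative assumptions, not code from this commit.

package main

import (
	"fmt"
	"time"

	"google.golang.org/api/compute/v1"
)

// createBuildVM is a hypothetical sketch of the controlling process'
// side of the contract: create a single-build VM and set "delete-at"
// to a unix timestamp well beyond the expected build time, so the
// coordinator's cleaner can reap the VM if it gets stranded.
func createBuildVM(svc *compute.Service, proj, zone, name string) error {
	deleteAt := fmt.Sprint(time.Now().Add(45 * time.Minute).Unix()) // illustrative deadline
	inst := &compute.Instance{
		Name:        name,
		MachineType: "zones/" + zone + "/machineTypes/n1-standard-2", // illustrative
		Metadata: &compute.Metadata{
			Items: []*compute.MetadataItems{
				{Key: "delete-at", Value: deleteAt},
			},
		},
		// Boot disk and network interfaces omitted for brevity; a real
		// instance needs both before Insert will succeed.
	}
	_, err := svc.Instances.Insert(proj, zone, inst).Do()
	return err
}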
dashboard/coordinator/Makefile
@@ -1,6 +1,9 @@
 coordinator: main.go
-	GOOS=linux go build -o coordinator .
+	GOOS=linux go build --tags=build_coordinator -o coordinator .
 
+# After "make upload", either reboot the machine, or ssh to it and:
+#   sudo systemctl restart gobuild.service
+# And watch its logs with:
+#   sudo journalctl -f -u gobuild.service
 upload: coordinator
 	cat coordinator | (cd buildongce && go run create.go --write_object=go-builder-data/coordinator)
-
dashboard/coordinator/main.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build build_coordinator
+
 // The coordinator runs on GCE and coordinates builds in Docker containers.
 package main // import "golang.org/x/tools/dashboard/coordinator"
 
@@ -20,15 +22,23 @@ import (
 	"os"
 	"os/exec"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
+
+	"golang.org/x/oauth2"
+	"golang.org/x/oauth2/google"
+	"google.golang.org/api/compute/v1"
+	"google.golang.org/cloud/compute/metadata"
 )
 
 var (
 	masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.")
 	maxBuilds     = flag.Int("maxbuilds", 6, "Max concurrent builds")
 
+	cleanZones = flag.String("zones", "us-central1-a,us-central1-b,us-central1-f", "Comma-separated list of zones to periodically clean of stale build VMs (ones that failed to shut themselves down)")
+
 	// Debug flags:
 	addTemp = flag.Bool("temp", false, "Append -temp to all builders.")
 	just    = flag.String("just", "", "If non-empty, run single build in the foreground. Requires rev.")
@@ -131,6 +141,7 @@ func main() {
 	go http.ListenAndServe(":80", nil)
 
 	go cleanUpOldContainers()
+	go cleanUpOldVMs()
 
 	for _, watcher := range watchers {
 		if err := startWatching(watchers[watcher.repo]); err != nil {
@@ -581,3 +592,103 @@ func oldContainers() []string {
 	out, _ := exec.Command("docker", "ps", "-a", "--filter=status=exited", "--no-trunc", "-q").Output()
 	return strings.Fields(string(out))
 }
+
+// cleanUpOldVMs loops forever and periodically enumerates virtual
+// machines and deletes those which have expired.
+//
+// A VM is considered expired if it has a "delete-at" metadata
+// attribute having a unix timestamp before the current time.
+//
+// This is the safety mechanism to delete VMs which stray from the
+// normal deleting process. VMs are created to run a single build and
+// should be shut down by a controlling process. Due to various types
+// of failures, they might get stranded. To prevent them from getting
+// stranded and wasting resources forever, we instead set the
+// "delete-at" metadata attribute on them when created to some time
+// that's well beyond their expected lifetime.
+func cleanUpOldVMs() {
+	if !hasComputeScope() {
+		log.Printf("The coordinator is not running with access to read and write Compute resources. Background VM cleaning disabled.")
+		return
+	}
+	ts := google.ComputeTokenSource("default")
+	computeService, _ := compute.New(oauth2.NewClient(oauth2.NoContext, ts))
+	for {
+		for _, zone := range strings.Split(*cleanZones, ",") {
+			zone = strings.TrimSpace(zone)
+			if err := cleanZoneVMs(computeService, zone); err != nil {
+				log.Printf("Error cleaning VMs in zone %q: %v", zone, err)
+			}
+		}
+		time.Sleep(time.Minute)
+	}
+}
+
+// cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone.
+func cleanZoneVMs(svc *compute.Service, zone string) error {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		return fmt.Errorf("failed to get current GCE ProjectID: %v", err)
+	}
+	// Fetch the first 500 (default) running instances and clean
+	// those. We expect that we'll be running many fewer than
+	// that. Even if we have more, eventually the first 500 will
+	// either end or be cleaned, and then the next call will get a
+	// partially-different 500.
+	// TODO(bradfitz): revisit this code if we ever start running
+	// thousands of VMs.
+	list, err := svc.Instances.List(proj, zone).Do()
+	if err != nil {
+		return fmt.Errorf("listing instances: %v", err)
+	}
+	for _, inst := range list.Items {
+		if inst.Metadata == nil {
+			// Defensive. Not seen in practice.
+			continue
+		}
+		for _, it := range inst.Metadata.Items {
+			if it.Key == "delete-at" {
+				unixDeadline, err := strconv.ParseInt(it.Value, 10, 64)
+				if err != nil {
+					log.Printf("invalid delete-at value %q seen; ignoring", it.Value)
+				}
+				if err == nil && time.Now().Unix() > unixDeadline {
+					log.Printf("Deleting expired VM %q in zone %q ...", inst.Name, zone)
+					deleteVM(svc, zone, inst.Name)
+				}
+			}
+		}
+	}
+	return nil
+}
+
+func deleteVM(svc *compute.Service, zone, instName string) {
+	proj, err := metadata.ProjectID()
+	if err != nil {
+		log.Printf("failed to get project id to delete instance: %v", err)
+		return
+	}
+	op, err := svc.Instances.Delete(proj, zone, instName).Do()
+	if err != nil {
+		log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err)
+		return
+	}
+	log.Printf("Sent request to delete instance %q in zone %q. Operation ID == %v", instName, zone, op.Id)
+}
+
+func hasComputeScope() bool {
+	if !metadata.OnGCE() {
+		return false
+	}
+	scopes, err := metadata.Scopes("default")
+	if err != nil {
+		log.Printf("failed to query metadata default scopes: %v", err)
+		return false
+	}
+	for _, v := range scopes {
+		if v == compute.ComputeScope {
+			return true
+		}
+	}
+	return false
+}
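For context, the delete-at path above is only the backup: in normal operation the controlling process that started the VM deletes it itself as soon as its single build finishes. Below is a minimal sketch of that happy path, assuming the same compute client and imports as main.go above; buildOnVM and runBuild are hypothetical placeholders, not functions from this commit.

// buildOnVM is a hypothetical sketch: run one build on instName, then
// delete the VM. If this cleanup never runs (coordinator crash, lost
// VM, etc.), the "delete-at" cleaner above reaps the instance instead.
func buildOnVM(svc *compute.Service, proj, zone, instName string) error {
	defer func() {
		// Best-effort deletion, mirroring deleteVM in the diff above.
		if _, err := svc.Instances.Delete(proj, zone, instName).Do(); err != nil {
			log.Printf("failed to delete build VM %q: %v", instName, err)
		}
	}()
	return runBuild(instName)
}

// runBuild is a placeholder for driving the buildlet on the VM.
func runBuild(instName string) error { return nil }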