- forward updated labels/annotations for downward API compat

- refactor queue.Pod construction to take functional options, privatize Pod fields - refactor DelayFIFO and HistoricalFIFO to offer consistent, more useful Pop() funcs - refactor pod update processing changes; long term we should somehow combine with the special pod config source that we are using for mirror pods - task launch timer cleanup
2015-11-29 19:34:27 +00:00 · 2015-11-29 19:34:27 +00:00 · af95e3fe0e
parent 5885728318
commit af95e3fe0e
14 changed files with 454 additions and 169 deletions
--- a/contrib/mesos/pkg/executor/executor.go
+++ b/contrib/mesos/pkg/executor/executor.go
@ -20,6 +20,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"math"
 	"strings"
 	"sync"
 	"sync/atomic"
@ -34,13 +35,12 @@ import (
 	"k8s.io/kubernetes/contrib/mesos/pkg/executor/messages"
 	"k8s.io/kubernetes/contrib/mesos/pkg/node"
 	"k8s.io/kubernetes/contrib/mesos/pkg/podutil"
+	"k8s.io/kubernetes/contrib/mesos/pkg/runtime"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/executorinfo"
 	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
 	"k8s.io/kubernetes/pkg/api"
-	unversionedapi "k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/client/cache"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
-	"k8s.io/kubernetes/pkg/controller/framework"
 	"k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/dockertools"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
@ -89,16 +89,12 @@ func (s *stateType) transitionTo(to stateType, unless ...stateType) bool {

 type kuberTask struct {
 	mesosTaskInfo *mesos.TaskInfo
-	podName       string // empty until pod is sent to kubelet and registed in KubernetesExecutor.pods
+	launchTimer   *time.Timer // launchTimer expires when the launch-task process duration exceeds launchGracePeriod
+	podName       string      // empty until pod is sent to kubelet and registed in KubernetesExecutor.pods
 }

 type podStatusFunc func() (*api.PodStatus, error)

-type NodeInfo struct {
-	Cores int
-	Mem   int64 // in bytes
-}
-
 // KubernetesExecutor is an mesos executor that runs pods
 // in a minion machine.
 type Executor struct {
@ -118,9 +114,9 @@ type Executor struct {
 	exitFunc             func(int)
 	podStatusFunc        func(*api.Pod) (*api.PodStatus, error)
 	staticPodsConfigPath string
-	podController        *framework.Controller
 	launchGracePeriod    time.Duration
 	nodeInfos            chan<- NodeInfo
+	initCompleted        chan struct{} // closes upon completion of Init()
 }

 type Config struct {
@ -144,6 +140,13 @@ func (k *Executor) isConnected() bool {

 // New creates a new kubernetes executor.
 func New(config Config) *Executor {
+	launchGracePeriod := config.LaunchGracePeriod
+	if launchGracePeriod == 0 {
+		// this is the equivalent of saying "the timer never expires" and simplies nil
+		// timer checks elsewhere in the code. it's a little hacky but less code to
+		// maintain that alternative approaches.
+		launchGracePeriod = time.Duration(math.MaxInt64)
+	}
 	k := &Executor{
 		updateChan:           config.Updates,
 		state:                disconnectedState,
@ -160,41 +163,22 @@ func New(config Config) *Executor {
 		exitFunc:             config.ExitFunc,
 		podStatusFunc:        config.PodStatusFunc,
 		staticPodsConfigPath: config.StaticPodsConfigPath,
-		launchGracePeriod:    config.LaunchGracePeriod,
+		launchGracePeriod:    launchGracePeriod,
 		nodeInfos:            config.NodeInfos,
+		initCompleted:        make(chan struct{}),
 	}
+	runtime.On(k.initCompleted, k.runSendLoop)

-	// watch pods from the given pod ListWatch
-	if config.PodLW == nil {
-		// fail early to make debugging easier
-		panic("cannot create executor with nil PodLW")
-	}
-	_, k.podController = framework.NewInformer(config.PodLW, &api.Pod{}, podRelistPeriod, &framework.ResourceEventHandlerFuncs{
-		AddFunc: func(obj interface{}) {
-			pod := obj.(*api.Pod)
-			log.V(4).Infof("pod %s/%s created on apiserver", pod.Namespace, pod.Name)
-			k.handleChangedApiserverPod(pod)
-		},
-		UpdateFunc: func(oldObj, newObj interface{}) {
-			pod := newObj.(*api.Pod)
-			log.V(4).Infof("pod %s/%s updated on apiserver", pod.Namespace, pod.Name)
-			k.handleChangedApiserverPod(pod)
-		},
-		DeleteFunc: func(obj interface{}) {
-			pod := obj.(*api.Pod)
-			log.V(4).Infof("pod %s/%s deleted on apiserver", pod.Namespace, pod.Name)
-		},
-	})
+	po := newPodObserver(config.PodLW, k.updateTask, k.terminate)
+	runtime.On(k.initCompleted, po.run)

 	return k
 }

 func (k *Executor) Init(driver bindings.ExecutorDriver) {
+	defer close(k.initCompleted)
 	k.killKubeletContainers()
 	k.resetSuicideWatch(driver)
-
-	go k.podController.Run(k.terminate)
-	go k.sendLoop()
 	//TODO(jdef) monitor kubeletFinished and shutdown if it happens
 }

@ -251,7 +235,7 @@ func (k *Executor) Registered(
 		}
 	}

-	annotations, err := executorInfoToAnnotations(executorInfo)
+	annotations, err := annotationsFor(executorInfo)
 	if err != nil {
 		log.Errorf(
 			"cannot get node annotations from executor info %v error %v",
@ -374,10 +358,10 @@ func (k *Executor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.Ta
 		return
 	}

+	taskId := taskInfo.GetTaskId().GetValue()
 	k.lock.Lock()
 	defer k.lock.Unlock()

-	taskId := taskInfo.GetTaskId().GetValue()
 	if _, found := k.tasks[taskId]; found {
 		log.Errorf("task already launched\n")
 		// Not to send back TASK_RUNNING here, because
@ -390,51 +374,13 @@ func (k *Executor) LaunchTask(driver bindings.ExecutorDriver, taskInfo *mesos.Ta
 	// (c) we're leaving podName == "" for now, indicates we don't need to delete containers
 	k.tasks[taskId] = &kuberTask{
 		mesosTaskInfo: taskInfo,
+		launchTimer:   time.NewTimer(k.launchGracePeriod),
 	}
 	k.resetSuicideWatch(driver)

 	go k.launchTask(driver, taskId, pod)
 }

-func (k *Executor) handleChangedApiserverPod(pod *api.Pod) {
-	// exclude "pre-scheduled" pods which have a NodeName set to this node without being scheduled already
-	taskId := pod.Annotations[meta.TaskIdKey]
-	if taskId == "" {
-		log.V(5).Infof("ignoring pod update for %s/%s because %s annotation is missing", pod.Namespace, pod.Name, meta.TaskIdKey)
-		return
-	}
-
-	k.lock.Lock()
-	defer k.lock.Unlock()
-
-	// exclude tasks which are already deleted from our task registry
-	task := k.tasks[taskId]
-	if task == nil {
-		log.Warningf("task %s for pod %s/%s not found", taskId, pod.Namespace, pod.Name)
-		return
-	}
-
-	oldPod := k.pods[task.podName]
-
-	// terminating pod?
-	if oldPod != nil && pod.Status.Phase == api.PodRunning {
-		timeModified := differentTime(oldPod.DeletionTimestamp, pod.DeletionTimestamp)
-		graceModified := differentPeriod(oldPod.DeletionGracePeriodSeconds, pod.DeletionGracePeriodSeconds)
-		if timeModified || graceModified {
-			log.Infof("pod %s/%s is terminating at %v with %vs grace period, telling kubelet", pod.Namespace, pod.Name, *pod.DeletionTimestamp, *pod.DeletionGracePeriodSeconds)
-
-			// modify the pod in our registry instead of sending the new pod. The later
-			// would allow that other changes bleed into the kubelet. For now we are
-			// very conservative changing this behaviour.
-			// TODO(sttts): check whether we can and should send all changes down to the kubelet
-			oldPod.DeletionTimestamp = pod.DeletionTimestamp
-			oldPod.DeletionGracePeriodSeconds = pod.DeletionGracePeriodSeconds
-
-			k.sendPodsSnapshot()
-		}
-	}
-}
-
 // determine whether we need to start a suicide countdown. if so, then start
 // a timer that, upon expiration, causes this executor to commit suicide.
 // this implementation runs asynchronously. callers that wish to wait for the
@ -619,17 +565,10 @@ func (k *Executor) launchTask(driver bindings.ExecutorDriver, taskId string, pod
 	psf := podStatusFunc(func() (*api.PodStatus, error) {
 		return k.podStatusFunc(pod)
 	})
-	go k._launchTask(driver, taskId, podFullName, psf)
+	go k._launchTask(driver, taskId, podFullName, psf, task.launchTimer.C)
 }

-func (k *Executor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc) {
-
-	expired := make(chan struct{})
-
-	if k.launchGracePeriod > 0 {
-		time.AfterFunc(k.launchGracePeriod, func() { close(expired) })
-	}
-
+func (k *Executor) _launchTask(driver bindings.ExecutorDriver, taskId, podFullName string, psf podStatusFunc, expired <-chan time.Time) {
 	getMarshalledInfo := func() (data []byte, cancel bool) {
 		// potentially long call..
 		if podStatus, err := psf(); err == nil && podStatus != nil {
@ -677,7 +616,8 @@ waitForRunningPod:
 			} else {
 				k.lock.Lock()
 				defer k.lock.Unlock()
-				if _, found := k.tasks[taskId]; !found {
+				task, found := k.tasks[taskId]
+				if !found {
 					goto reportLost
 				}

@ -689,6 +629,7 @@ waitForRunningPod:
 				}

 				k.sendStatus(driver, statusUpdate)
+				task.launchTimer.Stop()

 				// continue to monitor the health of the pod
 				go k.__launchTask(driver, taskId, podFullName, psf)
@ -946,7 +887,7 @@ func (k *Executor) sendFrameworkMessage(driver bindings.ExecutorDriver, msg stri
 	}
 }

-func (k *Executor) sendLoop() {
+func (k *Executor) runSendLoop() {
 	defer log.V(1).Info("sender loop exiting")
 	for {
 		select {
@ -982,53 +923,7 @@ func (k *Executor) sendLoop() {
 	}
 }

-func differentTime(a, b *unversionedapi.Time) bool {
-	return (a == nil) != (b == nil) || (a != nil && b != nil && *a != *b)
-}
-
-func differentPeriod(a, b *int64) bool {
-	return (a == nil) != (b == nil) || (a != nil && b != nil && *a != *b)
-}
-
-func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
-	var executorCPU, executorMem float64
-
-	// get executor resources
-	if ei != nil {
-		for _, r := range ei.GetResources() {
-			if r == nil || r.GetType() != mesos.Value_SCALAR {
-				continue
-			}
-			switch r.GetName() {
-			case "cpus":
-				executorCPU = r.GetScalar().GetValue()
-			case "mem":
-				executorMem = r.GetScalar().GetValue()
-			}
-		}
-	}
-
-	// get resource capacity of the node
-	ni := NodeInfo{}
-	for _, r := range si.GetResources() {
-		if r == nil || r.GetType() != mesos.Value_SCALAR {
-			continue
-		}
-
-		switch r.GetName() {
-		case "cpus":
-			// We intentionally take the floor of executorCPU because cores are integers
-			// and we would loose a complete cpu here if the value is <1.
-			// TODO(sttts): switch to float64 when "Machine Allocables" are implemented
-			ni.Cores = int(r.GetScalar().GetValue() - float64(int(executorCPU)))
-		case "mem":
-			ni.Mem = int64(r.GetScalar().GetValue()-executorMem) * 1024 * 1024
-		}
-	}
-	return ni
-}
-
-func executorInfoToAnnotations(ei *mesos.ExecutorInfo) (annotations map[string]string, err error) {
+func annotationsFor(ei *mesos.ExecutorInfo) (annotations map[string]string, err error) {
 	annotations = map[string]string{}
 	if ei == nil {
 		return
@ -1044,3 +939,36 @@ func executorInfoToAnnotations(ei *mesos.ExecutorInfo) (annotations map[string]s

 	return
 }
+
+// updateTask executes some mutating operation for the given task/pod, blocking until the update is either
+// attempted or discarded. uses the executor state lock to synchronize concurrent invocation. returns true
+// only if the specified update operation was attempted and also returns true. a result of true also indicates
+// changes have been sent upstream to the kubelet.
+func (k *Executor) updateTask(taskId string, f func(*kuberTask, *api.Pod) bool) (changed bool, err error) {
+	k.lock.Lock()
+	defer k.lock.Unlock()
+
+	// exclude tasks which are already deleted from our task registry
+	task := k.tasks[taskId]
+	if task == nil {
+		// the pod has completed the launch-task-binding phase because it's been annotated with
+		// the task-id, but we don't have a record of it; it's best to let the scheduler reconcile.
+		// it's also possible that our update queue is backed up and hasn't caught up with the state
+		// of the world yet.
+
+		// TODO(jdef) should we hint to the scheduler (via TASK_FAILED, reason=PodRefersToUnknownTask)?
+
+		err = fmt.Errorf("task %s not found", taskId)
+		return
+	}
+
+	oldPod := k.pods[task.podName]
+	changed = f(task, oldPod)
+
+	// TODO(jdef) this abstraction isn't perfect since changes that only impact the task struct,
+	// and not the pod, don't require a new pod snapshot sent back to the kubelet.
+	if changed {
+		k.sendPodsSnapshot()
+	}
+	return
+}
--- a/contrib/mesos/pkg/executor/executor_test.go
+++ b/contrib/mesos/pkg/executor/executor_test.go
@ -608,3 +608,77 @@ func TestExecutorsendFrameworkMessage(t *testing.T) {
 	}
 	mockDriver.AssertExpectations(t)
 }
+
+func TestExecutor_updateMetaMap(t *testing.T) {
+	for i, tc := range []struct {
+		oldmap map[string]string
+		newmap map[string]string
+		wants  bool
+	}{
+		{
+			oldmap: nil,
+			newmap: nil,
+			wants:  false,
+		},
+		{
+			oldmap: nil,
+			newmap: map[string]string{},
+			wants:  false,
+		},
+		{
+			oldmap: map[string]string{},
+			newmap: nil,
+			wants:  false,
+		},
+		{
+			oldmap: nil,
+			newmap: map[string]string{
+				"foo": "bar",
+			},
+			wants: true,
+		},
+		{
+			oldmap: map[string]string{},
+			newmap: map[string]string{
+				"foo": "bar",
+			},
+			wants: true,
+		},
+		{
+			oldmap: map[string]string{
+				"baz": "qax",
+			},
+			newmap: map[string]string{
+				"foo": "bar",
+			},
+			wants: true,
+		},
+		{
+			oldmap: map[string]string{
+				"baz": "qax",
+			},
+			newmap: nil,
+			wants:  true,
+		},
+		{
+			oldmap: map[string]string{
+				"baz": "qax",
+				"qwe": "iop",
+			},
+			newmap: map[string]string{
+				"foo": "bar",
+				"qwe": "iop",
+			},
+			wants: true,
+		},
+	} {
+		// do work here
+		actual := updateMetaMap(&tc.oldmap, tc.newmap)
+		if actual != tc.wants {
+			t.Fatalf("test case %d failed, expected %v but got %v instead", i, tc.wants, actual)
+		}
+		if len(tc.oldmap) != len(tc.newmap) || (len(tc.oldmap) > 0 && !reflect.DeepEqual(tc.oldmap, tc.newmap)) {
+			t.Fatalf("test case %d failed, expected %v but got %v instead", i, tc.newmap, tc.oldmap)
+		}
+	}
+}
--- a/contrib/mesos/pkg/executor/node.go
+++ b/contrib/mesos/pkg/executor/node.go
@ -0,0 +1,64 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package executor
+
+import (
+	mesos "github.com/mesos/mesos-go/mesosproto"
+)
+
+type NodeInfo struct {
+	Cores int
+	Mem   int64 // in bytes
+}
+
+func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
+	var executorCPU, executorMem float64
+
+	// get executor resources
+	if ei != nil {
+		for _, r := range ei.GetResources() {
+			if r == nil || r.GetType() != mesos.Value_SCALAR {
+				continue
+			}
+			switch r.GetName() {
+			case "cpus":
+				executorCPU = r.GetScalar().GetValue()
+			case "mem":
+				executorMem = r.GetScalar().GetValue()
+			}
+		}
+	}
+
+	// get resource capacity of the node
+	ni := NodeInfo{}
+	for _, r := range si.GetResources() {
+		if r == nil || r.GetType() != mesos.Value_SCALAR {
+			continue
+		}
+
+		switch r.GetName() {
+		case "cpus":
+			// We intentionally take the floor of executorCPU because cores are integers
+			// and we would loose a complete cpu here if the value is <1.
+			// TODO(sttts): switch to float64 when "Machine Allocables" are implemented
+			ni.Cores = int(r.GetScalar().GetValue() - float64(int(executorCPU)))
+		case "mem":
+			ni.Mem = int64(r.GetScalar().GetValue()-executorMem) * 1024 * 1024
+		}
+	}
+	return ni
+}
--- a/contrib/mesos/pkg/executor/observer.go
+++ b/contrib/mesos/pkg/executor/observer.go
@ -0,0 +1,171 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package executor
+
+import (
+	log "github.com/golang/glog"
+	"k8s.io/kubernetes/contrib/mesos/pkg/scheduler/meta"
+	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
+	"k8s.io/kubernetes/pkg/client/cache"
+	"k8s.io/kubernetes/pkg/controller/framework"
+)
+
+// taskUpdateTx execute a task update transaction f for the task identified by
+// taskId. if no such task exists then f is not invoked and an error is
+// returned. if f is invoked then taskUpdateTx returns the bool result of f.
+type taskUpdateTx func(taskId string, f func(*kuberTask, *api.Pod) bool) (changed bool, err error)
+
+// podObserver receives callbacks for every pod state change on the apiserver and
+// for each decides whether to execute a task update transaction.
+type podObserver struct {
+	podController *framework.Controller
+	terminate     <-chan struct{}
+	taskUpdateTx  taskUpdateTx
+}
+
+func newPodObserver(podLW cache.ListerWatcher, taskUpdateTx taskUpdateTx, terminate <-chan struct{}) *podObserver {
+	// watch pods from the given pod ListWatch
+	if podLW == nil {
+		// fail early to make debugging easier
+		panic("cannot create executor with nil PodLW")
+	}
+
+	p := &podObserver{
+		terminate:    terminate,
+		taskUpdateTx: taskUpdateTx,
+	}
+	_, p.podController = framework.NewInformer(podLW, &api.Pod{}, podRelistPeriod, &framework.ResourceEventHandlerFuncs{
+		AddFunc: func(obj interface{}) {
+			pod := obj.(*api.Pod)
+			log.V(4).Infof("pod %s/%s created on apiserver", pod.Namespace, pod.Name)
+			p.handleChangedApiserverPod(pod)
+		},
+		UpdateFunc: func(oldObj, newObj interface{}) {
+			pod := newObj.(*api.Pod)
+			log.V(4).Infof("pod %s/%s updated on apiserver", pod.Namespace, pod.Name)
+			p.handleChangedApiserverPod(pod)
+		},
+		DeleteFunc: func(obj interface{}) {
+			pod := obj.(*api.Pod)
+			log.V(4).Infof("pod %s/%s deleted on apiserver", pod.Namespace, pod.Name)
+		},
+	})
+	return p
+}
+
+// run begins observing pod state changes; blocks until the terminate chan closes.
+func (p *podObserver) run() {
+	p.podController.Run(p.terminate)
+}
+
+// handleChangedApiserverPod is invoked for pod add/update state changes and decides whether
+// task updates are necessary. if so, a task update is executed via taskUpdateTx.
+func (p *podObserver) handleChangedApiserverPod(pod *api.Pod) {
+	// Don't do anything for pods without task anotation which means:
+	// - "pre-scheduled" pods which have a NodeName set to this node without being scheduled already.
+	// - static/mirror pods: they'll never have a TaskID annotation, and we don't expect them to ever change.
+	// - all other pods that haven't passed through the launch-task-binding phase, which would set annotations.
+	taskId := pod.Annotations[meta.TaskIdKey]
+	if taskId == "" {
+		// There also could be a race between the overall launch-task process and this update, but here we
+		// will never be able to process such a stale update because the "update pod" that we're receiving
+		// in this func won't yet have a task ID annotation. It follows that we can safely drop such a stale
+		// update on the floor because we'll get another update later that, in addition to the changes that
+		// we're dropping now, will also include the changes from the binding process.
+		log.V(5).Infof("ignoring pod update for %s/%s because %s annotation is missing", pod.Namespace, pod.Name, meta.TaskIdKey)
+		return
+	}
+
+	_, err := p.taskUpdateTx(taskId, func(_ *kuberTask, relatedPod *api.Pod) (sendSnapshot bool) {
+		if relatedPod == nil {
+			// should never happen because:
+			// (a) the update pod record has already passed through the binding phase in launchTasks()
+			// (b) all remaining updates to executor.{pods,tasks} are sync'd in unison
+			log.Errorf("internal state error: pod not found for task %s", taskId)
+			return
+		}
+
+		// TODO(sttts): check whether we can and should send all "semantic" changes down to the kubelet
+		// see kubelet/config/config.go for semantic change detection
+
+		// check for updated labels/annotations: need to forward these for the downward API
+		sendSnapshot = sendSnapshot || updateMetaMap(&relatedPod.Labels, pod.Labels)
+		sendSnapshot = sendSnapshot || updateMetaMap(&relatedPod.Annotations, pod.Annotations)
+
+		// terminating pod?
+		if pod.Status.Phase == api.PodRunning {
+			timeModified := differentTime(relatedPod.DeletionTimestamp, pod.DeletionTimestamp)
+			graceModified := differentPeriod(relatedPod.DeletionGracePeriodSeconds, pod.DeletionGracePeriodSeconds)
+			if timeModified || graceModified {
+				log.Infof("pod %s/%s is terminating at %v with %vs grace period, telling kubelet",
+					pod.Namespace, pod.Name, *pod.DeletionTimestamp, *pod.DeletionGracePeriodSeconds)
+
+				// modify the pod in our registry instead of sending the new pod. The later
+				// would allow that other changes bleed into the kubelet. For now we are
+				// very conservative changing this behaviour.
+				relatedPod.DeletionTimestamp = pod.DeletionTimestamp
+				relatedPod.DeletionGracePeriodSeconds = pod.DeletionGracePeriodSeconds
+				sendSnapshot = true
+			}
+		}
+		return
+	})
+	if err != nil {
+		log.Errorf("failed to update pod %s/%s: %+v", pod.Namespace, pod.Name, err)
+	}
+}
+
+// updateMetaMap looks for differences between src and dest; if there are differences
+// then dest is changed (possibly to point to src) and this func returns true.
+func updateMetaMap(dest *map[string]string, src map[string]string) (changed bool) {
+	// check for things in dest that are missing in src
+	for k := range *dest {
+		if _, ok := src[k]; !ok {
+			changed = true
+			break
+		}
+	}
+	if !changed {
+		if len(*dest) == 0 {
+			if len(src) > 0 {
+				changed = true
+				goto finished
+			}
+			// no update needed
+			return
+		}
+		// check for things in src that are missing/different in dest
+		for k, v := range src {
+			if vv, ok := (*dest)[k]; !ok || vv != v {
+				changed = true
+				break
+			}
+		}
+	}
+finished:
+	*dest = src
+	return
+}
+
+func differentTime(a, b *unversioned.Time) bool {
+	return (a == nil) != (b == nil) || (a != nil && b != nil && *a != *b)
+}
+
+func differentPeriod(a, b *int64) bool {
+	return (a == nil) != (b == nil) || (a != nil && b != nil && *a != *b)
+}
--- a/contrib/mesos/pkg/node/registrator.go
+++ b/contrib/mesos/pkg/node/registrator.go
@ -78,7 +78,7 @@ func (r *clientRegistrator) Run(terminate <-chan struct{}) error {
 	loop := func() {
 	RegistrationLoop:
 		for {
-			obj := r.queue.CancelablePop(terminate)
+			obj := r.queue.Pop(terminate)
 			if obj == nil {
 				break RegistrationLoop
 			}
--- a/contrib/mesos/pkg/offers/offers.go
+++ b/contrib/mesos/pkg/offers/offers.go
@ -441,7 +441,7 @@ func (s *offerStorage) ageOffers() {
 }

 func (s *offerStorage) nextListener() *offerListener {
-	obj := s.listeners.Pop()
+	obj := s.listeners.Pop(nil)
 	if listen, ok := obj.(*offerListener); !ok {
 		//programming error
 		panic(fmt.Sprintf("unexpected listener object %v", obj))
--- a/contrib/mesos/pkg/queue/delay.go
+++ b/contrib/mesos/pkg/queue/delay.go
@ -302,12 +302,18 @@ func (f *DelayFIFO) Get(id string) (UniqueID, bool) {

 // Variant of DelayQueue.Pop() for UniqueDelayed items
 func (q *DelayFIFO) Await(timeout time.Duration) UniqueID {
-	cancel := make(chan struct{})
-	ch := make(chan interface{}, 1)
+	var (
+		cancel = make(chan struct{})
+		ch     = make(chan interface{}, 1)
+		t      = time.NewTimer(timeout)
+	)
+	defer t.Stop()
+
 	go func() { ch <- q.pop(cancel) }()
+
 	var x interface{}
 	select {
-	case <-time.After(timeout):
+	case <-t.C:
 		close(cancel)
 		x = <-ch
 	case x = <-ch:
@ -319,13 +325,19 @@ func (q *DelayFIFO) Await(timeout time.Duration) UniqueID {
 	return nil
 }

-// Variant of DelayQueue.Pop() for UniqueDelayed items
-func (q *DelayFIFO) Pop() UniqueID {
-	return q.pop(nil).(UniqueID)
+// Pop blocks until either there is an item available to dequeue or else the specified
+// cancel chan is closed. Callers that have no interest in providing a cancel chan
+// should specify nil, or else WithoutCancel() (for readability).
+func (q *DelayFIFO) Pop(cancel <-chan struct{}) UniqueID {
+	x := q.pop(cancel)
+	if x == nil {
+		return nil
+	}
+	return x.(UniqueID)
 }

 // variant of DelayQueue.Pop that implements optional cancellation
-func (q *DelayFIFO) pop(cancel chan struct{}) interface{} {
+func (q *DelayFIFO) pop(cancel <-chan struct{}) interface{} {
 	next := func() *qitem {
 		q.lock()
 		defer q.unlock()
--- a/contrib/mesos/pkg/queue/delay_test.go
+++ b/contrib/mesos/pkg/queue/delay_test.go
@ -358,7 +358,7 @@ func TestDFIFO_sanity_check(t *testing.T) {

 	// pop last
 	before := time.Now()
-	x := df.Pop()
+	x := df.Pop(nil)
 	assert.Equal(a.(*testjob).instance, 2)

 	now := time.Now()
@ -395,7 +395,7 @@ func TestDFIFO_Offer(t *testing.T) {
 	}

 	before := time.Now()
-	x := dq.Pop()
+	x := dq.Pop(nil)

 	now := time.Now()
 	waitPeriod := now.Sub(before)
--- a/contrib/mesos/pkg/queue/historical.go
+++ b/contrib/mesos/pkg/queue/historical.go
@ -220,22 +220,28 @@ func (f *HistoricalFIFO) Poll(id string, t EventType) bool {

 // Variant of DelayQueue.Pop() for UniqueDelayed items
 func (q *HistoricalFIFO) Await(timeout time.Duration) interface{} {
-	cancel := make(chan struct{})
-	ch := make(chan interface{}, 1)
-	go func() { ch <- q.CancelablePop(cancel) }()
+	var (
+		cancel = make(chan struct{})
+		ch     = make(chan interface{}, 1)
+		t      = time.NewTimer(timeout)
+	)
+	defer t.Stop()
+
+	go func() { ch <- q.Pop(cancel) }()
+
 	select {
-	case <-time.After(timeout):
+	case <-t.C:
 		close(cancel)
 		return <-ch
 	case x := <-ch:
 		return x
 	}
 }
-func (f *HistoricalFIFO) Pop() interface{} {
-	return f.CancelablePop(nil)
-}

-func (f *HistoricalFIFO) CancelablePop(cancel <-chan struct{}) interface{} {
+// Pop blocks until either there is an item available to dequeue or else the specified
+// cancel chan is closed. Callers that have no interest in providing a cancel chan
+// should specify nil, or else WithoutCancel() (for readability).
+func (f *HistoricalFIFO) Pop(cancel <-chan struct{}) interface{} {
 	popEvent := (Entry)(nil)
 	defer func() {
 		f.carrier(popEvent)
--- a/contrib/mesos/pkg/queue/historical_test.go
+++ b/contrib/mesos/pkg/queue/historical_test.go
@ -75,7 +75,7 @@ func TestFIFO_basic(t *testing.T) {
 	lastInt := _int(0)
 	lastUint := _uint(0)
 	for i := 0; i < amount*2; i++ {
-		switch obj := f.Pop().(type) {
+		switch obj := f.Pop(nil).(type) {
 		case _int:
 			if obj <= lastInt {
 				t.Errorf("got %v (int) out of order, last was %v", obj, lastInt)
@ -100,7 +100,7 @@ func TestFIFO_addUpdate(t *testing.T) {
 	got := make(chan *testObj, 2)
 	go func() {
 		for {
-			got <- f.Pop().(*testObj)
+			got <- f.Pop(nil).(*testObj)
 		}
 	}()

@ -126,7 +126,7 @@ func TestFIFO_addReplace(t *testing.T) {
 	got := make(chan *testObj, 2)
 	go func() {
 		for {
-			got <- f.Pop().(*testObj)
+			got <- f.Pop(nil).(*testObj)
 		}
 	}()

@ -158,24 +158,24 @@ func TestFIFO_detectLineJumpers(t *testing.T) {
 	done := make(chan struct{})
 	go func() {
 		defer close(done)
-		if e, a := 13, f.Pop().(*testObj).value; a != e {
+		if e, a := 13, f.Pop(nil).(*testObj).value; a != e {
 			err = fmt.Errorf("expected %d, got %d", e, a)
 			return
 		}

 		f.Add(&testObj{"foo", 14}) // ensure foo doesn't jump back in line

-		if e, a := 1, f.Pop().(*testObj).value; a != e {
+		if e, a := 1, f.Pop(nil).(*testObj).value; a != e {
 			err = fmt.Errorf("expected %d, got %d", e, a)
 			return
 		}

-		if e, a := 30, f.Pop().(*testObj).value; a != e {
+		if e, a := 30, f.Pop(nil).(*testObj).value; a != e {
 			err = fmt.Errorf("expected %d, got %d", e, a)
 			return
 		}

-		if e, a := 14, f.Pop().(*testObj).value; a != e {
+		if e, a := 14, f.Pop(nil).(*testObj).value; a != e {
 			err = fmt.Errorf("expected %d, got %d", e, a)
 			return
 		}
--- a/contrib/mesos/pkg/queue/interface.go
+++ b/contrib/mesos/pkg/queue/interface.go
@ -59,7 +59,7 @@ type FIFO interface {
 	// ready, they are returned in the order in which they were added/updated.
 	// The item is removed from the queue (and the store) before it is returned,
 	// so if you don't successfully process it, you need to add it back with Add().
-	Pop() interface{}
+	Pop(cancel <-chan struct{}) interface{}

 	// Await attempts to Pop within the given interval; upon success the non-nil
 	// item is returned, otherwise nil
@ -101,3 +101,5 @@ type UniqueDeadlined interface {
 	UniqueID
 	Deadlined
 }
+
+func WithoutCancel() <-chan struct{} { return nil }
--- a/contrib/mesos/pkg/scheduler/components/errorhandler/errorhandler.go
+++ b/contrib/mesos/pkg/scheduler/components/errorhandler/errorhandler.go
@ -89,7 +89,7 @@ func (k *errorHandler) Error(pod *api.Pod, schedulingErr error) {
 		}
 		delay := k.backoff.Get(podKey)
 		log.V(3).Infof("requeuing pod %v with delay %v", podKey, delay)
-		k.qr.Requeue(&queuer.Pod{Pod: pod, Delay: &delay, Notify: breakoutEarly})
+		k.qr.Requeue(queuer.NewPod(pod, queuer.Delay(delay), queuer.Notify(breakoutEarly)))

 	default:
 		log.V(2).Infof("Task is no longer pending, aborting reschedule for pod %v", podKey)
--- a/contrib/mesos/pkg/scheduler/components/podreconciler/podreconciler.go
+++ b/contrib/mesos/pkg/scheduler/components/podreconciler/podreconciler.go
@ -104,7 +104,7 @@ func (s *podReconciler) Reconcile(t *podtask.T) {

 			now := time.Now()
 			log.V(3).Infof("reoffering pod %v", podKey)
-			s.qr.Reoffer(queuer.NewPodWithDeadline(pod, &now))
+			s.qr.Reoffer(queuer.NewPod(pod, queuer.Deadline(now)))
 		} else {
 			// pod is scheduled.
 			// not sure how this happened behind our backs. attempt to reconstruct
--- a/contrib/mesos/pkg/scheduler/queuer/pod.go
+++ b/contrib/mesos/pkg/scheduler/queuer/pod.go
@ -25,16 +25,44 @@ import (
 	"k8s.io/kubernetes/pkg/client/cache"
 )

+// functional Pod option
+type PodOpt func(*Pod)
+
 // wrapper for the k8s pod type so that we can define additional methods on a "pod"
 type Pod struct {
 	*api.Pod
 	deadline *time.Time
-	Delay    *time.Duration
-	Notify   queue.BreakChan
+	delay    *time.Duration
+	notify   queue.BreakChan
 }

-func NewPodWithDeadline(pod *api.Pod, deadline *time.Time) *Pod {
-	return &Pod{Pod: pod, deadline: deadline}
+func NewPod(pod *api.Pod, opt ...PodOpt) *Pod {
+	p := &Pod{Pod: pod}
+	for _, f := range opt {
+		f(p)
+	}
+	return p
+}
+
+// Deadline sets the deadline for a Pod
+func Deadline(deadline time.Time) PodOpt {
+	return func(pod *Pod) {
+		pod.deadline = &deadline
+	}
+}
+
+// Delay sets the delay for a Pod
+func Delay(delay time.Duration) PodOpt {
+	return func(pod *Pod) {
+		pod.delay = &delay
+	}
+}
+
+// Notify sets the breakout notification channel for a Pod
+func Notify(notify queue.BreakChan) PodOpt {
+	return func(pod *Pod) {
+		pod.notify = notify
+	}
 }

 // implements Copyable
@ -65,14 +93,14 @@ func (dp *Pod) Deadline() (time.Time, bool) {
 }

 func (dp *Pod) GetDelay() time.Duration {
-	if dp.Delay != nil {
-		return *(dp.Delay)
+	if dp.delay != nil {
+		return *(dp.delay)
 	}
 	return 0
 }

 func (p *Pod) Breaker() queue.BreakChan {
-	return p.Notify
+	return p.notify
 }

 func (p *Pod) String() string {