Make slave assignment before binding persistent

- move assigned slave to T.Spec.AssignedSlave
- only create the BindingHost annotation in prepareTaskForLaunch
- recover the assigned slave from the annotation and write it back to the T.Spec field

Before this patch the annotation was used to store the assigned slave. But due
to the cloning of tasks in the registry, this value was never persisted there.

This patch adds it to the Spec of a task and only creates the annotation
at the last minute, right before launching (sketched below).

Without this patch, pods which fail before binding stay in the registry but
are never rescheduled. The reason: the BindingHost annotation exists neither
in the registry nor on the apiserver (compare the reconcilePod function).
Dr. Stefan Schimanski 2015-08-11 17:53:45 +02:00
parent 5a9b36b703
commit f1a560718c
4 changed files with 87 additions and 72 deletions
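
The mechanism, reduced to a minimal, self-contained Go sketch (the types, the annotation key and the method names below are illustrative stand-ins, not the scheduler's real API; compare FillFromDetails, prepareTaskForLaunch and RecoverFrom in the diff):

package main

import "fmt"

// bindingHostKey stands in for annotation.BindingHostKey.
const bindingHostKey = "binding-host"

type pod struct {
	Annotations map[string]string
}

type spec struct {
	SlaveID       string
	AssignedSlave string // persisted together with the task in the registry
}

type task struct {
	Spec spec
	Pod  pod
}

// fillFromOffer records the slave hostname on the task itself when an offer
// is accepted, so it survives cloning of the task in the registry.
func (t *task) fillFromOffer(hostname string) {
	t.Spec.AssignedSlave = hostname
}

// prepareForLaunch copies the assigned slave into the BindingHost annotation
// only right before the task is launched.
func (t *task) prepareForLaunch() {
	if t.Pod.Annotations == nil {
		t.Pod.Annotations = map[string]string{}
	}
	t.Pod.Annotations[bindingHostKey] = t.Spec.AssignedSlave
}

// recoverAssignedSlave reads the annotation back into the task spec when a
// task is reconstructed from an apiserver pod.
func (t *task) recoverAssignedSlave(p *pod) {
	t.Spec.AssignedSlave = p.Annotations[bindingHostKey]
}

func main() {
	t := &task{}
	t.fillFromOffer("slave-1.example.org")
	t.prepareForLaunch()
	fmt.Println(t.Pod.Annotations[bindingHostKey]) // slave-1.example.org
}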

View File

@@ -198,8 +198,12 @@ func (b *binder) prepareTaskForLaunch(ctx api.Context, machine string, task *pod
oemCt := pod.Spec.Containers
pod.Spec.Containers = append([]api.Container{}, oemCt...) // (shallow) clone before mod
annotateForExecutorOnSlave(&pod, machine)
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
}
task.SaveRecoveryInfo(pod.Annotations)
pod.Annotations[annotation.BindingHostKey] = task.Spec.AssignedSlave
for _, entry := range task.Spec.PortMap {
oemPorts := pod.Spec.Containers[entry.ContainerIdx].Ports
@@ -233,29 +237,13 @@ type kubeScheduler struct {
defaultContainerMemLimit mresource.MegaBytes
}
// annotatedForExecutor checks whether a pod is assigned to a Mesos slave, and
// possibly already launched. It can, but doesn't have to be scheduled already
// in the sense of kubernetes, i.e. the NodeName field might still be empty.
func annotatedForExecutor(pod *api.Pod) bool {
_, ok := pod.ObjectMeta.Annotations[annotation.BindingHostKey]
return ok
}
// annotateForExecutorOnSlave sets the BindingHostKey annotation which
// marks the pod to be processed by the scheduler and launched as a Mesos
// task. The executor on the slave will do the final binding to finish the
// scheduling in the kubernetes sense.
func annotateForExecutorOnSlave(pod *api.Pod, slave string) {
if pod.Annotations == nil {
pod.Annotations = make(map[string]string)
} else {
oemAnn := pod.Annotations
pod.Annotations = make(map[string]string)
for k, v := range oemAnn {
pod.Annotations[k] = v
}
}
pod.Annotations[annotation.BindingHostKey] = slave
// recoverAssignedSlave recovers the assigned Mesos slave from a pod by searching
// the BindingHostKey. For tasks in the registry of the scheduler, the same
// value is stored in T.Spec.AssignedSlave. Before launching, the BindingHostKey
// annotation is added and the executor will eventually persist that to the
// apiserver on binding.
func recoverAssignedSlave(pod *api.Pod) string {
return pod.Annotations[annotation.BindingHostKey]
}
// Schedule implements the Scheduler interface of Kubernetes.
@@ -462,7 +450,7 @@ func (q *queuer) Run(done <-chan struct{}) {
}
pod := p.(*Pod)
if annotatedForExecutor(pod.Pod) {
if recoverAssignedSlave(pod.Pod) != "" {
log.V(3).Infof("dequeuing pod for scheduling: %v", pod.Pod.Name)
q.dequeue(pod.GetUID())
} else {
@@ -511,7 +499,7 @@ func (q *queuer) yield() *api.Pod {
log.Warningf("yield unable to understand pod object %+v, will skip: %v", pod, err)
} else if !q.podUpdates.Poll(podName, queue.POP_EVENT) {
log.V(1).Infof("yield popped a transitioning pod, skipping: %+v", pod)
} else if annotatedForExecutor(pod) {
} else if recoverAssignedSlave(pod) != "" {
// should never happen if enqueuePods is filtering properly
log.Warningf("yield popped an already-scheduled pod, skipping: %+v", pod)
} else {
@@ -801,25 +789,27 @@ func (s *schedulingPlugin) scheduleOne() {
// host="..." | host="..." ; perhaps no updates to process?
//
// TODO(jdef) this needs an integration test
func (s *schedulingPlugin) reconcilePod(oldPod api.Pod) {
log.V(1).Infof("reconcile pod %v", oldPod.Name)
ctx := api.WithNamespace(api.NewDefaultContext(), oldPod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(oldPod.Name)
func (s *schedulingPlugin) reconcileTask(t *podtask.T) {
log.V(1).Infof("reconcile pod %v, assigned to slave %q", t.Pod.Name, t.Spec.AssignedSlave)
ctx := api.WithNamespace(api.NewDefaultContext(), t.Pod.Namespace)
pod, err := s.client.Pods(api.NamespaceValue(ctx)).Get(t.Pod.Name)
if err != nil {
if errors.IsNotFound(err) {
// attempt to delete
if err = s.deleter.deleteOne(&Pod{Pod: &oldPod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", oldPod.Name, err)
if err = s.deleter.deleteOne(&Pod{Pod: &t.Pod}); err != nil && err != noSuchPodErr && err != noSuchTaskErr {
log.Errorf("failed to delete pod: %v: %v", t.Pod.Name, err)
}
} else {
//TODO(jdef) other errors should probably trigger a retry (w/ backoff).
//For now, drop the pod on the floor
log.Warning("aborting reconciliation for pod %v: %v", oldPod.Name, err)
log.Warning("aborting reconciliation for pod %v: %v", t.Pod.Name, err)
}
return
}
if oldPod.Spec.NodeName != pod.Spec.NodeName {
if annotatedForExecutor(pod) {
log.Infof("pod %v scheduled on %q according to apiserver", pod.Name, pod.Spec.NodeName)
if t.Spec.AssignedSlave != pod.Spec.NodeName {
if pod.Spec.NodeName == "" {
// pod is unscheduled.
// it's possible that we dropped the pod in the scheduler error handler
// because of task misalignment with the pod (task.Has(podtask.Launched) == true)

View File

@@ -199,7 +199,7 @@ func NewTestPod() (*api.Pod, int) {
TypeMeta: api.TypeMeta{APIVersion: testapi.Version()},
ObjectMeta: api.ObjectMeta{
Name: name,
Namespace: "default",
Namespace: api.NamespaceDefault,
SelfLink: fmt.Sprintf("http://1.2.3.4/api/v1beta1/pods/%s", name),
},
Spec: api.PodSpec{
@@ -418,7 +418,7 @@ func TestPlugin_LifeCycle(t *testing.T) {
c.Recorder = eventObserver
// create plugin
p := NewPlugin(c)
p := NewPlugin(c).(*schedulingPlugin)
assert.NotNil(p)
// run plugin
@@ -514,11 +514,8 @@ func TestPlugin_LifeCycle(t *testing.T) {
t.Fatalf("timed out waiting for launchTasks call")
}
// define generic pod startup
startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// notify watchers of new pod
podListWatch.Add(pod, true)
// Launch a pod and wait until the scheduler driver is called
schedulePodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// wait for failedScheduling event because there is no offer
assert.EventWithReason(eventObserver, "failedScheduling", "failedScheduling event not received")
@@ -531,8 +528,6 @@ func TestPlugin_LifeCycle(t *testing.T) {
// wait for driver.launchTasks call
select {
case launchedTask := <-launchedTasks:
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
for _, offer := range offers {
if offer.Id.GetValue() == launchedTask.offerId.GetValue() {
return pod, &launchedTask, offer
@@ -540,12 +535,30 @@ func TestPlugin_LifeCycle(t *testing.T) {
}
t.Fatalf("unknown offer used to start a pod")
return nil, nil, nil
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for launchTasks")
return nil, nil, nil
}
}
// Launch a pod and wait until the scheduler driver is called
launchPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
podListWatch.Add(pod, true)
return schedulePodWithOffers(pod, offers)
}
// Launch a pod, wait until the scheduler driver is called and report back that it is running
startPodWithOffers := func(pod *api.Pod, offers []*mesos.Offer) (*api.Pod, *LaunchedTask, *mesos.Offer) {
// notify about pod, offer resources and wait for scheduling
pod, launchedTask, offer := launchPodWithOffers(pod, offers)
if pod != nil {
// report back status
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_STAGING))
testScheduler.StatusUpdate(mockDriver, newTaskStatusForTask(launchedTask.taskInfo, mesos.TaskState_TASK_RUNNING))
return pod, launchedTask, offer
}
return nil, nil, nil
}
startTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
pod, i := NewTestPod()
@@ -610,31 +623,42 @@ func TestPlugin_LifeCycle(t *testing.T) {
// wait until pod is looked up at the apiserver
assertext.EventuallyTrue(t, time.Second, func() bool {
return testApiServer.Stats(pod.Name) == beforePodLookups+1
}, "expect that reconcilePod will access apiserver for pod %v", pod.Name)
}, "expect that reconcileTask will access apiserver for pod %v", pod.Name)
}
launchTestPod := func() (*api.Pod, *LaunchedTask, *mesos.Offer) {
pod, i := NewTestPod()
offers := []*mesos.Offer{NewTestOffer(fmt.Sprintf("offer%d", i))}
return launchPodWithOffers(pod, offers)
}
// 1. with pod deleted from the apiserver
pod, launchedTask, _ = startTestPod()
// expected: pod is removed from internal task registry
pod, launchedTask, _ = launchTestPod()
podListWatch.Delete(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask.taskInfo)
podKey, _ := podtask.MakePodKey(api.NewDefaultContext(), pod.Name)
assertext.EventuallyTrue(t, time.Second, func() bool {
t, _ := p.api.tasks().ForPod(podKey)
return t == nil
})
// 2. with pod still on the apiserver, not bound
pod, launchedTask, _ = startTestPod()
// expected: pod is rescheduled
pod, launchedTask, _ = launchTestPod()
failPodFromExecutor(launchedTask.taskInfo)
// 3. with pod still on the apiserver, bound i.e. host!=""
pod, launchedTask, usedOffer = startTestPod()
pod.Annotations = map[string]string{
meta.BindingHostKey: *usedOffer.Hostname,
}
podListWatch.Modify(pod, false) // not notifying the watchers
failPodFromExecutor(launchedTask.taskInfo)
// 4. with pod still on the apiserver, bound i.e. host!="", notified via ListWatch
retryOffers := []*mesos.Offer{NewTestOffer("retry-offer")}
schedulePodWithOffers(pod, retryOffers)
// 3. with pod still on the apiserver, bound, notified via ListWatch
// expected: nothing, pod updates not supported, compare ReconcileTask function
pod, launchedTask, usedOffer = startTestPod()
pod.Annotations = map[string]string{
meta.BindingHostKey: *usedOffer.Hostname,
}
pod.Spec.NodeName = *usedOffer.Hostname
podListWatch.Modify(pod, true) // notifying the watchers
time.Sleep(time.Second / 2)
failPodFromExecutor(launchedTask.taskInfo)

View File

@@ -71,12 +71,13 @@ type T struct {
}
type Spec struct {
SlaveID string
CPU mresource.CPUShares
Memory mresource.MegaBytes
PortMap []HostPortMapping
Ports []uint64
Data []byte
SlaveID string
AssignedSlave string
CPU mresource.CPUShares
Memory mresource.MegaBytes
PortMap []HostPortMapping
Ports []uint64
Data []byte
}
// mostly-clone this pod task. the clone will actually share the some fields:
@@ -161,9 +162,10 @@ func (t *T) FillFromDetails(details *mesos.Offer) error {
log.V(3).Infof("Recording offer(s) %s/%s against pod %v: cpu: %.2f, mem: %.2f MB", details.Id, t.Pod.Namespace, t.Pod.Name, cpu, mem)
t.Spec = Spec{
SlaveID: details.GetSlaveId().GetValue(),
CPU: cpu,
Memory: mem,
SlaveID: details.GetSlaveId().GetValue(),
AssignedSlave: *details.Hostname,
CPU: cpu,
Memory: mem,
}
// fill in port mapping
@@ -346,8 +348,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
bindTime: now,
}
var (
offerId string
hostname string
offerId string
)
for _, k := range []string{
annotation.BindingHostKey,
@@ -362,7 +363,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
}
switch k {
case annotation.BindingHostKey:
hostname = v
t.Spec.AssignedSlave = v
case annotation.SlaveIdKey:
t.Spec.SlaveID = v
case annotation.OfferIdKey:
@@ -375,7 +376,7 @@ func RecoverFrom(pod api.Pod) (*T, bool, error) {
t.executor = &mesos.ExecutorInfo{ExecutorId: mutil.NewExecutorID(v)}
}
}
t.Offer = offers.Expired(offerId, hostname, 0)
t.Offer = offers.Expired(offerId, t.Spec.AssignedSlave, 0)
t.Flags[Launched] = struct{}{}
t.Flags[Bound] = struct{}{}
return t, true, nil

View File

@@ -102,7 +102,7 @@ func (self *slaveStorage) getSlave(slaveId string) (*Slave, bool) {
type PluginInterface interface {
// the apiserver may have a different state for the pod than we do
// so reconcile our records, but only for this one pod
reconcilePod(api.Pod)
reconcileTask(*podtask.T)
// execute the Scheduling plugin, should start a go routine and return immediately
Run(<-chan struct{})
@@ -432,7 +432,7 @@ func (k *KubernetesScheduler) StatusUpdate(driver bindings.SchedulerDriver, task
case mesos.TaskState_TASK_FAILED:
if task, _ := k.taskRegistry.UpdateStatus(taskStatus); task != nil {
if task.Has(podtask.Launched) && !task.Has(podtask.Bound) {
go k.plugin.reconcilePod(task.Pod)
go k.plugin.reconcileTask(task)
return
}
} else {
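
For orientation, a rough, self-contained sketch of the decision the new reconcileTask path takes for tasks that failed after launch but before binding (simplified, with illustrative names; the real code goes through the apiserver client, the deleter and the scheduler's error handler):

package main

import "fmt"

// apiserverPod stands in for the pod state fetched from the apiserver.
type apiserverPod struct {
	NodeName string
}

// reconcileDecision mirrors the branching sketched in reconcileTask: the pod
// may already be gone, may still be unbound (NodeName == ""), or may have
// been bound by the executor in the meantime.
func reconcileDecision(assignedSlave string, p *apiserverPod, found bool) string {
	switch {
	case !found:
		return "pod gone from apiserver: delete the task from the registry"
	case p.NodeName == "":
		return "pod failed before binding: requeue it for scheduling"
	case assignedSlave != p.NodeName:
		return "apiserver reports a different host: trust the apiserver state"
	default:
		return "pod already bound to the assigned slave: nothing to do"
	}
}

func main() {
	fmt.Println(reconcileDecision("slave-1", &apiserverPod{}, true))
	// Output: pod failed before binding: requeue it for scheduling
}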