k3s/pkg/controller/replication_controller.go

312 lines
9.8 KiB
Go
Raw Normal View History

2014-06-06 23:40:48 +00:00
/*
Copyright 2014 Google Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
2014-06-06 23:40:48 +00:00
import (
"encoding/json"
"fmt"
"sort"
"sync"
2014-06-06 23:40:48 +00:00
"time"
2014-06-12 20:17:34 +00:00
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/errors"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/validation"
2014-06-06 23:40:48 +00:00
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
2015-03-15 21:51:41 +00:00
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
2014-06-23 00:02:48 +00:00
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
2014-06-06 23:40:48 +00:00
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
"github.com/golang/glog"
2014-06-06 23:40:48 +00:00
)
// ReplicationManager is responsible for synchronizing ReplicationController objects stored
// in the system with actual running pods.
2014-06-06 23:40:48 +00:00
type ReplicationManager struct {
kubeClient client.Interface
2014-06-09 05:38:45 +00:00
podControl PodControlInterface
2014-06-18 00:56:18 +00:00
syncTime <-chan time.Time
// To allow injection of syncReplicationController for testing.
syncHandler func(controller api.ReplicationController) error
2014-06-06 23:40:48 +00:00
}
2014-07-10 11:47:10 +00:00
// PodControlInterface is an interface that knows how to add or delete pods
2014-06-06 23:40:48 +00:00
// created as an interface to allow testing.
2014-06-09 05:38:45 +00:00
type PodControlInterface interface {
2014-07-10 11:47:10 +00:00
// createReplica creates new replicated pods according to the spec.
createReplica(namespace string, controller api.ReplicationController)
2014-07-10 11:47:10 +00:00
// deletePod deletes the pod identified by podID.
2014-10-21 21:14:35 +00:00
deletePod(namespace string, podID string) error
2014-06-06 23:40:48 +00:00
}
2014-07-10 11:47:10 +00:00
// RealPodControl is the default implementation of PodControllerInterface.
2014-06-09 05:38:45 +00:00
type RealPodControl struct {
kubeClient client.Interface
recorder record.EventRecorder
2014-06-06 23:40:48 +00:00
}
const (
	// DefaultSyncPeriod is the time period of the main replication controller
	// sync loop.
	DefaultSyncPeriod = 5 * time.Second

	// CreatedByAnnotation is the annotation key under which createReplica
	// stores the JSON-serialized reference to the controller that created a
	// pod.
	CreatedByAnnotation = "kubernetes.io/created-by"
)
func (r RealPodControl) createReplica(namespace string, controller api.ReplicationController) {
desiredLabels := make(labels.Set)
for k, v := range controller.Spec.Template.Labels {
desiredLabels[k] = v
2014-06-06 23:40:48 +00:00
}
desiredAnnotations := make(labels.Set)
for k, v := range controller.Spec.Template.Annotations {
desiredAnnotations[k] = v
}
createdByRef, err := api.GetReference(&controller)
if err != nil {
util.HandleError(fmt.Errorf("unable to get controller reference: %v", err))
return
}
createdByRefJson, err := json.Marshal(createdByRef)
if err != nil {
util.HandleError(fmt.Errorf("unable to serialize controller reference: %v", err))
return
}
desiredAnnotations[CreatedByAnnotation] = string(createdByRefJson)
// use the dash (if the name isn't too long) to make the pod name a bit prettier
prefix := fmt.Sprintf("%s-", controller.Name)
if ok, _ := validation.ValidatePodName(prefix, true); !ok {
prefix = controller.Name
}
2014-09-08 01:31:11 +00:00
pod := &api.Pod{
ObjectMeta: api.ObjectMeta{
Labels: desiredLabels,
Annotations: desiredAnnotations,
GenerateName: prefix,
},
}
2014-11-13 15:52:13 +00:00
if err := api.Scheme.Convert(&controller.Spec.Template.Spec, &pod.Spec); err != nil {
util.HandleError(fmt.Errorf("unable to convert pod template: %v", err))
return
2014-06-06 23:40:48 +00:00
}
2014-11-06 22:53:28 +00:00
if labels.Set(pod.Labels).AsSelector().Empty() {
util.HandleError(fmt.Errorf("unable to create pod replica, no labels"))
2014-11-06 22:53:28 +00:00
return
2014-06-06 23:40:48 +00:00
}
2014-10-21 21:14:35 +00:00
if _, err := r.kubeClient.Pods(namespace).Create(pod); err != nil {
r.recorder.Eventf(&controller, "failedCreate", "Error creating: %v", err)
util.HandleError(fmt.Errorf("unable to create pod replica: %v", err))
2014-06-06 23:40:48 +00:00
}
}
2014-10-21 21:14:35 +00:00
func (r RealPodControl) deletePod(namespace, podID string) error {
return r.kubeClient.Pods(namespace).Delete(podID)
2014-06-06 23:40:48 +00:00
}
// NewReplicationManager creates a new ReplicationManager.
func NewReplicationManager(kubeClient client.Interface) *ReplicationManager {
eventBroadcaster := record.NewBroadcaster()
eventBroadcaster.StartRecordingToSink(kubeClient.Events(""))
rm := &ReplicationManager{
2014-06-06 23:40:48 +00:00
kubeClient: kubeClient,
2014-06-09 05:38:45 +00:00
podControl: RealPodControl{
2014-06-06 23:40:48 +00:00
kubeClient: kubeClient,
recorder: eventBroadcaster.NewRecorder(api.EventSource{Component: "replication-controller"}),
2014-06-06 23:40:48 +00:00
},
}
rm.syncHandler = rm.syncReplicationController
return rm
2014-06-06 23:40:48 +00:00
}
2014-07-10 11:47:10 +00:00
// Run begins watching and syncing.
2014-06-17 23:42:29 +00:00
func (rm *ReplicationManager) Run(period time.Duration) {
2014-06-18 00:56:18 +00:00
rm.syncTime = time.Tick(period)
resourceVersion := ""
go util.Forever(func() { rm.watchControllers(&resourceVersion) }, period)
2014-06-17 23:42:29 +00:00
}
// resourceVersion is a pointer to the resource version to use/update.
func (rm *ReplicationManager) watchControllers(resourceVersion *string) {
2014-10-21 21:14:35 +00:00
watching, err := rm.kubeClient.ReplicationControllers(api.NamespaceAll).Watch(
2014-08-05 23:20:50 +00:00
labels.Everything(),
2015-03-15 21:51:41 +00:00
fields.Everything(),
*resourceVersion,
2014-08-05 23:20:50 +00:00
)
if err != nil {
util.HandleError(fmt.Errorf("unable to watch: %v", err))
time.Sleep(5 * time.Second)
return
}
2014-06-14 01:11:32 +00:00
2014-06-06 23:40:48 +00:00
for {
2014-06-18 00:56:18 +00:00
select {
case <-rm.syncTime:
rm.synchronize()
case event, open := <-watching.ResultChan():
if !open {
// watchChannel has been closed, or something else went
// wrong with our watch call. Let the util.Forever()
// that called us call us again.
2014-06-18 00:56:18 +00:00
return
}
if event.Type == watch.Error {
util.HandleError(fmt.Errorf("error from watch during sync: %v", errors.FromObject(event.Object)))
// Clear the resource version, this may cause us to skip some elements on the watch,
// but we'll catch them on the synchronize() call, so it works out.
*resourceVersion = ""
continue
}
glog.V(4).Infof("Got watch: %#v", event)
rc, ok := event.Object.(*api.ReplicationController)
if !ok {
if status, ok := event.Object.(*api.Status); ok {
if status.Status == api.StatusFailure {
glog.Errorf("Failed to watch: %v", status)
// Clear resource version here, as above, this won't hurt consistency, but we
// should consider introspecting more carefully here. (or make the apiserver smarter)
// "why not both?"
*resourceVersion = ""
continue
}
}
util.HandleError(fmt.Errorf("unexpected object: %#v", event.Object))
continue
2014-06-18 00:56:18 +00:00
}
// If we get disconnected, start where we left off.
*resourceVersion = rc.ResourceVersion
// Sync even if this is a deletion event, to ensure that we leave
// it in the desired state.
glog.V(4).Infof("About to sync from watch: %q", rc.Name)
2014-10-27 17:04:39 +00:00
if err := rm.syncHandler(*rc); err != nil {
util.HandleError(fmt.Errorf("unexpected sync error: %v", err))
2014-10-27 17:04:39 +00:00
}
}
2014-06-06 23:40:48 +00:00
}
}
// filterActivePods returns pods that have not terminated.
func filterActivePods(pods []api.Pod) []*api.Pod {
var result []*api.Pod
for i := range pods {
if api.PodSucceeded != pods[i].Status.Phase &&
api.PodFailed != pods[i].Status.Phase {
result = append(result, &pods[i])
2014-06-06 23:40:48 +00:00
}
}
return result
}
// activePods is a list of pods that can be sorted with the sort package; its
// Less method puts pods in earlier stages of their lifecycle first.
type activePods []*api.Pod

// Len returns the number of pods in the list.
func (s activePods) Len() int {
	return len(s)
}

// Swap exchanges the pods at indices i and j.
func (s activePods) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
// Less reports whether the pod at index i sorts before the pod at index j.
// Criteria are applied in order, and the first one that distinguishes the two
// pods decides:
//
//  1. Unassigned < assigned (empty Spec.Host means unassigned).
//  2. PodPending < PodUnknown < PodRunning.
//  3. Not ready < ready.
//
// Pods that tie on every criterion are considered equal (Less returns false).
func (s activePods) Less(i, j int) bool {
	// Unassigned < assigned. Decide here whenever exactly one of the two pods
	// is unassigned, in either direction. Checking only "i unassigned, j
	// assigned" would let the reverse case fall through to the later criteria,
	// where Less(i, j) and Less(j, i) could both come out true.
	iAssigned, jAssigned := s[i].Spec.Host != "", s[j].Spec.Host != ""
	if iAssigned != jAssigned {
		return !iAssigned
	}
	// PodPending < PodUnknown < PodRunning
	if ri, rj := activePodPhaseRank(s[i].Status.Phase), activePodPhaseRank(s[j].Status.Phase); ri != rj {
		return ri < rj
	}
	// Not ready < ready
	return !api.IsPodReady(s[i]) && api.IsPodReady(s[j])
}

// activePodPhaseRank maps a pod phase to its position in the deletion
// ordering used by activePods.Less. Any phase other than PodUnknown and
// PodRunning ranks with PodPending.
func activePodPhaseRank(phase api.PodPhase) int {
	switch phase {
	case api.PodUnknown:
		return 1
	case api.PodRunning:
		return 2
	default:
		return 0
	}
}
func (rm *ReplicationManager) syncReplicationController(controller api.ReplicationController) error {
s := labels.Set(controller.Spec.Selector).AsSelector()
2015-04-20 18:53:06 +00:00
podList, err := rm.kubeClient.Pods(controller.Namespace).List(s, fields.Everything())
2014-06-06 23:40:48 +00:00
if err != nil {
return err
}
filteredList := filterActivePods(podList.Items)
numActivePods := len(filteredList)
diff := numActivePods - controller.Spec.Replicas
2014-06-06 23:40:48 +00:00
if diff < 0 {
diff *= -1
wait := sync.WaitGroup{}
wait.Add(diff)
glog.V(2).Infof("Too few %q replicas, creating %d", controller.Name, diff)
2014-06-06 23:40:48 +00:00
for i := 0; i < diff; i++ {
go func() {
defer wait.Done()
rm.podControl.createReplica(controller.Namespace, controller)
}()
2014-06-06 23:40:48 +00:00
}
wait.Wait()
2014-06-06 23:40:48 +00:00
} else if diff > 0 {
glog.V(2).Infof("Too many %q replicas, deleting %d", controller.Name, diff)
// Sort the pods in the order such that not-ready < ready, unscheduled
// < scheduled, and pending < running. This ensures that we delete pods
// in the earlier stages whenever possible.
sort.Sort(activePods(filteredList))
wait := sync.WaitGroup{}
wait.Add(diff)
2014-06-06 23:40:48 +00:00
for i := 0; i < diff; i++ {
go func(ix int) {
defer wait.Done()
rm.podControl.deletePod(controller.Namespace, filteredList[ix].Name)
}(i)
2014-06-06 23:40:48 +00:00
}
wait.Wait()
2014-06-06 23:40:48 +00:00
}
if controller.Status.Replicas != numActivePods {
controller.Status.Replicas = numActivePods
_, err = rm.kubeClient.ReplicationControllers(controller.Namespace).Update(&controller)
if err != nil {
return err
}
}
2014-06-06 23:40:48 +00:00
return nil
}
2014-06-17 23:42:29 +00:00
func (rm *ReplicationManager) synchronize() {
// TODO: remove this method completely and rely on the watch.
// Add resource version tracking to watch to make this work.
var controllers []api.ReplicationController
2014-10-21 21:14:35 +00:00
list, err := rm.kubeClient.ReplicationControllers(api.NamespaceAll).List(labels.Everything())
2014-06-17 23:42:29 +00:00
if err != nil {
util.HandleError(fmt.Errorf("synchronization error: %v", err))
return
2014-06-17 23:42:29 +00:00
}
controllers = list.Items
wg := sync.WaitGroup{}
wg.Add(len(controllers))
for ix := range controllers {
go func(ix int) {
defer wg.Done()
glog.V(4).Infof("periodic sync of %v", controllers[ix].Name)
err := rm.syncHandler(controllers[ix])
if err != nil {
util.HandleError(fmt.Errorf("error synchronizing: %v", err))
}
}(ix)
2014-06-06 23:40:48 +00:00
}
wg.Wait()
2014-06-06 23:40:48 +00:00
}