Fix several issues so that syncPods does not run until the network is configured.

Also fix unit tests and compilation.
Dawn Chen 2015-06-22 23:07:40 -07:00
parent 192ffdfb25
commit 23200d303f
7 changed files with 59 additions and 27 deletions
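The heart of the change is a gate: a background loop maintains a mutex-protected "network configured" flag, and the kubelet skips pod synchronization until the flag is set. A minimal, self-contained sketch of that pattern follows; the worker type and its fields are simplified stand-ins, not the kubelet's actual structs.

    // A gate in the spirit of this commit: a background loop maintains a
    // mutex-protected "network configured" flag, and pod sync is skipped
    // until the flag is set. Simplified stand-in types, not kubelet source.
    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    type worker struct {
        mu                sync.Mutex
        networkConfigured bool
    }

    // syncNetworkStatus stands in for reconcileCBR0 and friends; here it
    // simply marks the network as ready.
    func (w *worker) syncNetworkStatus() {
        w.mu.Lock()
        defer w.mu.Unlock()
        w.networkConfigured = true
    }

    func (w *worker) doneNetworkConfigure() bool {
        w.mu.Lock()
        defer w.mu.Unlock()
        return w.networkConfigured
    }

    func (w *worker) syncLoopIteration() {
        if !w.doneNetworkConfigure() {
            fmt.Println("Skipping pod synchronization, network is not configured")
            time.Sleep(5 * time.Second)
            return
        }
        fmt.Println("synchronizing pods")
    }

    func main() {
        w := &worker{}
        // The kubelet drives this with util.Until every 30 seconds.
        go func() {
            for {
                w.syncNetworkStatus()
                time.Sleep(30 * time.Second)
            }
        }()
        for i := 0; i < 3; i++ {
            w.syncLoopIteration()
            time.Sleep(time.Second)
        }
    }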


@@ -2,6 +2,5 @@ DOCKER_OPTS=""
{% if grains.docker_opts is defined and grains.docker_opts %}
DOCKER_OPTS="${DOCKER_OPTS} {{grains.docker_opts}}"
{% endif %}
DOCKER_OPTS="${DOCKER_OPTS} --bridge=cbr0 --iptables=false --ip-masq=false"
DOCKER_NOFILE=1000000


@@ -354,6 +354,7 @@ func (ks *KubeletExecutorServer) createAndInitKubelet(
kc.DockerDaemonContainer,
kc.SystemContainer,
kc.ConfigureCBR0,
kc.PodCIDR,
kc.MaxPods,
kc.DockerExecHandler,
)


@@ -23,6 +23,7 @@ import (
"os/exec"
"regexp"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/golang/glog"
)
@@ -43,10 +44,16 @@ func createCBR0(wantCIDR *net.IPNet) error {
return err
}
// restart docker
if err := exec.Command("service", "docker", "restart").Run(); err != nil {
glog.Error(err)
// For now just log the error. The containerRuntime check will catch docker failures.
// TODO (dawnchen) figure out what we should do for rkt here.
// For now just log the error. The containerRuntime check will catch docker failures.
// TODO (dawnchen) figure out what we should do for rkt here.
if util.UsingSystemdInitSystem() {
if err := exec.Command("systemctl", "restart", "docker").Run(); err != nil {
glog.Error(err)
}
} else {
if err := exec.Command("service", "docker", "restart").Run(); err != nil {
glog.Error(err)
}
}
glog.V(2).Info("Recreated cbr0 and restarted docker")
return nil
@@ -60,7 +67,8 @@ func ensureCbr0(wantCIDR *net.IPNet) error {
if !exists {
glog.V(2).Infof("CBR0 doesn't exist, attempting to create it with range: %s", wantCIDR)
return createCBR0(wantCIDR)
} else if !cbr0CidrCorrect(wantCIDR) {
}
if !cbr0CidrCorrect(wantCIDR) {
glog.V(2).Infof("Attempting to recreate cbr0 with address range: %s", wantCIDR)
// delete cbr0
@@ -78,8 +86,7 @@ func ensureCbr0(wantCIDR *net.IPNet) error {
}
func cbr0Exists() (bool, error) {
_, err := os.Stat("/sys/class/net/cbr0")
if err != nil {
if _, err := os.Stat("/sys/class/net/cbr0"); err != nil {
if os.IsNotExist(err) {
return false, nil
}
@@ -103,6 +110,7 @@ func cbr0CidrCorrect(wantCIDR *net.IPNet) bool {
return false
}
cbr0CIDR.IP = cbr0IP
glog.V(5).Infof("Want cbr0 CIDR: %s, have cbr0 CIDR: %s", wantCIDR, cbr0CIDR)
return wantCIDR.IP.Equal(cbr0IP) && bytes.Equal(wantCIDR.Mask, cbr0CIDR.Mask)
}
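For comparison, the existence and CIDR checks in this file can be sketched against the standard library alone. net.InterfaceByName and Interface.Addrs are real stdlib calls; the bridge name and CIDR below are sample values, and the kubelet itself obtains cbr0's address by other means.

    // Standalone sketch: does the bridge exist, and does it carry the wanted
    // IP and mask? Uses only the standard library; the bridge name and CIDR
    // below are sample values.
    package main

    import (
        "bytes"
        "fmt"
        "net"
        "os"
    )

    // bridgeExists mirrors cbr0Exists: every Linux network interface has an
    // entry under /sys/class/net.
    func bridgeExists(name string) (bool, error) {
        if _, err := os.Stat("/sys/class/net/" + name); err != nil {
            if os.IsNotExist(err) {
                return false, nil
            }
            return false, err
        }
        return true, nil
    }

    // bridgeHasCIDR performs the same IP-and-mask comparison as
    // cbr0CidrCorrect, but reads the address via net.InterfaceByName.
    func bridgeHasCIDR(name string, want *net.IPNet) (bool, error) {
        iface, err := net.InterfaceByName(name)
        if err != nil {
            return false, err
        }
        addrs, err := iface.Addrs()
        if err != nil {
            return false, err
        }
        for _, a := range addrs {
            if ipnet, ok := a.(*net.IPNet); ok &&
                want.IP.Equal(ipnet.IP) && bytes.Equal(want.Mask, ipnet.Mask) {
                return true, nil
            }
        }
        return false, nil
    }

    func main() {
        ip, parsed, err := net.ParseCIDR("10.244.1.1/24")
        if err != nil {
            panic(err)
        }
        want := &net.IPNet{IP: ip, Mask: parsed.Mask}
        exists, err := bridgeExists("cbr0")
        fmt.Println("exists:", exists, err)
        if exists {
            ok, err := bridgeHasCIDR("cbr0", want)
            fmt.Println("cidr correct:", ok, err)
        }
    }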


@@ -321,8 +321,8 @@ func NewMainKubelet(
klet.containerManager = containerManager
// Start syncing node status immediately; this may set up things the runtime needs to run.
go util.Until(klet.syncNetworkStatus, 30*time.Second, util.NeverStop)
go klet.syncNodeStatus()
go klet.syncNetworkStatus()
// Wait for the runtime to be up with a timeout.
if err := waitUntilRuntimeIsUp(klet.containerRuntime, maxWaitForContainerRuntime); err != nil {
@@ -419,7 +419,8 @@ type Kubelet struct {
lastTimestampRuntimeUp time.Time
// Network Status information
networkConfigured bool
networkConfigMutex sync.Mutex
networkConfigured bool
// Volume plugins.
volumePluginMgr volume.VolumePluginMgr
@@ -717,6 +718,7 @@ func (kl *Kubelet) Run(updates <-chan PodUpdate) {
}
go util.Until(kl.updateRuntimeUp, 5*time.Second, util.NeverStop)
// Run the system oom watcher forever.
kl.statusManager.Start()
kl.syncLoop(updates, kl)
@@ -1714,9 +1716,10 @@ func (kl *Kubelet) syncLoopIteration(updates <-chan PodUpdate, handler SyncHandl
glog.Infof("Skipping pod synchronization, container runtime is not up.")
return
}
if !kl.networkConfigured {
if !kl.doneNetworkConfigure() {
time.Sleep(5 * time.Second)
glog.Infof("Skipping pod synchronization, network is not configured")
return
}
unsyncedPod := false
podSyncTypes := make(map[types.UID]SyncPodType)
@@ -1871,6 +1874,7 @@ func (kl *Kubelet) reconcileCBR0(podCIDR string) error {
glog.V(5).Info("PodCIDR not set. Will not configure cbr0.")
return nil
}
glog.V(5).Infof("PodCIDR is set to %q", podCIDR)
_, cidr, err := net.ParseCIDR(podCIDR)
if err != nil {
return err
@@ -1906,19 +1910,19 @@ func (kl *Kubelet) recordNodeStatusEvent(event string) {
var oldNodeUnschedulable bool
func (kl *Kubelet) syncNetworkStatus() {
for {
networkConfigured := true
if kl.configureCBR0 {
if len(kl.podCIDR) == 0 {
networkConfigured = false
} else if err := kl.reconcileCBR0(kl.podCIDR); err != nil {
networkConfigured = false
glog.Errorf("Error configuring cbr0: %v", err)
}
kl.networkConfigMutex.Lock()
defer kl.networkConfigMutex.Unlock()
networkConfigured := true
if kl.configureCBR0 {
if len(kl.podCIDR) == 0 {
networkConfigured = false
} else if err := kl.reconcileCBR0(kl.podCIDR); err != nil {
networkConfigured = false
glog.Errorf("Error configuring cbr0: %v", err)
}
kl.networkConfigured = networkConfigured
time.Sleep(30 * time.Second)
}
kl.networkConfigured = networkConfigured
}
// setNodeStatus fills in the Status fields of the given Node, overwriting
@@ -1997,11 +2001,13 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error {
// Check whether container runtime can be reported as up.
containerRuntimeUp := kl.containerRuntimeUp()
// Check whether network is configured properly
networkConfigured := kl.doneNetworkConfigure()
currentTime := util.Now()
var newNodeReadyCondition api.NodeCondition
var oldNodeReadyConditionStatus api.ConditionStatus
if containerRuntimeUp && kl.networkConfigured {
if containerRuntimeUp && networkConfigured {
newNodeReadyCondition = api.NodeCondition{
Type: api.NodeReady,
Status: api.ConditionTrue,
@@ -2013,7 +2019,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error {
if !containerRuntimeUp {
reasons = append(reasons, "container runtime is down")
}
if !kl.networkConfigured {
if !networkConfigured {
reasons = append(reasons, "network not configured correctly")
}
newNodeReadyCondition = api.NodeCondition{
@@ -2065,6 +2071,12 @@ func (kl *Kubelet) containerRuntimeUp() bool {
return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now())
}
func (kl *Kubelet) doneNetworkConfigure() bool {
kl.networkConfigMutex.Lock()
defer kl.networkConfigMutex.Unlock()
return kl.networkConfigured
}
// tryUpdateNodeStatus tries to update node status to master. If ReconcileCBR0
// is set, this function will also confirm that cbr0 is configured correctly.
func (kl *Kubelet) tryUpdateNodeStatus() error {
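Two things change shape in this file: the hand-rolled for/sleep loop in syncNetworkStatus gives way to util.Until, and networkConfigured moves behind networkConfigMutex because it is now written from the util.Until goroutine and read from syncLoopIteration and setNodeStatus. For illustration only, a rough reimplementation of an Until-style helper; the real one lives in pkg/util and differs in detail.

    // Illustrative reimplementation of an Until-style helper: run f, wait
    // period, stop when stopCh closes. Not the pkg/util source.
    package main

    import (
        "fmt"
        "time"
    )

    func until(f func(), period time.Duration, stopCh <-chan struct{}) {
        for {
            select {
            case <-stopCh:
                return
            default:
            }
            f()
            select {
            case <-stopCh:
                return
            case <-time.After(period):
            }
        }
    }

    func main() {
        stop := make(chan struct{})
        go until(func() { fmt.Println("sync network status") }, 100*time.Millisecond, stop)
        time.Sleep(350 * time.Millisecond)
        close(stop)
    }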


@@ -127,6 +127,7 @@ func newTestKubelet(t *testing.T) *TestKubelet {
}
kubelet.volumeManager = newVolumeManager()
kubelet.containerManager, _ = newContainerManager(mockCadvisor, "", "", "")
kubelet.networkConfigured = true
return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient}
}


@@ -21,7 +21,6 @@ import (
"fmt"
"reflect"
"sync"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
@@ -60,8 +59,6 @@ func (s *statusManager) Start() {
err := s.syncBatch()
if err != nil {
glog.Warningf("Failed to updated pod status: %v", err)
// Errors and tight-looping are bad, m-kay
time.Sleep(30 * time.Second)
}
}, 0)
}
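The removed Sleep is safe to drop because syncBatch blocks until a status update is available, so a zero-period loop no longer spins. A toy model of that blocking-consumer shape; the statusUpdate and manager types here are invented for the sketch.

    // Toy model of a blocking batch sync: syncBatch parks on a channel until
    // an update arrives, so the caller can loop with no sleep. The types and
    // fields are invented for this sketch.
    package main

    import (
        "fmt"
        "time"
    )

    type statusUpdate struct {
        pod    string
        status string
    }

    type manager struct {
        updates chan statusUpdate
    }

    // syncBatch blocks when no updates are available.
    func (m *manager) syncBatch() error {
        u := <-m.updates
        fmt.Printf("synced %s -> %s\n", u.pod, u.status)
        return nil
    }

    func main() {
        m := &manager{updates: make(chan statusUpdate)}
        go func() {
            for {
                if err := m.syncBatch(); err != nil {
                    fmt.Println("Failed to update pod status:", err)
                }
            }
        }()
        m.updates <- statusUpdate{pod: "pod-a", status: "Running"}
        m.updates <- statusUpdate{pod: "pod-b", status: "Pending"}
        time.Sleep(100 * time.Millisecond)
    }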


@@ -198,6 +198,20 @@ func CompileRegexps(regexpStrings []string) ([]*regexp.Regexp, error) {
return regexps, nil
}
// Detects whether systemd is used as the init system.
// Please note that simply reading /proc/1/cmdline can be misleading because
// some installations of various init programs automatically make /sbin/init
// a symlink or even a renamed version of their main program.
// TODO(dchen1107): reliably detect the init system in use on the system:
// systemd, upstart, initd, etc.
func UsingSystemdInitSystem() bool {
if _, err := os.Stat("/run/systemd/system"); err == nil {
return true
}
return false
}
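The /run/systemd/system probe matches what systemd's own sd_booted() checks: that directory exists only while systemd is the running init. A small usage sketch in the spirit of the createCBR0 change above; it only prints the command it would run.

    // Usage sketch: choose the docker restart command based on the detected
    // init system, as createCBR0 now does. Printed, not executed.
    package main

    import (
        "fmt"
        "os"
        "os/exec"
    )

    func usingSystemd() bool {
        // systemd guarantees /run/systemd/system exists while it is PID 1;
        // sd_booted() performs this same check.
        _, err := os.Stat("/run/systemd/system")
        return err == nil
    }

    func main() {
        var cmd *exec.Cmd
        if usingSystemd() {
            cmd = exec.Command("systemctl", "restart", "docker")
        } else {
            cmd = exec.Command("service", "docker", "restart")
        }
        fmt.Println("would run:", cmd.Args)
    }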
// Writes 'value' to /proc/<pid>/oom_score_adj. PID = 0 means self
func ApplyOomScoreAdj(pid int, value int) error {
if value < -1000 || value > 1000 {