From 7e175e8ad44e10f8e5c5df519ed1e1b3958e1e7b Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Mon, 17 May 2021 13:30:55 -0700 Subject: [PATCH] Handle conntrack-related sysctls in supervisor agent setup Signed-off-by: Brad Davidson --- go.mod | 2 + pkg/agent/run.go | 53 +++++++++++++++++++++- pkg/agent/syssetup/setup.go | 87 ++++++++++++++++++++++++++++++------- pkg/daemons/agent/agent.go | 11 +++-- scripts/download | 7 ++- vendor/modules.txt | 2 + 6 files changed, 139 insertions(+), 23 deletions(-) diff --git a/go.mod b/go.mod index 4ad889621d..0acd8fd465 100644 --- a/go.mod +++ b/go.mod @@ -26,6 +26,7 @@ replace ( // LOOK TO scripts/download FOR THE VERSION OF runc THAT WE ARE BUILDING/SHIPPING github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc93.0.20210414171415-3397a09ee932 github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20210316141917-a8c4a9ee0f6b + github.com/rancher/k3s/pkg/data => ./pkg/data go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201208200253-50621aee4aea golang.org/x/crypto => golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83 golang.org/x/net => golang.org/x/net v0.0.0-20210224082022-3d97a244fca7 @@ -78,6 +79,7 @@ require ( github.com/go-bindata/go-bindata v3.1.2+incompatible github.com/go-sql-driver/mysql v1.4.1 github.com/golangplus/testing v1.0.0 // indirect + github.com/google/cadvisor v0.39.0 github.com/google/tcpproxy v0.0.0-20180808230851-dfa16c61dad2 github.com/google/uuid v1.2.0 github.com/gorilla/mux v1.8.0 diff --git a/pkg/agent/run.go b/pkg/agent/run.go index 9b8449f167..ee36846200 100644 --- a/pkg/agent/run.go +++ b/pkg/agent/run.go @@ -38,7 +38,10 @@ import ( v1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/tools/clientcmd" "k8s.io/controller-manager/app" + app2 "k8s.io/kubernetes/cmd/kube-proxy/app" + kubeproxyconfig "k8s.io/kubernetes/pkg/proxy/apis/config" utilsnet "k8s.io/utils/net" + utilpointer "k8s.io/utils/pointer" ) const ( 
@@ -86,7 +89,12 @@ func run(ctx context.Context, cfg cmds.Agent, proxy proxy.Proxy) error { return errors.Wrap(err, "failed to validate node-ip") } - syssetup.Configure(dualCluster || dualService || dualNode) + enableIPv6 := dualCluster || dualService || dualNode + conntrackConfig, err := getConntrackConfig(nodeConfig) + if err != nil { + return errors.Wrap(err, "failed to validate kube-proxy conntrack configuration") + } + syssetup.Configure(enableIPv6, conntrackConfig) if err := setupCriCtlConfig(cfg, nodeConfig); err != nil { return err @@ -138,6 +146,49 @@ func run(ctx context.Context, cfg cmds.Agent, proxy proxy.Proxy) error { return ctx.Err() } +// getConntrackConfig uses the kube-proxy code to parse the user-provided kube-proxy-arg values, and +// extract the conntrack settings so that K3s can set them itself. This allows us to soft-fail when +// running K3s in Docker, where kube-proxy is no longer allowed to set conntrack sysctls on newer kernels. +// When running rootless, we do not attempt to set conntrack sysctls - this behavior is copied from kubeadm. 
+func getConntrackConfig(nodeConfig *daemonconfig.Node) (*kubeproxyconfig.KubeProxyConntrackConfiguration, error) {
+	ctConfig := &kubeproxyconfig.KubeProxyConntrackConfiguration{
+		MaxPerCore:            utilpointer.Int32Ptr(0),
+		Min:                   utilpointer.Int32Ptr(0),
+		TCPEstablishedTimeout: &metav1.Duration{},
+		TCPCloseWaitTimeout:   &metav1.Duration{},
+	}
+	// Rootless: return the zeroed config unchanged so that no conntrack sysctls are attempted.
+	if nodeConfig.AgentConfig.Rootless {
+		return ctConfig, nil
+	}
+	// Let kube-proxy's own flag set parse the user-supplied kube-proxy-arg values.
+	cmd := app2.NewProxyCommand()
+	if err := cmd.ParseFlags(daemonconfig.GetArgsList(map[string]string{}, nodeConfig.AgentConfig.ExtraKubeProxyArgs)); err != nil {
+		return nil, err
+	}
+	maxPerCore, err := cmd.Flags().GetInt32("conntrack-max-per-core")
+	if err != nil {
+		return nil, err
+	}
+	ctConfig.MaxPerCore = &maxPerCore
+	min, err := cmd.Flags().GetInt32("conntrack-min")
+	if err != nil {
+		return nil, err
+	}
+	ctConfig.Min = &min
+	establishedTimeout, err := cmd.Flags().GetDuration("conntrack-tcp-timeout-established")
+	if err != nil {
+		return nil, err
+	}
+	ctConfig.TCPEstablishedTimeout.Duration = establishedTimeout
+	closeWaitTimeout, err := cmd.Flags().GetDuration("conntrack-tcp-timeout-close-wait")
+	if err != nil {
+		return nil, err
+	}
+	ctConfig.TCPCloseWaitTimeout.Duration = closeWaitTimeout
+	return ctConfig, nil
+}
+
 func coreClient(cfg string) (kubernetes.Interface, error) {
 	restConfig, err := clientcmd.BuildConfigFromFlags("", cfg)
 	if err != nil {
diff --git a/pkg/agent/syssetup/setup.go b/pkg/agent/syssetup/setup.go
index 9a8aea4038..f5941d1be4 100644
--- a/pkg/agent/syssetup/setup.go
+++ b/pkg/agent/syssetup/setup.go
@@ -3,11 +3,16 @@
 package syssetup
 
 import (
-	"io/ioutil"
 	"os"
 	"os/exec"
+	"runtime"
+	"time"
 
+	"github.com/google/cadvisor/machine"
+	"github.com/google/cadvisor/utils/sysfs"
 	"github.com/sirupsen/logrus"
+	kubeproxyconfig "k8s.io/kubernetes/pkg/proxy/apis/config"
+	"k8s.io/kubernetes/pkg/util/sysctl"
 )
 
 func loadKernelModule(moduleName string) {
@@ -16,18 +21,14 @@ func loadKernelModule(moduleName string) {
 		return
 	}
 
-	if err := exec.Command("modprobe", moduleName).Run(); err != nil {
-		logrus.Warn("Failed to start " + moduleName + " module")
+	if err := exec.Command("modprobe", "--", moduleName).Run(); err != nil {
+		logrus.Warnf("Failed to load kernel module %v with modprobe: %v", moduleName, err)
 	}
 }
 
-func enableSystemControl(file string) {
-	if err := ioutil.WriteFile(file, []byte("1"), 0640); err != nil {
-		logrus.Warnf("Failed to write value 1 at "+file+": %v", err)
-	}
-}
-
-func Configure(enableIPv6 bool) {
+// Configure loads required kernel modules and sets sysctls required for other components to
+// function properly.
+func Configure(enableIPv6 bool, config *kubeproxyconfig.KubeProxyConntrackConfiguration) {
 	loadKernelModule("overlay")
 	loadKernelModule("nf_conntrack")
 	loadKernelModule("br_netfilter")
@@ -39,12 +40,66 @@ func Configure(enableIPv6 bool) {
 	// Kernel is inconsistent about how devconf is configured for
 	// new network namespaces between ipv4 and ipv6. Make sure to
 	// enable forwarding on all and default for both ipv4 and ipv6.
-	enableSystemControl("/proc/sys/net/ipv4/conf/all/forwarding")
-	enableSystemControl("/proc/sys/net/ipv4/conf/default/forwarding")
-	enableSystemControl("/proc/sys/net/bridge/bridge-nf-call-iptables")
+	sysctls := map[string]int{
+		"net/ipv4/conf/all/forwarding":       1,
+		"net/ipv4/conf/default/forwarding":   1,
+		"net/bridge/bridge-nf-call-iptables": 1,
+	}
+
 	if enableIPv6 {
-		enableSystemControl("/proc/sys/net/ipv6/conf/all/forwarding")
-		enableSystemControl("/proc/sys/net/ipv6/conf/default/forwarding")
-		enableSystemControl("/proc/sys/net/bridge/bridge-nf-call-ip6tables")
+		sysctls["net/ipv6/conf/all/forwarding"] = 1
+		sysctls["net/ipv6/conf/default/forwarding"] = 1
+		sysctls["net/bridge/bridge-nf-call-ip6tables"] = 1
+	}
+	// Conntrack sysctls are only set for values greater than zero; unset (nil) timeouts are skipped.
+	if conntrackMax := getConntrackMax(config); conntrackMax > 0 {
+		sysctls["net/netfilter/nf_conntrack_max"] = conntrackMax
+	}
+	if config.TCPEstablishedTimeout != nil && config.TCPEstablishedTimeout.Duration > 0 {
+		sysctls["net/netfilter/nf_conntrack_tcp_timeout_established"] = int(config.TCPEstablishedTimeout.Duration / time.Second)
+	}
+	if config.TCPCloseWaitTimeout != nil && config.TCPCloseWaitTimeout.Duration > 0 {
+		sysctls["net/netfilter/nf_conntrack_tcp_timeout_close_wait"] = int(config.TCPCloseWaitTimeout.Duration / time.Second)
+	}
+	// Write only the sysctls whose current value differs; failures are logged and do not abort setup.
+	sys := sysctl.New()
+	for entry, value := range sysctls {
+		if val, _ := sys.GetSysctl(entry); val != value {
+			logrus.Infof("Set sysctl '%v' to %v", entry, value)
+			if err := sys.SetSysctl(entry, value); err != nil {
+				logrus.Errorf("Failed to set sysctl: %v", err)
+			}
+		}
 	}
 }
+
+// getConntrackMax is cribbed from kube-proxy, as recent kernels no longer allow non-init namespaces
+// to set conntrack-related sysctls.
+// ref: https://github.com/kubernetes/kubernetes/blob/v1.21.1/cmd/kube-proxy/app/server.go#L780
+// ref: https://github.com/kubernetes-sigs/kind/issues/2240
+func getConntrackMax(config *kubeproxyconfig.KubeProxyConntrackConfiguration) int {
+	if config.MaxPerCore == nil || *config.MaxPerCore <= 0 {
+		return 0
+	}
+	minimum := 0
+	if config.Min != nil {
+		minimum = int(*config.Min)
+	}
+	perCoreTotal := int(*config.MaxPerCore) * detectNumCPU()
+	if perCoreTotal <= minimum {
+		logrus.Debugf("getConntrackMax: using conntrack-min")
+		return minimum
+	}
+	logrus.Debugf("getConntrackMax: using scaled conntrack-max-per-core")
+	return perCoreTotal
+}
+
+// detectNumCPU is also cribbed from kube-proxy; it falls back to runtime.NumCPU.
+func detectNumCPU() int {
+	// Read the CPU count from /sys first, due to a known issue (https://github.com/kubernetes/kubernetes/issues/99225)
+	_, cpus, topoErr := machine.GetTopology(sysfs.NewRealSysFs())
+	if topoErr == nil && cpus >= 1 {
+		return cpus
+	}
+	return runtime.NumCPU()
+}
diff --git a/pkg/daemons/agent/agent.go b/pkg/daemons/agent/agent.go
index 9701a93bb3..b8fa6e2449 100644
--- a/pkg/daemons/agent/agent.go
+++ b/pkg/daemons/agent/agent.go
@@ -45,10 +45,13 @@ func Agent(config *config.Agent) error {
 
 func startKubeProxy(cfg *config.Agent) error {
 	argsMap := map[string]string{
-		"proxy-mode":           "iptables",
-		"healthz-bind-address": "127.0.0.1",
-		"kubeconfig":           cfg.KubeConfigKubeProxy,
-		"cluster-cidr":         util.JoinIPNets(cfg.ClusterCIDRs),
+		"proxy-mode":                        "iptables",
+		"healthz-bind-address":              "127.0.0.1",
+		"kubeconfig":                        cfg.KubeConfigKubeProxy,
+		"cluster-cidr":                      util.JoinIPNets(cfg.ClusterCIDRs),
+		"conntrack-max-per-core":            "0",
+		"conntrack-tcp-timeout-established": "0s",
+		"conntrack-tcp-timeout-close-wait":  "0s",
 	}
 	if cfg.NodeName != "" {
 		argsMap["hostname-override"] = cfg.NodeName
diff --git a/scripts/download b/scripts/download
index 9622cb8e8a..9628f72a13 100755
--- a/scripts/download
+++ b/scripts/download
@@ -8,17 +8,20 @@ RUNC_VERSION=v1.0.0-rc94
 ROOT_VERSION=v0.8.1
TRAEFIK_VERSION=9.18.2 # appVersion: 2.4.8 CHARTS_DIR=build/static/charts +RUNC_DIR=build/src/github.com/opencontainers/runc DATA_DIR=build/data export TZ=UTC umask 022 +rm -rf ${CHARTS_DIR} +rm -rf ${RUNC_DIR} mkdir -p ${CHARTS_DIR} mkdir -p ${DATA_DIR} curl --compressed -sfL https://github.com/k3s-io/k3s-root/releases/download/${ROOT_VERSION}/k3s-root-${ARCH}.tar | tar xf - -git clone --depth=1 https://github.com/opencontainers/runc build/src/github.com/opencontainers/runc || true -pushd build/src/github.com/opencontainers/runc +git clone --depth=1 https://github.com/opencontainers/runc ${RUNC_DIR} || true +pushd ${RUNC_DIR} git fetch --all --tags git checkout ${RUNC_VERSION} -b k3s popd diff --git a/vendor/modules.txt b/vendor/modules.txt index eb8f6c4ad3..bfa5065a4a 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -554,6 +554,7 @@ github.com/golang/snappy # github.com/google/btree v1.0.0 github.com/google/btree # github.com/google/cadvisor v0.39.0 +## explicit github.com/google/cadvisor/accelerators github.com/google/cadvisor/cache/memory github.com/google/cadvisor/collector @@ -3195,6 +3196,7 @@ sigs.k8s.io/yaml # github.com/matryer/moq => github.com/rancher/moq v0.0.0-20190404221404-ee5226d43009 # github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc93.0.20210414171415-3397a09ee932 # github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20210316141917-a8c4a9ee0f6b +# github.com/rancher/k3s/pkg/data => ./pkg/data # go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201208200253-50621aee4aea # golang.org/x/crypto => golang.org/x/crypto v0.0.0-20210220033148-5ea612d1eb83 # golang.org/x/net => golang.org/x/net v0.0.0-20210224082022-3d97a244fca7