mirror of https://github.com/k3s-io/k3s
1785 lines
68 KiB
Go
1785 lines
68 KiB
Go
/*
|
|
Copyright 2017 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package ipvs
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"k8s.io/klog"
|
|
|
|
"k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
|
"k8s.io/apimachinery/pkg/util/wait"
|
|
"k8s.io/client-go/tools/record"
|
|
"k8s.io/kubernetes/pkg/proxy"
|
|
"k8s.io/kubernetes/pkg/proxy/healthcheck"
|
|
"k8s.io/kubernetes/pkg/proxy/metrics"
|
|
utilproxy "k8s.io/kubernetes/pkg/proxy/util"
|
|
"k8s.io/kubernetes/pkg/util/async"
|
|
"k8s.io/kubernetes/pkg/util/conntrack"
|
|
utilipset "k8s.io/kubernetes/pkg/util/ipset"
|
|
utiliptables "k8s.io/kubernetes/pkg/util/iptables"
|
|
utilipvs "k8s.io/kubernetes/pkg/util/ipvs"
|
|
utilnet "k8s.io/kubernetes/pkg/util/net"
|
|
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
|
|
utilexec "k8s.io/utils/exec"
|
|
)
|
|
|
|
const (
|
|
// kubeServicesChain is the services portal chain
|
|
kubeServicesChain utiliptables.Chain = "KUBE-SERVICES"
|
|
|
|
// KubeFireWallChain is the kubernetes firewall chain.
|
|
KubeFireWallChain utiliptables.Chain = "KUBE-FIREWALL"
|
|
|
|
// kubePostroutingChain is the kubernetes postrouting chain
|
|
kubePostroutingChain utiliptables.Chain = "KUBE-POSTROUTING"
|
|
|
|
// KubeMarkMasqChain is the mark-for-masquerade chain
|
|
KubeMarkMasqChain utiliptables.Chain = "KUBE-MARK-MASQ"
|
|
|
|
// KubeNodePortChain is the kubernetes node port chain
|
|
KubeNodePortChain utiliptables.Chain = "KUBE-NODE-PORT"
|
|
|
|
// KubeMarkDropChain is the mark-for-drop chain
|
|
KubeMarkDropChain utiliptables.Chain = "KUBE-MARK-DROP"
|
|
|
|
// KubeForwardChain is the kubernetes forward chain
|
|
KubeForwardChain utiliptables.Chain = "KUBE-FORWARD"
|
|
|
|
// KubeLoadBalancerChain is the kubernetes chain for loadbalancer type service
|
|
KubeLoadBalancerChain utiliptables.Chain = "KUBE-LOAD-BALANCER"
|
|
|
|
// DefaultScheduler is the default ipvs scheduler algorithm - round robin.
|
|
DefaultScheduler = "rr"
|
|
|
|
// DefaultDummyDevice is the default dummy interface which ipvs service address will bind to it.
|
|
DefaultDummyDevice = "kube-ipvs0"
|
|
)
|
|
|
|
// iptablesJumpChain is tables of iptables chains that ipvs proxier used to install iptables or cleanup iptables.
|
|
// `to` is the iptables chain we want to operate.
|
|
// `from` is the source iptables chain
|
|
var iptablesJumpChain = []struct {
|
|
table utiliptables.Table
|
|
from utiliptables.Chain
|
|
to utiliptables.Chain
|
|
comment string
|
|
}{
|
|
{utiliptables.TableNAT, utiliptables.ChainOutput, kubeServicesChain, "kubernetes service portals"},
|
|
{utiliptables.TableNAT, utiliptables.ChainPrerouting, kubeServicesChain, "kubernetes service portals"},
|
|
{utiliptables.TableNAT, utiliptables.ChainPostrouting, kubePostroutingChain, "kubernetes postrouting rules"},
|
|
{utiliptables.TableFilter, utiliptables.ChainForward, KubeForwardChain, "kubernetes forwarding rules"},
|
|
}
|
|
|
|
var iptablesChains = []struct {
|
|
table utiliptables.Table
|
|
chain utiliptables.Chain
|
|
}{
|
|
{utiliptables.TableNAT, kubeServicesChain},
|
|
{utiliptables.TableNAT, kubePostroutingChain},
|
|
{utiliptables.TableNAT, KubeFireWallChain},
|
|
{utiliptables.TableNAT, KubeNodePortChain},
|
|
{utiliptables.TableNAT, KubeLoadBalancerChain},
|
|
{utiliptables.TableNAT, KubeMarkMasqChain},
|
|
{utiliptables.TableFilter, KubeForwardChain},
|
|
}
|
|
|
|
// ipsetInfo is all ipset we needed in ipvs proxier
|
|
var ipsetInfo = []struct {
|
|
name string
|
|
setType utilipset.Type
|
|
comment string
|
|
}{
|
|
{kubeLoopBackIPSet, utilipset.HashIPPortIP, kubeLoopBackIPSetComment},
|
|
{kubeClusterIPSet, utilipset.HashIPPort, kubeClusterIPSetComment},
|
|
{kubeExternalIPSet, utilipset.HashIPPort, kubeExternalIPSetComment},
|
|
{kubeLoadBalancerSet, utilipset.HashIPPort, kubeLoadBalancerSetComment},
|
|
{kubeLoadbalancerFWSet, utilipset.HashIPPort, kubeLoadbalancerFWSetComment},
|
|
{kubeLoadBalancerLocalSet, utilipset.HashIPPort, kubeLoadBalancerLocalSetComment},
|
|
{kubeLoadBalancerSourceIPSet, utilipset.HashIPPortIP, kubeLoadBalancerSourceIPSetComment},
|
|
{kubeLoadBalancerSourceCIDRSet, utilipset.HashIPPortNet, kubeLoadBalancerSourceCIDRSetComment},
|
|
{kubeNodePortSetTCP, utilipset.BitmapPort, kubeNodePortSetTCPComment},
|
|
{kubeNodePortLocalSetTCP, utilipset.BitmapPort, kubeNodePortLocalSetTCPComment},
|
|
{kubeNodePortSetUDP, utilipset.BitmapPort, kubeNodePortSetUDPComment},
|
|
{kubeNodePortLocalSetUDP, utilipset.BitmapPort, kubeNodePortLocalSetUDPComment},
|
|
{kubeNodePortSetSCTP, utilipset.BitmapPort, kubeNodePortSetSCTPComment},
|
|
{kubeNodePortLocalSetSCTP, utilipset.BitmapPort, kubeNodePortLocalSetSCTPComment},
|
|
}
|
|
|
|
// ipsetWithIptablesChain is the ipsets list with iptables source chain and the chain jump to
|
|
// `iptables -t nat -A <from> -m set --match-set <name> <matchType> -j <to>`
|
|
// example: iptables -t nat -A KUBE-SERVICES -m set --match-set KUBE-NODE-PORT-TCP dst -j KUBE-NODE-PORT
|
|
// ipsets with other match rules will be created Individually.
|
|
// Note: kubeNodePortLocalSetTCP must be prior to kubeNodePortSetTCP, the same for UDP.
|
|
var ipsetWithIptablesChain = []struct {
|
|
name string
|
|
from string
|
|
to string
|
|
matchType string
|
|
protocolMatch string
|
|
}{
|
|
{kubeLoopBackIPSet, string(kubePostroutingChain), "MASQUERADE", "dst,dst,src", ""},
|
|
{kubeLoadBalancerSet, string(kubeServicesChain), string(KubeLoadBalancerChain), "dst,dst", ""},
|
|
{kubeLoadbalancerFWSet, string(KubeLoadBalancerChain), string(KubeFireWallChain), "dst,dst", ""},
|
|
{kubeLoadBalancerSourceCIDRSet, string(KubeFireWallChain), "RETURN", "dst,dst,src", ""},
|
|
{kubeLoadBalancerSourceIPSet, string(KubeFireWallChain), "RETURN", "dst,dst,src", ""},
|
|
{kubeLoadBalancerLocalSet, string(KubeLoadBalancerChain), "RETURN", "dst,dst", ""},
|
|
{kubeNodePortLocalSetTCP, string(KubeNodePortChain), "RETURN", "dst", "tcp"},
|
|
{kubeNodePortSetTCP, string(KubeNodePortChain), string(KubeMarkMasqChain), "dst", "tcp"},
|
|
{kubeNodePortLocalSetUDP, string(KubeNodePortChain), "RETURN", "dst", "udp"},
|
|
{kubeNodePortSetUDP, string(KubeNodePortChain), string(KubeMarkMasqChain), "dst", "udp"},
|
|
{kubeNodePortSetSCTP, string(kubeServicesChain), string(KubeNodePortChain), "dst", "sctp"},
|
|
{kubeNodePortLocalSetSCTP, string(KubeNodePortChain), "RETURN", "dst", "sctp"},
|
|
}
|
|
|
|
// In IPVS proxy mode, the following flags need to be set
|
|
const sysctlRouteLocalnet = "net/ipv4/conf/all/route_localnet"
|
|
const sysctlBridgeCallIPTables = "net/bridge/bridge-nf-call-iptables"
|
|
const sysctlVSConnTrack = "net/ipv4/vs/conntrack"
|
|
const sysctlConnReuse = "net/ipv4/vs/conn_reuse_mode"
|
|
const sysctlExpireNoDestConn = "net/ipv4/vs/expire_nodest_conn"
|
|
const sysctlExpireQuiescentTemplate = "net/ipv4/vs/expire_quiescent_template"
|
|
const sysctlForward = "net/ipv4/ip_forward"
|
|
const sysctlArpIgnore = "net/ipv4/conf/all/arp_ignore"
|
|
const sysctlArpAnnounce = "net/ipv4/conf/all/arp_announce"
|
|
|
|
// Proxier is an ipvs based proxy for connections between a localhost:lport
|
|
// and services that provide the actual backends.
|
|
type Proxier struct {
|
|
// endpointsChanges and serviceChanges contains all changes to endpoints and
|
|
// services that happened since last syncProxyRules call. For a single object,
|
|
// changes are accumulated, i.e. previous is state from before all of them,
|
|
// current is state after applying all of those.
|
|
endpointsChanges *proxy.EndpointChangeTracker
|
|
serviceChanges *proxy.ServiceChangeTracker
|
|
|
|
mu sync.Mutex // protects the following fields
|
|
serviceMap proxy.ServiceMap
|
|
endpointsMap proxy.EndpointsMap
|
|
portsMap map[utilproxy.LocalPort]utilproxy.Closeable
|
|
// endpointsSynced and servicesSynced are set to true when corresponding
|
|
// objects are synced after startup. This is used to avoid updating ipvs rules
|
|
// with some partial data after kube-proxy restart.
|
|
endpointsSynced bool
|
|
servicesSynced bool
|
|
initialized int32
|
|
syncRunner *async.BoundedFrequencyRunner // governs calls to syncProxyRules
|
|
|
|
// These are effectively const and do not need the mutex to be held.
|
|
syncPeriod time.Duration
|
|
minSyncPeriod time.Duration
|
|
// Values are CIDR's to exclude when cleaning up IPVS rules.
|
|
excludeCIDRs []string
|
|
iptables utiliptables.Interface
|
|
ipvs utilipvs.Interface
|
|
ipset utilipset.Interface
|
|
exec utilexec.Interface
|
|
masqueradeAll bool
|
|
masqueradeMark string
|
|
clusterCIDR string
|
|
hostname string
|
|
nodeIP net.IP
|
|
portMapper utilproxy.PortOpener
|
|
recorder record.EventRecorder
|
|
healthChecker healthcheck.Server
|
|
healthzServer healthcheck.HealthzUpdater
|
|
ipvsScheduler string
|
|
// Added as a member to the struct to allow injection for testing.
|
|
ipGetter IPGetter
|
|
// The following buffers are used to reuse memory and avoid allocations
|
|
// that are significantly impacting performance.
|
|
iptablesData *bytes.Buffer
|
|
filterChainsData *bytes.Buffer
|
|
natChains *bytes.Buffer
|
|
filterChains *bytes.Buffer
|
|
natRules *bytes.Buffer
|
|
filterRules *bytes.Buffer
|
|
// Added as a member to the struct to allow injection for testing.
|
|
netlinkHandle NetLinkHandle
|
|
// ipsetList is the list of ipsets that ipvs proxier used.
|
|
ipsetList map[string]*IPSet
|
|
// Values are as a parameter to select the interfaces which nodeport works.
|
|
nodePortAddresses []string
|
|
// networkInterfacer defines an interface for several net library functions.
|
|
// Inject for test purpose.
|
|
networkInterfacer utilproxy.NetworkInterfacer
|
|
gracefuldeleteManager *GracefulTerminationManager
|
|
}
|
|
|
|
// IPGetter helps get node network interface IP
|
|
type IPGetter interface {
|
|
NodeIPs() ([]net.IP, error)
|
|
}
|
|
|
|
// realIPGetter is a real NodeIP handler, it implements IPGetter.
|
|
type realIPGetter struct {
|
|
// nl is a handle for revoking netlink interface
|
|
nl NetLinkHandle
|
|
}
|
|
|
|
// NodeIPs returns all LOCAL type IP addresses from host which are taken as the Node IPs of NodePort service.
|
|
// It will list source IP exists in local route table with `kernel` protocol type, and filter out IPVS proxier
|
|
// created dummy device `kube-ipvs0` For example,
|
|
// $ ip route show table local type local proto kernel
|
|
// 10.0.0.1 dev kube-ipvs0 scope host src 10.0.0.1
|
|
// 10.0.0.10 dev kube-ipvs0 scope host src 10.0.0.10
|
|
// 10.0.0.252 dev kube-ipvs0 scope host src 10.0.0.252
|
|
// 100.106.89.164 dev eth0 scope host src 100.106.89.164
|
|
// 127.0.0.0/8 dev lo scope host src 127.0.0.1
|
|
// 127.0.0.1 dev lo scope host src 127.0.0.1
|
|
// 172.17.0.1 dev docker0 scope host src 172.17.0.1
|
|
// 192.168.122.1 dev virbr0 scope host src 192.168.122.1
|
|
// Then filter out dev==kube-ipvs0, and cut the unique src IP fields,
|
|
// Node IP set: [100.106.89.164, 127.0.0.1, 192.168.122.1]
|
|
func (r *realIPGetter) NodeIPs() (ips []net.IP, err error) {
|
|
// Pass in empty filter device name for list all LOCAL type addresses.
|
|
nodeAddress, err := r.nl.GetLocalAddresses("", DefaultDummyDevice)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error listing LOCAL type addresses from host, error: %v", err)
|
|
}
|
|
// translate ip string to IP
|
|
for _, ipStr := range nodeAddress.UnsortedList() {
|
|
ips = append(ips, net.ParseIP(ipStr))
|
|
}
|
|
return ips, nil
|
|
}
|
|
|
|
// Proxier implements ProxyProvider
|
|
var _ proxy.ProxyProvider = &Proxier{}
|
|
|
|
// NewProxier returns a new Proxier given an iptables and ipvs Interface instance.
|
|
// Because of the iptables and ipvs logic, it is assumed that there is only a single Proxier active on a machine.
|
|
// An error will be returned if it fails to update or acquire the initial lock.
|
|
// Once a proxier is created, it will keep iptables and ipvs rules up to date in the background and
|
|
// will not terminate if a particular iptables or ipvs call fails.
|
|
func NewProxier(ipt utiliptables.Interface,
|
|
ipvs utilipvs.Interface,
|
|
ipset utilipset.Interface,
|
|
sysctl utilsysctl.Interface,
|
|
exec utilexec.Interface,
|
|
syncPeriod time.Duration,
|
|
minSyncPeriod time.Duration,
|
|
excludeCIDRs []string,
|
|
masqueradeAll bool,
|
|
masqueradeBit int,
|
|
clusterCIDR string,
|
|
hostname string,
|
|
nodeIP net.IP,
|
|
recorder record.EventRecorder,
|
|
healthzServer healthcheck.HealthzUpdater,
|
|
scheduler string,
|
|
nodePortAddresses []string,
|
|
) (*Proxier, error) {
|
|
// Set the route_localnet sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlRouteLocalnet); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlRouteLocalnet, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlRouteLocalnet, err)
|
|
}
|
|
}
|
|
|
|
// Proxy needs br_netfilter and bridge-nf-call-iptables=1 when containers
|
|
// are connected to a Linux bridge (but not SDN bridges). Until most
|
|
// plugins handle this, log when config is missing
|
|
if val, err := sysctl.GetSysctl(sysctlBridgeCallIPTables); err == nil && val != 1 {
|
|
klog.Infof("missing br-netfilter module or unset sysctl br-nf-call-iptables; proxy may not work as intended")
|
|
}
|
|
|
|
// Set the conntrack sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlVSConnTrack); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlVSConnTrack, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlVSConnTrack, err)
|
|
}
|
|
}
|
|
|
|
// Set the connection reuse mode
|
|
if val, _ := sysctl.GetSysctl(sysctlConnReuse); val != 0 {
|
|
if err := sysctl.SetSysctl(sysctlConnReuse, 0); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlConnReuse, err)
|
|
}
|
|
}
|
|
|
|
// Set the expire_nodest_conn sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlExpireNoDestConn); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlExpireNoDestConn, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlExpireNoDestConn, err)
|
|
}
|
|
}
|
|
|
|
// Set the expire_quiescent_template sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlExpireQuiescentTemplate); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlExpireQuiescentTemplate, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlExpireQuiescentTemplate, err)
|
|
}
|
|
}
|
|
|
|
// Set the ip_forward sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlForward); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlForward, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlForward, err)
|
|
}
|
|
}
|
|
|
|
// Set the arp_ignore sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlArpIgnore); val != 1 {
|
|
if err := sysctl.SetSysctl(sysctlArpIgnore, 1); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlArpIgnore, err)
|
|
}
|
|
}
|
|
|
|
// Set the arp_announce sysctl we need for
|
|
if val, _ := sysctl.GetSysctl(sysctlArpAnnounce); val != 2 {
|
|
if err := sysctl.SetSysctl(sysctlArpAnnounce, 2); err != nil {
|
|
return nil, fmt.Errorf("can't set sysctl %s: %v", sysctlArpAnnounce, err)
|
|
}
|
|
}
|
|
|
|
// Generate the masquerade mark to use for SNAT rules.
|
|
masqueradeValue := 1 << uint(masqueradeBit)
|
|
masqueradeMark := fmt.Sprintf("%#08x/%#08x", masqueradeValue, masqueradeValue)
|
|
|
|
if nodeIP == nil {
|
|
klog.Warningf("invalid nodeIP, initializing kube-proxy with 127.0.0.1 as nodeIP")
|
|
nodeIP = net.ParseIP("127.0.0.1")
|
|
}
|
|
|
|
isIPv6 := utilnet.IsIPv6(nodeIP)
|
|
|
|
klog.V(2).Infof("nodeIP: %v, isIPv6: %v", nodeIP, isIPv6)
|
|
|
|
if len(clusterCIDR) == 0 {
|
|
klog.Warningf("clusterCIDR not specified, unable to distinguish between internal and external traffic")
|
|
} else if utilnet.IsIPv6CIDR(clusterCIDR) != isIPv6 {
|
|
return nil, fmt.Errorf("clusterCIDR %s has incorrect IP version: expect isIPv6=%t", clusterCIDR, isIPv6)
|
|
}
|
|
|
|
if len(scheduler) == 0 {
|
|
klog.Warningf("IPVS scheduler not specified, use %s by default", DefaultScheduler)
|
|
scheduler = DefaultScheduler
|
|
}
|
|
|
|
healthChecker := healthcheck.NewServer(hostname, recorder, nil, nil) // use default implementations of deps
|
|
|
|
proxier := &Proxier{
|
|
portsMap: make(map[utilproxy.LocalPort]utilproxy.Closeable),
|
|
serviceMap: make(proxy.ServiceMap),
|
|
serviceChanges: proxy.NewServiceChangeTracker(newServiceInfo, &isIPv6, recorder),
|
|
endpointsMap: make(proxy.EndpointsMap),
|
|
endpointsChanges: proxy.NewEndpointChangeTracker(hostname, nil, &isIPv6, recorder),
|
|
syncPeriod: syncPeriod,
|
|
minSyncPeriod: minSyncPeriod,
|
|
excludeCIDRs: excludeCIDRs,
|
|
iptables: ipt,
|
|
masqueradeAll: masqueradeAll,
|
|
masqueradeMark: masqueradeMark,
|
|
exec: exec,
|
|
clusterCIDR: clusterCIDR,
|
|
hostname: hostname,
|
|
nodeIP: nodeIP,
|
|
portMapper: &listenPortOpener{},
|
|
recorder: recorder,
|
|
healthChecker: healthChecker,
|
|
healthzServer: healthzServer,
|
|
ipvs: ipvs,
|
|
ipvsScheduler: scheduler,
|
|
ipGetter: &realIPGetter{nl: NewNetLinkHandle()},
|
|
iptablesData: bytes.NewBuffer(nil),
|
|
filterChainsData: bytes.NewBuffer(nil),
|
|
natChains: bytes.NewBuffer(nil),
|
|
natRules: bytes.NewBuffer(nil),
|
|
filterChains: bytes.NewBuffer(nil),
|
|
filterRules: bytes.NewBuffer(nil),
|
|
netlinkHandle: NewNetLinkHandle(),
|
|
ipset: ipset,
|
|
nodePortAddresses: nodePortAddresses,
|
|
networkInterfacer: utilproxy.RealNetwork{},
|
|
gracefuldeleteManager: NewGracefulTerminationManager(ipvs),
|
|
}
|
|
// initialize ipsetList with all sets we needed
|
|
proxier.ipsetList = make(map[string]*IPSet)
|
|
for _, is := range ipsetInfo {
|
|
proxier.ipsetList[is.name] = NewIPSet(ipset, is.name, is.setType, isIPv6, is.comment)
|
|
}
|
|
burstSyncs := 2
|
|
klog.V(3).Infof("minSyncPeriod: %v, syncPeriod: %v, burstSyncs: %d", minSyncPeriod, syncPeriod, burstSyncs)
|
|
proxier.syncRunner = async.NewBoundedFrequencyRunner("sync-runner", proxier.syncProxyRules, minSyncPeriod, syncPeriod, burstSyncs)
|
|
proxier.gracefuldeleteManager.Run()
|
|
return proxier, nil
|
|
}
|
|
|
|
// internal struct for string service information
|
|
type serviceInfo struct {
|
|
*proxy.BaseServiceInfo
|
|
// The following fields are computed and stored for performance reasons.
|
|
serviceNameString string
|
|
}
|
|
|
|
// returns a new proxy.ServicePort which abstracts a serviceInfo
|
|
func newServiceInfo(port *v1.ServicePort, service *v1.Service, baseInfo *proxy.BaseServiceInfo) proxy.ServicePort {
|
|
info := &serviceInfo{BaseServiceInfo: baseInfo}
|
|
|
|
// Store the following for performance reasons.
|
|
svcName := types.NamespacedName{Namespace: service.Namespace, Name: service.Name}
|
|
svcPortName := proxy.ServicePortName{NamespacedName: svcName, Port: port.Name}
|
|
info.serviceNameString = svcPortName.String()
|
|
|
|
return info
|
|
}
|
|
|
|
// KernelHandler can handle the current installed kernel modules.
|
|
type KernelHandler interface {
|
|
GetModules() ([]string, error)
|
|
}
|
|
|
|
// LinuxKernelHandler implements KernelHandler interface.
|
|
type LinuxKernelHandler struct {
|
|
executor utilexec.Interface
|
|
}
|
|
|
|
// NewLinuxKernelHandler initializes LinuxKernelHandler with exec.
|
|
func NewLinuxKernelHandler() *LinuxKernelHandler {
|
|
return &LinuxKernelHandler{
|
|
executor: utilexec.New(),
|
|
}
|
|
}
|
|
|
|
// GetModules returns all installed kernel modules.
|
|
func (handle *LinuxKernelHandler) GetModules() ([]string, error) {
|
|
// Check whether IPVS required kernel modules are built-in
|
|
kernelVersion, ipvsModules, err := utilipvs.GetKernelVersionAndIPVSMods(handle.executor)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
builtinModsFilePath := fmt.Sprintf("/lib/modules/%s/modules.builtin", kernelVersion)
|
|
b, err := ioutil.ReadFile(builtinModsFilePath)
|
|
if err != nil {
|
|
klog.Warningf("Failed to read file %s with error %v. You can ignore this message when kube-proxy is running inside container without mounting /lib/modules", builtinModsFilePath, err)
|
|
}
|
|
var bmods []string
|
|
for _, module := range ipvsModules {
|
|
if match, _ := regexp.Match(module+".ko", b); match {
|
|
bmods = append(bmods, module)
|
|
}
|
|
}
|
|
|
|
// Try to load IPVS required kernel modules using modprobe first
|
|
for _, kmod := range ipvsModules {
|
|
err := handle.executor.Command("modprobe", "--", kmod).Run()
|
|
if err != nil {
|
|
klog.Warningf("Failed to load kernel module %v with modprobe. "+
|
|
"You can ignore this message when kube-proxy is running inside container without mounting /lib/modules", kmod)
|
|
}
|
|
}
|
|
|
|
// Find out loaded kernel modules
|
|
out, err := handle.executor.Command("cut", "-f1", "-d", " ", "/proc/modules").CombinedOutput()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
mods := strings.Split(string(out), "\n")
|
|
return append(mods, bmods...), nil
|
|
}
|
|
|
|
// CanUseIPVSProxier returns true if we can use the ipvs Proxier.
|
|
// This is determined by checking if all the required kernel modules can be loaded. It may
|
|
// return an error if it fails to get the kernel modules information without error, in which
|
|
// case it will also return false.
|
|
func CanUseIPVSProxier(handle KernelHandler, ipsetver IPSetVersioner) (bool, error) {
|
|
mods, err := handle.GetModules()
|
|
if err != nil {
|
|
return false, fmt.Errorf("error getting installed ipvs required kernel modules: %v", err)
|
|
}
|
|
wantModules := sets.NewString()
|
|
loadModules := sets.NewString()
|
|
linuxKernelHandler := NewLinuxKernelHandler()
|
|
_, ipvsModules, _ := utilipvs.GetKernelVersionAndIPVSMods(linuxKernelHandler.executor)
|
|
wantModules.Insert(ipvsModules...)
|
|
loadModules.Insert(mods...)
|
|
modules := wantModules.Difference(loadModules).UnsortedList()
|
|
var missingMods []string
|
|
ConntrackiMissingCounter := 0
|
|
for _, mod := range modules {
|
|
if strings.Contains(mod, "nf_conntrack") {
|
|
ConntrackiMissingCounter++
|
|
} else {
|
|
missingMods = append(missingMods, mod)
|
|
}
|
|
}
|
|
if ConntrackiMissingCounter == 2 {
|
|
missingMods = append(missingMods, "nf_conntrack_ipv4(or nf_conntrack for Linux kernel 4.19 and later)")
|
|
}
|
|
|
|
if len(missingMods) != 0 {
|
|
return false, fmt.Errorf("IPVS proxier will not be used because the following required kernel modules are not loaded: %v", missingMods)
|
|
}
|
|
|
|
// Check ipset version
|
|
versionString, err := ipsetver.GetVersion()
|
|
if err != nil {
|
|
return false, fmt.Errorf("error getting ipset version, error: %v", err)
|
|
}
|
|
if !checkMinVersion(versionString) {
|
|
return false, fmt.Errorf("ipset version: %s is less than min required version: %s", versionString, MinIPSetCheckVersion)
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
// CleanupIptablesLeftovers removes all iptables rules and chains created by the Proxier
|
|
// It returns true if an error was encountered. Errors are logged.
|
|
func cleanupIptablesLeftovers(ipt utiliptables.Interface) (encounteredError bool) {
|
|
// Unlink the iptables chains created by ipvs Proxier
|
|
for _, jc := range iptablesJumpChain {
|
|
args := []string{
|
|
"-m", "comment", "--comment", jc.comment,
|
|
"-j", string(jc.to),
|
|
}
|
|
if err := ipt.DeleteRule(jc.table, jc.from, args...); err != nil {
|
|
if !utiliptables.IsNotFoundError(err) {
|
|
klog.Errorf("Error removing iptables rules in ipvs proxier: %v", err)
|
|
encounteredError = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Flush and remove all of our chains. Flushing all chains before removing them also removes all links between chains first.
|
|
for _, ch := range iptablesChains {
|
|
if err := ipt.FlushChain(ch.table, ch.chain); err != nil {
|
|
if !utiliptables.IsNotFoundError(err) {
|
|
klog.Errorf("Error removing iptables rules in ipvs proxier: %v", err)
|
|
encounteredError = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Remove all of our chains.
|
|
for _, ch := range iptablesChains {
|
|
if err := ipt.DeleteChain(ch.table, ch.chain); err != nil {
|
|
if !utiliptables.IsNotFoundError(err) {
|
|
klog.Errorf("Error removing iptables rules in ipvs proxier: %v", err)
|
|
encounteredError = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return encounteredError
|
|
}
|
|
|
|
// CleanupLeftovers clean up all ipvs and iptables rules created by ipvs Proxier.
|
|
func CleanupLeftovers(ipvs utilipvs.Interface, ipt utiliptables.Interface, ipset utilipset.Interface, cleanupIPVS bool) (encounteredError bool) {
|
|
if cleanupIPVS {
|
|
// Return immediately when ipvs interface is nil - Probably initialization failed in somewhere.
|
|
if ipvs == nil {
|
|
return true
|
|
}
|
|
encounteredError = false
|
|
err := ipvs.Flush()
|
|
if err != nil {
|
|
klog.Errorf("Error flushing IPVS rules: %v", err)
|
|
encounteredError = true
|
|
}
|
|
}
|
|
// Delete dummy interface created by ipvs Proxier.
|
|
nl := NewNetLinkHandle()
|
|
err := nl.DeleteDummyDevice(DefaultDummyDevice)
|
|
if err != nil {
|
|
klog.Errorf("Error deleting dummy device %s created by IPVS proxier: %v", DefaultDummyDevice, err)
|
|
encounteredError = true
|
|
}
|
|
// Clear iptables created by ipvs Proxier.
|
|
encounteredError = cleanupIptablesLeftovers(ipt) || encounteredError
|
|
// Destroy ip sets created by ipvs Proxier. We should call it after cleaning up
|
|
// iptables since we can NOT delete ip set which is still referenced by iptables.
|
|
for _, set := range ipsetInfo {
|
|
err = ipset.DestroySet(set.name)
|
|
if err != nil {
|
|
if !utilipset.IsNotFoundError(err) {
|
|
klog.Errorf("Error removing ipset %s, error: %v", set.name, err)
|
|
encounteredError = true
|
|
}
|
|
}
|
|
}
|
|
return encounteredError
|
|
}
|
|
|
|
// Sync is called to synchronize the proxier state to iptables and ipvs as soon as possible.
|
|
func (proxier *Proxier) Sync() {
|
|
proxier.syncRunner.Run()
|
|
}
|
|
|
|
// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
|
|
func (proxier *Proxier) SyncLoop() {
|
|
// Update healthz timestamp at beginning in case Sync() never succeeds.
|
|
if proxier.healthzServer != nil {
|
|
proxier.healthzServer.UpdateTimestamp()
|
|
}
|
|
proxier.syncRunner.Loop(wait.NeverStop)
|
|
}
|
|
|
|
func (proxier *Proxier) setInitialized(value bool) {
|
|
var initialized int32
|
|
if value {
|
|
initialized = 1
|
|
}
|
|
atomic.StoreInt32(&proxier.initialized, initialized)
|
|
}
|
|
|
|
func (proxier *Proxier) isInitialized() bool {
|
|
return atomic.LoadInt32(&proxier.initialized) > 0
|
|
}
|
|
|
|
// OnServiceAdd is called whenever creation of new service object is observed.
|
|
func (proxier *Proxier) OnServiceAdd(service *v1.Service) {
|
|
proxier.OnServiceUpdate(nil, service)
|
|
}
|
|
|
|
// OnServiceUpdate is called whenever modification of an existing service object is observed.
|
|
func (proxier *Proxier) OnServiceUpdate(oldService, service *v1.Service) {
|
|
if proxier.serviceChanges.Update(oldService, service) && proxier.isInitialized() {
|
|
proxier.syncRunner.Run()
|
|
}
|
|
}
|
|
|
|
// OnServiceDelete is called whenever deletion of an existing service object is observed.
|
|
func (proxier *Proxier) OnServiceDelete(service *v1.Service) {
|
|
proxier.OnServiceUpdate(service, nil)
|
|
}
|
|
|
|
// OnServiceSynced is called once all the initial even handlers were called and the state is fully propagated to local cache.
|
|
func (proxier *Proxier) OnServiceSynced() {
|
|
proxier.mu.Lock()
|
|
proxier.servicesSynced = true
|
|
proxier.setInitialized(proxier.servicesSynced && proxier.endpointsSynced)
|
|
proxier.mu.Unlock()
|
|
|
|
// Sync unconditionally - this is called once per lifetime.
|
|
proxier.syncProxyRules()
|
|
}
|
|
|
|
// OnEndpointsAdd is called whenever creation of new endpoints object is observed.
|
|
func (proxier *Proxier) OnEndpointsAdd(endpoints *v1.Endpoints) {
|
|
proxier.OnEndpointsUpdate(nil, endpoints)
|
|
}
|
|
|
|
// OnEndpointsUpdate is called whenever modification of an existing endpoints object is observed.
|
|
func (proxier *Proxier) OnEndpointsUpdate(oldEndpoints, endpoints *v1.Endpoints) {
|
|
if proxier.endpointsChanges.Update(oldEndpoints, endpoints) && proxier.isInitialized() {
|
|
proxier.syncRunner.Run()
|
|
}
|
|
}
|
|
|
|
// OnEndpointsDelete is called whenever deletion of an existing endpoints object is observed.
|
|
func (proxier *Proxier) OnEndpointsDelete(endpoints *v1.Endpoints) {
|
|
proxier.OnEndpointsUpdate(endpoints, nil)
|
|
}
|
|
|
|
// OnEndpointsSynced is called once all the initial event handlers were called and the state is fully propagated to local cache.
|
|
func (proxier *Proxier) OnEndpointsSynced() {
|
|
proxier.mu.Lock()
|
|
proxier.endpointsSynced = true
|
|
proxier.setInitialized(proxier.servicesSynced && proxier.endpointsSynced)
|
|
proxier.mu.Unlock()
|
|
|
|
// Sync unconditionally - this is called once per lifetime.
|
|
proxier.syncProxyRules()
|
|
}
|
|
|
|
// EntryInvalidErr indicates if an ipset entry is invalid or not
|
|
const EntryInvalidErr = "error adding entry %s to ipset %s"
|
|
|
|
// This is where all of the ipvs calls happen.
|
|
// assumes proxier.mu is held
|
|
func (proxier *Proxier) syncProxyRules() {
|
|
proxier.mu.Lock()
|
|
defer proxier.mu.Unlock()
|
|
|
|
start := time.Now()
|
|
defer func() {
|
|
metrics.SyncProxyRulesLatency.Observe(metrics.SinceInMicroseconds(start))
|
|
klog.V(4).Infof("syncProxyRules took %v", time.Since(start))
|
|
}()
|
|
// don't sync rules till we've received services and endpoints
|
|
if !proxier.endpointsSynced || !proxier.servicesSynced {
|
|
klog.V(2).Info("Not syncing ipvs rules until Services and Endpoints have been received from master")
|
|
return
|
|
}
|
|
|
|
// We assume that if this was called, we really want to sync them,
|
|
// even if nothing changed in the meantime. In other words, callers are
|
|
// responsible for detecting no-op changes and not calling this function.
|
|
serviceUpdateResult := proxy.UpdateServiceMap(proxier.serviceMap, proxier.serviceChanges)
|
|
endpointUpdateResult := proxy.UpdateEndpointsMap(proxier.endpointsMap, proxier.endpointsChanges)
|
|
|
|
staleServices := serviceUpdateResult.UDPStaleClusterIP
|
|
// merge stale services gathered from updateEndpointsMap
|
|
for _, svcPortName := range endpointUpdateResult.StaleServiceNames {
|
|
if svcInfo, ok := proxier.serviceMap[svcPortName]; ok && svcInfo != nil && svcInfo.GetProtocol() == v1.ProtocolUDP {
|
|
klog.V(2).Infof("Stale udp service %v -> %s", svcPortName, svcInfo.ClusterIPString())
|
|
staleServices.Insert(svcInfo.ClusterIPString())
|
|
}
|
|
}
|
|
|
|
klog.V(3).Infof("Syncing ipvs Proxier rules")
|
|
|
|
// Begin install iptables
|
|
|
|
// Reset all buffers used later.
|
|
// This is to avoid memory reallocations and thus improve performance.
|
|
proxier.natChains.Reset()
|
|
proxier.natRules.Reset()
|
|
proxier.filterChains.Reset()
|
|
proxier.filterRules.Reset()
|
|
|
|
// Write table headers.
|
|
writeLine(proxier.filterChains, "*filter")
|
|
writeLine(proxier.natChains, "*nat")
|
|
|
|
proxier.createAndLinkeKubeChain()
|
|
|
|
// make sure dummy interface exists in the system where ipvs Proxier will bind service address on it
|
|
_, err := proxier.netlinkHandle.EnsureDummyDevice(DefaultDummyDevice)
|
|
if err != nil {
|
|
klog.Errorf("Failed to create dummy interface: %s, error: %v", DefaultDummyDevice, err)
|
|
return
|
|
}
|
|
|
|
// make sure ip sets exists in the system.
|
|
for _, set := range proxier.ipsetList {
|
|
if err := ensureIPSet(set); err != nil {
|
|
return
|
|
}
|
|
set.resetEntries()
|
|
}
|
|
|
|
// Accumulate the set of local ports that we will be holding open once this update is complete
|
|
replacementPortsMap := map[utilproxy.LocalPort]utilproxy.Closeable{}
|
|
// activeIPVSServices represents IPVS service successfully created in this round of sync
|
|
activeIPVSServices := map[string]bool{}
|
|
// currentIPVSServices represent IPVS services listed from the system
|
|
currentIPVSServices := make(map[string]*utilipvs.VirtualServer)
|
|
// activeBindAddrs represents ip address successfully bind to DefaultDummyDevice in this round of sync
|
|
activeBindAddrs := map[string]bool{}
|
|
|
|
// Build IPVS rules for each service.
|
|
for svcName, svc := range proxier.serviceMap {
|
|
svcInfo, ok := svc.(*serviceInfo)
|
|
if !ok {
|
|
klog.Errorf("Failed to cast serviceInfo %q", svcName.String())
|
|
continue
|
|
}
|
|
protocol := strings.ToLower(string(svcInfo.Protocol))
|
|
// Precompute svcNameString; with many services the many calls
|
|
// to ServicePortName.String() show up in CPU profiles.
|
|
svcNameString := svcName.String()
|
|
|
|
// Handle traffic that loops back to the originator with SNAT.
|
|
for _, e := range proxier.endpointsMap[svcName] {
|
|
ep, ok := e.(*proxy.BaseEndpointInfo)
|
|
if !ok {
|
|
klog.Errorf("Failed to cast BaseEndpointInfo %q", e.String())
|
|
continue
|
|
}
|
|
if !ep.IsLocal {
|
|
continue
|
|
}
|
|
epIP := ep.IP()
|
|
epPort, err := ep.Port()
|
|
// Error parsing this endpoint has been logged. Skip to next endpoint.
|
|
if epIP == "" || err != nil {
|
|
continue
|
|
}
|
|
entry := &utilipset.Entry{
|
|
IP: epIP,
|
|
Port: epPort,
|
|
Protocol: protocol,
|
|
IP2: epIP,
|
|
SetType: utilipset.HashIPPortIP,
|
|
}
|
|
if valid := proxier.ipsetList[kubeLoopBackIPSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoopBackIPSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoopBackIPSet].activeEntries.Insert(entry.String())
|
|
}
|
|
|
|
// Capture the clusterIP.
|
|
// ipset call
|
|
entry := &utilipset.Entry{
|
|
IP: svcInfo.ClusterIP.String(),
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
SetType: utilipset.HashIPPort,
|
|
}
|
|
// add service Cluster IP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
|
|
// proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
|
|
if valid := proxier.ipsetList[kubeClusterIPSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeClusterIPSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeClusterIPSet].activeEntries.Insert(entry.String())
|
|
// ipvs call
|
|
serv := &utilipvs.VirtualServer{
|
|
Address: svcInfo.ClusterIP,
|
|
Port: uint16(svcInfo.Port),
|
|
Protocol: string(svcInfo.Protocol),
|
|
Scheduler: proxier.ipvsScheduler,
|
|
}
|
|
// Set session affinity flag and timeout for IPVS service
|
|
if svcInfo.SessionAffinityType == v1.ServiceAffinityClientIP {
|
|
serv.Flags |= utilipvs.FlagPersistent
|
|
serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds)
|
|
}
|
|
// We need to bind ClusterIP to dummy interface, so set `bindAddr` parameter to `true` in syncService()
|
|
if err := proxier.syncService(svcNameString, serv, true); err == nil {
|
|
activeIPVSServices[serv.String()] = true
|
|
activeBindAddrs[serv.Address.String()] = true
|
|
// ExternalTrafficPolicy only works for NodePort and external LB traffic, does not affect ClusterIP
|
|
// So we still need clusterIP rules in onlyNodeLocalEndpoints mode.
|
|
if err := proxier.syncEndpoint(svcName, false, serv); err != nil {
|
|
klog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
|
|
}
|
|
} else {
|
|
klog.Errorf("Failed to sync service: %v, err: %v", serv, err)
|
|
}
|
|
|
|
// Capture externalIPs.
|
|
for _, externalIP := range svcInfo.ExternalIPs {
|
|
if local, err := utilproxy.IsLocalIP(externalIP); err != nil {
|
|
klog.Errorf("can't determine if IP is local, assuming not: %v", err)
|
|
// We do not start listening on SCTP ports, according to our agreement in the
|
|
// SCTP support KEP
|
|
} else if local && (svcInfo.GetProtocol() != v1.ProtocolSCTP) {
|
|
lp := utilproxy.LocalPort{
|
|
Description: "externalIP for " + svcNameString,
|
|
IP: externalIP,
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
}
|
|
if proxier.portsMap[lp] != nil {
|
|
klog.V(4).Infof("Port %s was open before and is still needed", lp.String())
|
|
replacementPortsMap[lp] = proxier.portsMap[lp]
|
|
} else {
|
|
socket, err := proxier.portMapper.OpenLocalPort(&lp)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("can't open %s, skipping this externalIP: %v", lp.String(), err)
|
|
|
|
proxier.recorder.Eventf(
|
|
&v1.ObjectReference{
|
|
Kind: "Node",
|
|
Name: proxier.hostname,
|
|
UID: types.UID(proxier.hostname),
|
|
Namespace: "",
|
|
}, v1.EventTypeWarning, err.Error(), msg)
|
|
klog.Error(msg)
|
|
continue
|
|
}
|
|
replacementPortsMap[lp] = socket
|
|
}
|
|
} // We're holding the port, so it's OK to install IPVS rules.
|
|
|
|
// ipset call
|
|
entry := &utilipset.Entry{
|
|
IP: externalIP,
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
SetType: utilipset.HashIPPort,
|
|
}
|
|
// We have to SNAT packets to external IPs.
|
|
if valid := proxier.ipsetList[kubeExternalIPSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeExternalIPSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeExternalIPSet].activeEntries.Insert(entry.String())
|
|
|
|
// ipvs call
|
|
serv := &utilipvs.VirtualServer{
|
|
Address: net.ParseIP(externalIP),
|
|
Port: uint16(svcInfo.Port),
|
|
Protocol: string(svcInfo.Protocol),
|
|
Scheduler: proxier.ipvsScheduler,
|
|
}
|
|
if svcInfo.SessionAffinityType == v1.ServiceAffinityClientIP {
|
|
serv.Flags |= utilipvs.FlagPersistent
|
|
serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds)
|
|
}
|
|
if err := proxier.syncService(svcNameString, serv, true); err == nil {
|
|
activeIPVSServices[serv.String()] = true
|
|
activeBindAddrs[serv.Address.String()] = true
|
|
if err := proxier.syncEndpoint(svcName, false, serv); err != nil {
|
|
klog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
|
|
}
|
|
} else {
|
|
klog.Errorf("Failed to sync service: %v, err: %v", serv, err)
|
|
}
|
|
}
|
|
|
|
// Capture load-balancer ingress.
|
|
for _, ingress := range svcInfo.LoadBalancerStatus.Ingress {
|
|
if ingress.IP != "" {
|
|
// ipset call
|
|
entry = &utilipset.Entry{
|
|
IP: ingress.IP,
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
SetType: utilipset.HashIPPort,
|
|
}
|
|
// add service load balancer ingressIP:Port to kubeServiceAccess ip set for the purpose of solving hairpin.
|
|
// proxier.kubeServiceAccessSet.activeEntries.Insert(entry.String())
|
|
// If we are proxying globally, we need to masquerade in case we cross nodes.
|
|
// If we are proxying only locally, we can retain the source IP.
|
|
if valid := proxier.ipsetList[kubeLoadBalancerSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoadBalancerSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoadBalancerSet].activeEntries.Insert(entry.String())
|
|
// insert loadbalancer entry to lbIngressLocalSet if service externaltrafficpolicy=local
|
|
if svcInfo.OnlyNodeLocalEndpoints {
|
|
if valid := proxier.ipsetList[kubeLoadBalancerLocalSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoadBalancerLocalSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoadBalancerLocalSet].activeEntries.Insert(entry.String())
|
|
}
|
|
if len(svcInfo.LoadBalancerSourceRanges) != 0 {
|
|
// The service firewall rules are created based on ServiceSpec.loadBalancerSourceRanges field.
|
|
// This currently works for loadbalancers that preserves source ips.
|
|
// For loadbalancers which direct traffic to service NodePort, the firewall rules will not apply.
|
|
if valid := proxier.ipsetList[kubeLoadbalancerFWSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoadbalancerFWSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoadbalancerFWSet].activeEntries.Insert(entry.String())
|
|
allowFromNode := false
|
|
for _, src := range svcInfo.LoadBalancerSourceRanges {
|
|
// ipset call
|
|
entry = &utilipset.Entry{
|
|
IP: ingress.IP,
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
Net: src,
|
|
SetType: utilipset.HashIPPortNet,
|
|
}
|
|
// enumerate all white list source cidr
|
|
if valid := proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoadBalancerSourceCIDRSet].activeEntries.Insert(entry.String())
|
|
|
|
// ignore error because it has been validated
|
|
_, cidr, _ := net.ParseCIDR(src)
|
|
if cidr.Contains(proxier.nodeIP) {
|
|
allowFromNode = true
|
|
}
|
|
}
|
|
// generally, ip route rule was added to intercept request to loadbalancer vip from the
|
|
// loadbalancer's backend hosts. In this case, request will not hit the loadbalancer but loop back directly.
|
|
// Need to add the following rule to allow request on host.
|
|
if allowFromNode {
|
|
entry = &utilipset.Entry{
|
|
IP: ingress.IP,
|
|
Port: svcInfo.Port,
|
|
Protocol: protocol,
|
|
IP2: ingress.IP,
|
|
SetType: utilipset.HashIPPortIP,
|
|
}
|
|
// enumerate all white list source ip
|
|
if valid := proxier.ipsetList[kubeLoadBalancerSourceIPSet].validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, proxier.ipsetList[kubeLoadBalancerSourceIPSet].Name))
|
|
continue
|
|
}
|
|
proxier.ipsetList[kubeLoadBalancerSourceIPSet].activeEntries.Insert(entry.String())
|
|
}
|
|
}
|
|
|
|
// ipvs call
|
|
serv := &utilipvs.VirtualServer{
|
|
Address: net.ParseIP(ingress.IP),
|
|
Port: uint16(svcInfo.Port),
|
|
Protocol: string(svcInfo.Protocol),
|
|
Scheduler: proxier.ipvsScheduler,
|
|
}
|
|
if svcInfo.SessionAffinityType == v1.ServiceAffinityClientIP {
|
|
serv.Flags |= utilipvs.FlagPersistent
|
|
serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds)
|
|
}
|
|
if err := proxier.syncService(svcNameString, serv, true); err == nil {
|
|
// check if service need skip endpoints that not in same host as kube-proxy
|
|
onlyLocal := svcInfo.SessionAffinityType == v1.ServiceAffinityClientIP && svcInfo.OnlyNodeLocalEndpoints
|
|
activeIPVSServices[serv.String()] = true
|
|
activeBindAddrs[serv.Address.String()] = true
|
|
if err := proxier.syncEndpoint(svcName, onlyLocal, serv); err != nil {
|
|
klog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
|
|
}
|
|
} else {
|
|
klog.Errorf("Failed to sync service: %v, err: %v", serv, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
if svcInfo.NodePort != 0 {
|
|
addresses, err := utilproxy.GetNodeAddresses(proxier.nodePortAddresses, proxier.networkInterfacer)
|
|
if err != nil {
|
|
klog.Errorf("Failed to get node ip address matching nodeport cidr: %v", err)
|
|
continue
|
|
}
|
|
|
|
var lps []utilproxy.LocalPort
|
|
for address := range addresses {
|
|
lp := utilproxy.LocalPort{
|
|
Description: "nodePort for " + svcNameString,
|
|
IP: address,
|
|
Port: svcInfo.NodePort,
|
|
Protocol: protocol,
|
|
}
|
|
if utilproxy.IsZeroCIDR(address) {
|
|
// Empty IP address means all
|
|
lp.IP = ""
|
|
lps = append(lps, lp)
|
|
// If we encounter a zero CIDR, then there is no point in processing the rest of the addresses.
|
|
break
|
|
}
|
|
lps = append(lps, lp)
|
|
}
|
|
|
|
// For ports on node IPs, open the actual port and hold it.
|
|
for _, lp := range lps {
|
|
if proxier.portsMap[lp] != nil {
|
|
klog.V(4).Infof("Port %s was open before and is still needed", lp.String())
|
|
replacementPortsMap[lp] = proxier.portsMap[lp]
|
|
// We do not start listening on SCTP ports, according to our agreement in the
|
|
// SCTP support KEP
|
|
} else if svcInfo.GetProtocol() != v1.ProtocolSCTP {
|
|
socket, err := proxier.portMapper.OpenLocalPort(&lp)
|
|
if err != nil {
|
|
klog.Errorf("can't open %s, skipping this nodePort: %v", lp.String(), err)
|
|
continue
|
|
}
|
|
if lp.Protocol == "udp" {
|
|
isIPv6 := utilnet.IsIPv6(svcInfo.ClusterIP)
|
|
conntrack.ClearEntriesForPort(proxier.exec, lp.Port, isIPv6, v1.ProtocolUDP)
|
|
}
|
|
replacementPortsMap[lp] = socket
|
|
} // We're holding the port, so it's OK to install ipvs rules.
|
|
}
|
|
|
|
// Nodeports need SNAT, unless they're local.
|
|
// ipset call
|
|
entry = &utilipset.Entry{
|
|
// No need to provide ip info
|
|
Port: svcInfo.NodePort,
|
|
Protocol: protocol,
|
|
SetType: utilipset.BitmapPort,
|
|
}
|
|
var nodePortSet *IPSet
|
|
switch protocol {
|
|
case "tcp":
|
|
nodePortSet = proxier.ipsetList[kubeNodePortSetTCP]
|
|
case "udp":
|
|
nodePortSet = proxier.ipsetList[kubeNodePortSetUDP]
|
|
case "sctp":
|
|
nodePortSet = proxier.ipsetList[kubeNodePortSetSCTP]
|
|
default:
|
|
// It should never hit
|
|
klog.Errorf("Unsupported protocol type: %s", protocol)
|
|
}
|
|
if nodePortSet != nil {
|
|
if valid := nodePortSet.validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, nodePortSet.Name))
|
|
continue
|
|
}
|
|
nodePortSet.activeEntries.Insert(entry.String())
|
|
}
|
|
|
|
// Add externaltrafficpolicy=local type nodeport entry
|
|
if svcInfo.OnlyNodeLocalEndpoints {
|
|
var nodePortLocalSet *IPSet
|
|
switch protocol {
|
|
case "tcp":
|
|
nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetTCP]
|
|
case "udp":
|
|
nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetUDP]
|
|
case "sctp":
|
|
nodePortLocalSet = proxier.ipsetList[kubeNodePortLocalSetSCTP]
|
|
default:
|
|
// It should never hit
|
|
klog.Errorf("Unsupported protocol type: %s", protocol)
|
|
}
|
|
if nodePortLocalSet != nil {
|
|
if valid := nodePortLocalSet.validateEntry(entry); !valid {
|
|
klog.Errorf("%s", fmt.Sprintf(EntryInvalidErr, entry, nodePortLocalSet.Name))
|
|
continue
|
|
}
|
|
nodePortLocalSet.activeEntries.Insert(entry.String())
|
|
}
|
|
}
|
|
|
|
// Build ipvs kernel routes for each node ip address
|
|
var nodeIPs []net.IP
|
|
for address := range addresses {
|
|
if !utilproxy.IsZeroCIDR(address) {
|
|
nodeIPs = append(nodeIPs, net.ParseIP(address))
|
|
continue
|
|
}
|
|
// zero cidr
|
|
nodeIPs, err = proxier.ipGetter.NodeIPs()
|
|
if err != nil {
|
|
klog.Errorf("Failed to list all node IPs from host, err: %v", err)
|
|
}
|
|
}
|
|
for _, nodeIP := range nodeIPs {
|
|
// ipvs call
|
|
serv := &utilipvs.VirtualServer{
|
|
Address: nodeIP,
|
|
Port: uint16(svcInfo.NodePort),
|
|
Protocol: string(svcInfo.Protocol),
|
|
Scheduler: proxier.ipvsScheduler,
|
|
}
|
|
if svcInfo.SessionAffinityType == v1.ServiceAffinityClientIP {
|
|
serv.Flags |= utilipvs.FlagPersistent
|
|
serv.Timeout = uint32(svcInfo.StickyMaxAgeSeconds)
|
|
}
|
|
// There is no need to bind Node IP to dummy interface, so set parameter `bindAddr` to `false`.
|
|
if err := proxier.syncService(svcNameString, serv, false); err == nil {
|
|
activeIPVSServices[serv.String()] = true
|
|
if err := proxier.syncEndpoint(svcName, svcInfo.OnlyNodeLocalEndpoints, serv); err != nil {
|
|
klog.Errorf("Failed to sync endpoint for service: %v, err: %v", serv, err)
|
|
}
|
|
} else {
|
|
klog.Errorf("Failed to sync service: %v, err: %v", serv, err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// sync ipset entries
|
|
for _, set := range proxier.ipsetList {
|
|
set.syncIPSetEntries()
|
|
}
|
|
|
|
// Tail call iptables rules for ipset, make sure only call iptables once
|
|
// in a single loop per ip set.
|
|
proxier.writeIptablesRules()
|
|
|
|
// Sync iptables rules.
|
|
// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
|
|
proxier.iptablesData.Reset()
|
|
proxier.iptablesData.Write(proxier.natChains.Bytes())
|
|
proxier.iptablesData.Write(proxier.natRules.Bytes())
|
|
proxier.iptablesData.Write(proxier.filterChains.Bytes())
|
|
proxier.iptablesData.Write(proxier.filterRules.Bytes())
|
|
|
|
klog.V(5).Infof("Restoring iptables rules: %s", proxier.iptablesData.Bytes())
|
|
err = proxier.iptables.RestoreAll(proxier.iptablesData.Bytes(), utiliptables.NoFlushTables, utiliptables.RestoreCounters)
|
|
if err != nil {
|
|
klog.Errorf("Failed to execute iptables-restore: %v\nRules:\n%s", err, proxier.iptablesData.Bytes())
|
|
// Revert new local ports.
|
|
utilproxy.RevertPorts(replacementPortsMap, proxier.portsMap)
|
|
return
|
|
}
|
|
|
|
// Close old local ports and save new ones.
|
|
for k, v := range proxier.portsMap {
|
|
if replacementPortsMap[k] == nil {
|
|
v.Close()
|
|
}
|
|
}
|
|
proxier.portsMap = replacementPortsMap
|
|
|
|
// Get legacy bind address
|
|
// currentBindAddrs represents ip addresses bind to DefaultDummyDevice from the system
|
|
currentBindAddrs, err := proxier.netlinkHandle.ListBindAddress(DefaultDummyDevice)
|
|
if err != nil {
|
|
klog.Errorf("Failed to get bind address, err: %v", err)
|
|
}
|
|
legacyBindAddrs := proxier.getLegacyBindAddr(activeBindAddrs, currentBindAddrs)
|
|
|
|
// Clean up legacy IPVS services and unbind addresses
|
|
appliedSvcs, err := proxier.ipvs.GetVirtualServers()
|
|
if err == nil {
|
|
for _, appliedSvc := range appliedSvcs {
|
|
currentIPVSServices[appliedSvc.String()] = appliedSvc
|
|
}
|
|
} else {
|
|
klog.Errorf("Failed to get ipvs service, err: %v", err)
|
|
}
|
|
proxier.cleanLegacyService(activeIPVSServices, currentIPVSServices, legacyBindAddrs)
|
|
|
|
// Update healthz timestamp
|
|
if proxier.healthzServer != nil {
|
|
proxier.healthzServer.UpdateTimestamp()
|
|
}
|
|
|
|
// Update healthchecks. The endpoints list might include services that are
|
|
// not "OnlyLocal", but the services list will not, and the healthChecker
|
|
// will just drop those endpoints.
|
|
if err := proxier.healthChecker.SyncServices(serviceUpdateResult.HCServiceNodePorts); err != nil {
|
|
klog.Errorf("Error syncing healthcheck services: %v", err)
|
|
}
|
|
if err := proxier.healthChecker.SyncEndpoints(endpointUpdateResult.HCEndpointsLocalIPSize); err != nil {
|
|
klog.Errorf("Error syncing healthcheck endpoints: %v", err)
|
|
}
|
|
|
|
// Finish housekeeping.
|
|
// TODO: these could be made more consistent.
|
|
for _, svcIP := range staleServices.UnsortedList() {
|
|
if err := conntrack.ClearEntriesForIP(proxier.exec, svcIP, v1.ProtocolUDP); err != nil {
|
|
klog.Errorf("Failed to delete stale service IP %s connections, error: %v", svcIP, err)
|
|
}
|
|
}
|
|
proxier.deleteEndpointConnections(endpointUpdateResult.StaleEndpoints)
|
|
}
|
|
|
|
// writeIptablesRules write all iptables rules to proxier.natRules or proxier.FilterRules that ipvs proxier needed
|
|
// according to proxier.ipsetList information and the ipset match relationship that `ipsetWithIptablesChain` specified.
|
|
// some ipset(kubeClusterIPSet for example) have particular match rules and iptables jump relation should be sync separately.
|
|
func (proxier *Proxier) writeIptablesRules() {
|
|
// We are creating those slices ones here to avoid memory reallocations
|
|
// in every loop. Note that reuse the memory, instead of doing:
|
|
// slice = <some new slice>
|
|
// you should always do one of the below:
|
|
// slice = slice[:0] // and then append to it
|
|
// slice = append(slice[:0], ...)
|
|
// To avoid growing this slice, we arbitrarily set its size to 64,
|
|
// there is never more than that many arguments for a single line.
|
|
// Note that even if we go over 64, it will still be correct - it
|
|
// is just for efficiency, not correctness.
|
|
args := make([]string, 64)
|
|
|
|
for _, set := range ipsetWithIptablesChain {
|
|
if _, find := proxier.ipsetList[set.name]; find && !proxier.ipsetList[set.name].isEmpty() {
|
|
args = append(args[:0], "-A", set.from)
|
|
if set.protocolMatch != "" {
|
|
args = append(args, "-p", set.protocolMatch)
|
|
}
|
|
args = append(args,
|
|
"-m", "comment", "--comment", proxier.ipsetList[set.name].getComment(),
|
|
"-m", "set", "--match-set", set.name,
|
|
set.matchType,
|
|
)
|
|
writeLine(proxier.natRules, append(args, "-j", set.to)...)
|
|
}
|
|
}
|
|
|
|
if !proxier.ipsetList[kubeClusterIPSet].isEmpty() {
|
|
args = append(args[:0],
|
|
"-A", string(kubeServicesChain),
|
|
"-m", "comment", "--comment", proxier.ipsetList[kubeClusterIPSet].getComment(),
|
|
"-m", "set", "--match-set", kubeClusterIPSet,
|
|
)
|
|
if proxier.masqueradeAll {
|
|
writeLine(proxier.natRules, append(args, "dst,dst", "-j", string(KubeMarkMasqChain))...)
|
|
} else if len(proxier.clusterCIDR) > 0 {
|
|
// This masquerades off-cluster traffic to a service VIP. The idea
|
|
// is that you can establish a static route for your Service range,
|
|
// routing to any node, and that node will bridge into the Service
|
|
// for you. Since that might bounce off-node, we masquerade here.
|
|
// If/when we support "Local" policy for VIPs, we should update this.
|
|
writeLine(proxier.natRules, append(args, "dst,dst", "! -s", proxier.clusterCIDR, "-j", string(KubeMarkMasqChain))...)
|
|
} else {
|
|
// Masquerade all OUTPUT traffic coming from a service ip.
|
|
// The kube dummy interface has all service VIPs assigned which
|
|
// results in the service VIP being picked as the source IP to reach
|
|
// a VIP. This leads to a connection from VIP:<random port> to
|
|
// VIP:<service port>.
|
|
// Always masquerading OUTPUT (node-originating) traffic with a VIP
|
|
// source ip and service port destination fixes the outgoing connections.
|
|
writeLine(proxier.natRules, append(args, "src,dst", "-j", string(KubeMarkMasqChain))...)
|
|
}
|
|
}
|
|
|
|
if !proxier.ipsetList[kubeExternalIPSet].isEmpty() {
|
|
// Build masquerade rules for packets to external IPs.
|
|
args = append(args[:0],
|
|
"-A", string(kubeServicesChain),
|
|
"-m", "comment", "--comment", proxier.ipsetList[kubeExternalIPSet].getComment(),
|
|
"-m", "set", "--match-set", kubeExternalIPSet,
|
|
"dst,dst",
|
|
)
|
|
writeLine(proxier.natRules, append(args, "-j", string(KubeMarkMasqChain))...)
|
|
// Allow traffic for external IPs that does not come from a bridge (i.e. not from a container)
|
|
// nor from a local process to be forwarded to the service.
|
|
// This rule roughly translates to "all traffic from off-machine".
|
|
// This is imperfect in the face of network plugins that might not use a bridge, but we can revisit that later.
|
|
externalTrafficOnlyArgs := append(args,
|
|
"-m", "physdev", "!", "--physdev-is-in",
|
|
"-m", "addrtype", "!", "--src-type", "LOCAL")
|
|
writeLine(proxier.natRules, append(externalTrafficOnlyArgs, "-j", "ACCEPT")...)
|
|
dstLocalOnlyArgs := append(args, "-m", "addrtype", "--dst-type", "LOCAL")
|
|
// Allow traffic bound for external IPs that happen to be recognized as local IPs to stay local.
|
|
// This covers cases like GCE load-balancers which get added to the local routing table.
|
|
writeLine(proxier.natRules, append(dstLocalOnlyArgs, "-j", "ACCEPT")...)
|
|
}
|
|
|
|
// -A KUBE-SERVICES -m addrtype --dst-type LOCAL -j KUBE-NODE-PORT
|
|
args = append(args[:0],
|
|
"-A", string(kubeServicesChain),
|
|
"-m", "addrtype", "--dst-type", "LOCAL",
|
|
)
|
|
writeLine(proxier.natRules, append(args, "-j", string(KubeNodePortChain))...)
|
|
|
|
// mark drop for KUBE-LOAD-BALANCER
|
|
writeLine(proxier.natRules, []string{
|
|
"-A", string(KubeLoadBalancerChain),
|
|
"-j", string(KubeMarkMasqChain),
|
|
}...)
|
|
|
|
// mark drop for KUBE-FIRE-WALL
|
|
writeLine(proxier.natRules, []string{
|
|
"-A", string(KubeFireWallChain),
|
|
"-j", string(KubeMarkDropChain),
|
|
}...)
|
|
|
|
// Accept all traffic with destination of ipvs virtual service, in case other iptables rules
|
|
// block the traffic, that may result in ipvs rules invalid.
|
|
// Those rules must be in the end of KUBE-SERVICE chain
|
|
proxier.acceptIPVSTraffic()
|
|
|
|
// If the masqueradeMark has been added then we want to forward that same
|
|
// traffic, this allows NodePort traffic to be forwarded even if the default
|
|
// FORWARD policy is not accept.
|
|
writeLine(proxier.filterRules,
|
|
"-A", string(KubeForwardChain),
|
|
"-m", "comment", "--comment", `"kubernetes forwarding rules"`,
|
|
"-m", "mark", "--mark", proxier.masqueradeMark,
|
|
"-j", "ACCEPT",
|
|
)
|
|
|
|
// The following rules can only be set if clusterCIDR has been defined.
|
|
if len(proxier.clusterCIDR) != 0 {
|
|
// The following two rules ensure the traffic after the initial packet
|
|
// accepted by the "kubernetes forwarding rules" rule above will be
|
|
// accepted, to be as specific as possible the traffic must be sourced
|
|
// or destined to the clusterCIDR (to/from a pod).
|
|
writeLine(proxier.filterRules,
|
|
"-A", string(KubeForwardChain),
|
|
"-s", proxier.clusterCIDR,
|
|
"-m", "comment", "--comment", `"kubernetes forwarding conntrack pod source rule"`,
|
|
"-m", "conntrack",
|
|
"--ctstate", "RELATED,ESTABLISHED",
|
|
"-j", "ACCEPT",
|
|
)
|
|
writeLine(proxier.filterRules,
|
|
"-A", string(KubeForwardChain),
|
|
"-m", "comment", "--comment", `"kubernetes forwarding conntrack pod destination rule"`,
|
|
"-d", proxier.clusterCIDR,
|
|
"-m", "conntrack",
|
|
"--ctstate", "RELATED,ESTABLISHED",
|
|
"-j", "ACCEPT",
|
|
)
|
|
}
|
|
|
|
// Write the end-of-table markers.
|
|
writeLine(proxier.filterRules, "COMMIT")
|
|
writeLine(proxier.natRules, "COMMIT")
|
|
}
|
|
|
|
func (proxier *Proxier) acceptIPVSTraffic() {
|
|
sets := []string{kubeClusterIPSet, kubeLoadBalancerSet}
|
|
for _, set := range sets {
|
|
var matchType string
|
|
if !proxier.ipsetList[set].isEmpty() {
|
|
switch proxier.ipsetList[set].SetType {
|
|
case utilipset.BitmapPort:
|
|
matchType = "dst"
|
|
default:
|
|
matchType = "dst,dst"
|
|
}
|
|
writeLine(proxier.natRules, []string{
|
|
"-A", string(kubeServicesChain),
|
|
"-m", "set", "--match-set", set, matchType,
|
|
"-j", "ACCEPT",
|
|
}...)
|
|
}
|
|
}
|
|
}
|
|
|
|
// createAndLinkeKubeChain create all kube chains that ipvs proxier need and write basic link.
|
|
func (proxier *Proxier) createAndLinkeKubeChain() {
|
|
existingFilterChains := proxier.getExistingChains(proxier.filterChainsData, utiliptables.TableFilter)
|
|
existingNATChains := proxier.getExistingChains(proxier.iptablesData, utiliptables.TableNAT)
|
|
|
|
// Make sure we keep stats for the top-level chains
|
|
for _, ch := range iptablesChains {
|
|
if _, err := proxier.iptables.EnsureChain(ch.table, ch.chain); err != nil {
|
|
klog.Errorf("Failed to ensure that %s chain %s exists: %v", ch.table, ch.chain, err)
|
|
return
|
|
}
|
|
if ch.table == utiliptables.TableNAT {
|
|
if chain, ok := existingNATChains[ch.chain]; ok {
|
|
writeBytesLine(proxier.natChains, chain)
|
|
} else {
|
|
writeLine(proxier.natChains, utiliptables.MakeChainLine(kubePostroutingChain))
|
|
}
|
|
} else {
|
|
if chain, ok := existingFilterChains[KubeForwardChain]; ok {
|
|
writeBytesLine(proxier.filterChains, chain)
|
|
} else {
|
|
writeLine(proxier.filterChains, utiliptables.MakeChainLine(KubeForwardChain))
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, jc := range iptablesJumpChain {
|
|
args := []string{"-m", "comment", "--comment", jc.comment, "-j", string(jc.to)}
|
|
if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, jc.table, jc.from, args...); err != nil {
|
|
klog.Errorf("Failed to ensure that %s chain %s jumps to %s: %v", jc.table, jc.from, jc.to, err)
|
|
}
|
|
}
|
|
|
|
// Install the kubernetes-specific postrouting rules. We use a whole chain for
|
|
// this so that it is easier to flush and change, for example if the mark
|
|
// value should ever change.
|
|
writeLine(proxier.natRules, []string{
|
|
"-A", string(kubePostroutingChain),
|
|
"-m", "comment", "--comment", `"kubernetes service traffic requiring SNAT"`,
|
|
"-m", "mark", "--mark", proxier.masqueradeMark,
|
|
"-j", "MASQUERADE",
|
|
}...)
|
|
|
|
// Install the kubernetes-specific masquerade mark rule. We use a whole chain for
|
|
// this so that it is easier to flush and change, for example if the mark
|
|
// value should ever change.
|
|
writeLine(proxier.natRules, []string{
|
|
"-A", string(KubeMarkMasqChain),
|
|
"-j", "MARK", "--set-xmark", proxier.masqueradeMark,
|
|
}...)
|
|
}
|
|
|
|
// getExistingChains get iptables-save output so we can check for existing chains and rules.
|
|
// This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore
|
|
// Result may SHARE memory with contents of buffer.
|
|
func (proxier *Proxier) getExistingChains(buffer *bytes.Buffer, table utiliptables.Table) map[utiliptables.Chain][]byte {
|
|
buffer.Reset()
|
|
err := proxier.iptables.SaveInto(table, buffer)
|
|
if err != nil { // if we failed to get any rules
|
|
klog.Errorf("Failed to execute iptables-save, syncing all rules: %v", err)
|
|
} else { // otherwise parse the output
|
|
return utiliptables.GetChainLines(table, buffer.Bytes())
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// After a UDP endpoint has been removed, we must flush any pending conntrack entries to it, or else we
|
|
// risk sending more traffic to it, all of which will be lost (because UDP).
|
|
// This assumes the proxier mutex is held
|
|
func (proxier *Proxier) deleteEndpointConnections(connectionMap []proxy.ServiceEndpoint) {
|
|
for _, epSvcPair := range connectionMap {
|
|
if svcInfo, ok := proxier.serviceMap[epSvcPair.ServicePortName]; ok && svcInfo.GetProtocol() == v1.ProtocolUDP {
|
|
endpointIP := utilproxy.IPPart(epSvcPair.Endpoint)
|
|
err := conntrack.ClearEntriesForNAT(proxier.exec, svcInfo.ClusterIPString(), endpointIP, v1.ProtocolUDP)
|
|
if err != nil {
|
|
klog.Errorf("Failed to delete %s endpoint connections, error: %v", epSvcPair.ServicePortName.String(), err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (proxier *Proxier) syncService(svcName string, vs *utilipvs.VirtualServer, bindAddr bool) error {
|
|
appliedVirtualServer, _ := proxier.ipvs.GetVirtualServer(vs)
|
|
if appliedVirtualServer == nil || !appliedVirtualServer.Equal(vs) {
|
|
if appliedVirtualServer == nil {
|
|
// IPVS service is not found, create a new service
|
|
klog.V(3).Infof("Adding new service %q %s:%d/%s", svcName, vs.Address, vs.Port, vs.Protocol)
|
|
if err := proxier.ipvs.AddVirtualServer(vs); err != nil {
|
|
klog.Errorf("Failed to add IPVS service %q: %v", svcName, err)
|
|
return err
|
|
}
|
|
} else {
|
|
// IPVS service was changed, update the existing one
|
|
// During updates, service VIP will not go down
|
|
klog.V(3).Infof("IPVS service %s was changed", svcName)
|
|
if err := proxier.ipvs.UpdateVirtualServer(vs); err != nil {
|
|
klog.Errorf("Failed to update IPVS service, err:%v", err)
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
// bind service address to dummy interface even if service not changed,
|
|
// in case that service IP was removed by other processes
|
|
if bindAddr {
|
|
klog.V(4).Infof("Bind addr %s", vs.Address.String())
|
|
_, err := proxier.netlinkHandle.EnsureAddressBind(vs.Address.String(), DefaultDummyDevice)
|
|
if err != nil {
|
|
klog.Errorf("Failed to bind service address to dummy device %q: %v", svcName, err)
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (proxier *Proxier) syncEndpoint(svcPortName proxy.ServicePortName, onlyNodeLocalEndpoints bool, vs *utilipvs.VirtualServer) error {
|
|
appliedVirtualServer, err := proxier.ipvs.GetVirtualServer(vs)
|
|
if err != nil || appliedVirtualServer == nil {
|
|
klog.Errorf("Failed to get IPVS service, error: %v", err)
|
|
return err
|
|
}
|
|
|
|
// curEndpoints represents IPVS destinations listed from current system.
|
|
curEndpoints := sets.NewString()
|
|
// newEndpoints represents Endpoints watched from API Server.
|
|
newEndpoints := sets.NewString()
|
|
|
|
curDests, err := proxier.ipvs.GetRealServers(appliedVirtualServer)
|
|
if err != nil {
|
|
klog.Errorf("Failed to list IPVS destinations, error: %v", err)
|
|
return err
|
|
}
|
|
for _, des := range curDests {
|
|
curEndpoints.Insert(des.String())
|
|
}
|
|
|
|
for _, epInfo := range proxier.endpointsMap[svcPortName] {
|
|
if onlyNodeLocalEndpoints && !epInfo.GetIsLocal() {
|
|
continue
|
|
}
|
|
newEndpoints.Insert(epInfo.String())
|
|
}
|
|
|
|
// Create new endpoints
|
|
for _, ep := range newEndpoints.List() {
|
|
ip, port, err := net.SplitHostPort(ep)
|
|
if err != nil {
|
|
klog.Errorf("Failed to parse endpoint: %v, error: %v", ep, err)
|
|
continue
|
|
}
|
|
portNum, err := strconv.Atoi(port)
|
|
if err != nil {
|
|
klog.Errorf("Failed to parse endpoint port %s, error: %v", port, err)
|
|
continue
|
|
}
|
|
|
|
newDest := &utilipvs.RealServer{
|
|
Address: net.ParseIP(ip),
|
|
Port: uint16(portNum),
|
|
Weight: 1,
|
|
}
|
|
|
|
if curEndpoints.Has(ep) {
|
|
// check if newEndpoint is in gracefulDelete list, if true, delete this ep immediately
|
|
uniqueRS := GetUniqueRSName(vs, newDest)
|
|
if !proxier.gracefuldeleteManager.InTerminationList(uniqueRS) {
|
|
continue
|
|
}
|
|
klog.V(5).Infof("new ep %q is in graceful delete list", uniqueRS)
|
|
err := proxier.gracefuldeleteManager.MoveRSOutofGracefulDeleteList(uniqueRS)
|
|
if err != nil {
|
|
klog.Errorf("Failed to delete endpoint: %v in gracefulDeleteQueue, error: %v", ep, err)
|
|
continue
|
|
}
|
|
}
|
|
err = proxier.ipvs.AddRealServer(appliedVirtualServer, newDest)
|
|
if err != nil {
|
|
klog.Errorf("Failed to add destination: %v, error: %v", newDest, err)
|
|
continue
|
|
}
|
|
}
|
|
// Delete old endpoints
|
|
for _, ep := range curEndpoints.Difference(newEndpoints).UnsortedList() {
|
|
// if curEndpoint is in gracefulDelete, skip
|
|
uniqueRS := vs.String() + "/" + ep
|
|
if proxier.gracefuldeleteManager.InTerminationList(uniqueRS) {
|
|
continue
|
|
}
|
|
ip, port, err := net.SplitHostPort(ep)
|
|
if err != nil {
|
|
klog.Errorf("Failed to parse endpoint: %v, error: %v", ep, err)
|
|
continue
|
|
}
|
|
portNum, err := strconv.Atoi(port)
|
|
if err != nil {
|
|
klog.Errorf("Failed to parse endpoint port %s, error: %v", port, err)
|
|
continue
|
|
}
|
|
|
|
delDest := &utilipvs.RealServer{
|
|
Address: net.ParseIP(ip),
|
|
Port: uint16(portNum),
|
|
}
|
|
|
|
klog.V(5).Infof("Using graceful delete to delete: %v", uniqueRS)
|
|
err = proxier.gracefuldeleteManager.GracefulDeleteRS(appliedVirtualServer, delDest)
|
|
if err != nil {
|
|
klog.Errorf("Failed to delete destination: %v, error: %v", uniqueRS, err)
|
|
continue
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (proxier *Proxier) cleanLegacyService(activeServices map[string]bool, currentServices map[string]*utilipvs.VirtualServer, legacyBindAddrs map[string]bool) {
|
|
for cs := range currentServices {
|
|
svc := currentServices[cs]
|
|
if _, ok := activeServices[cs]; !ok {
|
|
// This service was not processed in the latest sync loop so before deleting it,
|
|
okayToDelete := true
|
|
rsList, _ := proxier.ipvs.GetRealServers(svc)
|
|
|
|
// If we still have real servers graceful termination is not done
|
|
if len(rsList) > 0 {
|
|
okayToDelete = false
|
|
}
|
|
// Applying graceful termination to all real servers
|
|
for _, rs := range rsList {
|
|
uniqueRS := GetUniqueRSName(svc, rs)
|
|
// If RS is already in the graceful termination list, no need to add it again
|
|
if proxier.gracefuldeleteManager.InTerminationList(uniqueRS) {
|
|
continue
|
|
}
|
|
klog.V(5).Infof("Using graceful delete to delete: %v", uniqueRS)
|
|
if err := proxier.gracefuldeleteManager.GracefulDeleteRS(svc, rs); err != nil {
|
|
klog.Errorf("Failed to delete destination: %v, error: %v", uniqueRS, err)
|
|
}
|
|
}
|
|
// make sure it does not fall within an excluded CIDR range.
|
|
for _, excludedCIDR := range proxier.excludeCIDRs {
|
|
// Any validation of this CIDR already should have occurred.
|
|
_, n, _ := net.ParseCIDR(excludedCIDR)
|
|
if n.Contains(svc.Address) {
|
|
okayToDelete = false
|
|
break
|
|
}
|
|
}
|
|
if okayToDelete {
|
|
klog.V(4).Infof("Delete service %s", svc.String())
|
|
if err := proxier.ipvs.DeleteVirtualServer(svc); err != nil {
|
|
klog.Errorf("Failed to delete service %s, error: %v", svc.String(), err)
|
|
}
|
|
addr := svc.Address.String()
|
|
if _, ok := legacyBindAddrs[addr]; ok {
|
|
klog.V(4).Infof("Unbinding address %s", addr)
|
|
if err := proxier.netlinkHandle.UnbindAddress(addr, DefaultDummyDevice); err != nil {
|
|
klog.Errorf("Failed to unbind service addr %s from dummy interface %s: %v", addr, DefaultDummyDevice, err)
|
|
} else {
|
|
// In case we delete a multi-port service, avoid trying to unbind multiple times
|
|
delete(legacyBindAddrs, addr)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (proxier *Proxier) getLegacyBindAddr(activeBindAddrs map[string]bool, currentBindAddrs []string) map[string]bool {
|
|
legacyAddrs := make(map[string]bool)
|
|
isIpv6 := utilnet.IsIPv6(proxier.nodeIP)
|
|
for _, addr := range currentBindAddrs {
|
|
addrIsIpv6 := utilnet.IsIPv6(net.ParseIP(addr))
|
|
if addrIsIpv6 && !isIpv6 || !addrIsIpv6 && isIpv6 {
|
|
continue
|
|
}
|
|
if _, ok := activeBindAddrs[addr]; !ok {
|
|
legacyAddrs[addr] = true
|
|
}
|
|
}
|
|
return legacyAddrs
|
|
}
|
|
|
|
// Join all words with spaces, terminate with newline and write to buff.
|
|
func writeLine(buf *bytes.Buffer, words ...string) {
|
|
// We avoid strings.Join for performance reasons.
|
|
for i := range words {
|
|
buf.WriteString(words[i])
|
|
if i < len(words)-1 {
|
|
buf.WriteByte(' ')
|
|
} else {
|
|
buf.WriteByte('\n')
|
|
}
|
|
}
|
|
}
|
|
|
|
func writeBytesLine(buf *bytes.Buffer, bytes []byte) {
|
|
buf.Write(bytes)
|
|
buf.WriteByte('\n')
|
|
}
|
|
|
|
// listenPortOpener opens ports by calling bind() and listen().
|
|
type listenPortOpener struct{}
|
|
|
|
// OpenLocalPort holds the given local port open.
|
|
func (l *listenPortOpener) OpenLocalPort(lp *utilproxy.LocalPort) (utilproxy.Closeable, error) {
|
|
return openLocalPort(lp)
|
|
}
|
|
|
|
func openLocalPort(lp *utilproxy.LocalPort) (utilproxy.Closeable, error) {
|
|
// For ports on node IPs, open the actual port and hold it, even though we
|
|
// use iptables to redirect traffic.
|
|
// This ensures a) that it's safe to use that port and b) that (a) stays
|
|
// true. The risk is that some process on the node (e.g. sshd or kubelet)
|
|
// is using a port and we give that same port out to a Service. That would
|
|
// be bad because iptables would silently claim the traffic but the process
|
|
// would never know.
|
|
// NOTE: We should not need to have a real listen()ing socket - bind()
|
|
// should be enough, but I can't figure out a way to e2e test without
|
|
// it. Tools like 'ss' and 'netstat' do not show sockets that are
|
|
// bind()ed but not listen()ed, and at least the default debian netcat
|
|
// has no way to avoid about 10 seconds of retries.
|
|
var socket utilproxy.Closeable
|
|
switch lp.Protocol {
|
|
case "tcp":
|
|
listener, err := net.Listen("tcp", net.JoinHostPort(lp.IP, strconv.Itoa(lp.Port)))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
socket = listener
|
|
case "udp":
|
|
addr, err := net.ResolveUDPAddr("udp", net.JoinHostPort(lp.IP, strconv.Itoa(lp.Port)))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
conn, err := net.ListenUDP("udp", addr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
socket = conn
|
|
default:
|
|
return nil, fmt.Errorf("unknown protocol %q", lp.Protocol)
|
|
}
|
|
klog.V(2).Infof("Opened local port %s", lp.String())
|
|
return socket, nil
|
|
}
|
|
|
|
// ipvs Proxier fall back on iptables when it needs to do SNAT for engress packets
|
|
// It will only operate iptables *nat table.
|
|
// Create and link the kube postrouting chain for SNAT packets.
|
|
// Chain POSTROUTING (policy ACCEPT)
|
|
// target prot opt source destination
|
|
// KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules *
|
|
// Maintain by kubelet network sync loop
|
|
|
|
// *nat
|
|
// :KUBE-POSTROUTING - [0:0]
|
|
// Chain KUBE-POSTROUTING (1 references)
|
|
// target prot opt source destination
|
|
// MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service traffic requiring SNAT */ mark match 0x4000/0x4000
|
|
|
|
// :KUBE-MARK-MASQ - [0:0]
|
|
// Chain KUBE-MARK-MASQ (0 references)
|
|
// target prot opt source destination
|
|
// MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK or 0x4000
|