2019-01-12 04:58:27 +00:00
/ *
Copyright 2014 The Kubernetes Authors .
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package userspace
import (
"fmt"
"net"
2019-08-30 18:33:25 +00:00
"reflect"
2019-01-12 04:58:27 +00:00
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
2019-08-30 18:33:25 +00:00
libcontainersystem "github.com/opencontainers/runc/libcontainer/system"
2019-09-27 21:51:53 +00:00
v1 "k8s.io/api/core/v1"
2019-01-12 04:58:27 +00:00
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
2019-12-12 01:27:03 +00:00
servicehelper "k8s.io/cloud-provider/service/helpers"
2020-08-10 17:43:49 +00:00
"k8s.io/klog/v2"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/proxy"
2019-09-27 21:51:53 +00:00
"k8s.io/kubernetes/pkg/proxy/config"
2019-01-12 04:58:27 +00:00
utilproxy "k8s.io/kubernetes/pkg/proxy/util"
2019-08-30 18:33:25 +00:00
"k8s.io/kubernetes/pkg/util/async"
2019-01-12 04:58:27 +00:00
"k8s.io/kubernetes/pkg/util/conntrack"
"k8s.io/kubernetes/pkg/util/iptables"
utilexec "k8s.io/utils/exec"
2020-03-26 21:07:15 +00:00
netutils "k8s.io/utils/net"
2019-01-12 04:58:27 +00:00
)
// portal describes one IP/port pair on which a service is reachable and for
// which iptables rules are installed by openOnePortal.
type portal struct {
// ip is the address of the portal (cluster IP, external IP or load-balancer
// ingress IP, depending on who built the value).
ip net . IP
// port is the service port exposed on ip.
port int
// isExternal is set to true for portals built from a service's externalIPs;
// openOnePortal installs additional rules for local traffic in that case.
isExternal bool
}
// ServiceInfo contains information and state for a particular proxied service
// port. One instance is created per service port by addServiceOnPortInternal
// and stored in Proxier.serviceMap.
type ServiceInfo struct {
// Timeout is the read/write timeout (used for UDP connections)
Timeout time . Duration
// ActiveClients is the cache of active UDP clients being proxied by this proxy for this service
ActiveClients * ClientCache
// isAliveAtomic is created as 1 and cleared by setAlive(false) when the
// proxy for this service is stopped.
isAliveAtomic int32 // Only access this with atomic ops
// portal is the cluster IP and service port of the service.
portal portal
// protocol is the protocol of the service port.
protocol v1 . Protocol
// proxyPort is the local port the proxy socket is actually listening on.
proxyPort int
// socket is the listening proxy socket for this service port.
socket ProxySocket
// nodePort is the service's NodePort, or 0 when none is set.
nodePort int
// loadBalancerStatus is a deep copy of the service's load-balancer status.
loadBalancerStatus v1 . LoadBalancerStatus
// sessionAffinityType is the service's session affinity setting.
sessionAffinityType v1 . ServiceAffinity
// stickyMaxAgeSeconds is the ClientIP affinity timeout; it is only filled
// in when sessionAffinityType is ClientIP.
stickyMaxAgeSeconds int
// Deprecated, but required for back-compat (including e2e)
externalIPs [ ] string
2020-12-01 01:06:26 +00:00
// isStartedAtomic is set to non-zero when the service's socket begins
// accepting requests. Used in testcases. Only access this with atomic ops.
isStartedAtomic int32
// isFinishedAtomic is set to non-zero when the service's socket shuts
// down. Used in testcases. Only access this with atomic ops.
isFinishedAtomic int32
}
// setStarted atomically flags the service as started by storing 1 in
// isStartedAtomic.
func (info *ServiceInfo) setStarted() {
	const started int32 = 1
	atomic.StoreInt32(&info.isStartedAtomic, started)
}
// IsStarted reports whether the started flag (isStartedAtomic) is non-zero.
func (info *ServiceInfo) IsStarted() bool {
	flag := atomic.LoadInt32(&info.isStartedAtomic)
	return flag != 0
}
// setFinished atomically flags the service as finished by storing 1 in
// isFinishedAtomic.
func (info *ServiceInfo) setFinished() {
	const finished int32 = 1
	atomic.StoreInt32(&info.isFinishedAtomic, finished)
}
func ( info * ServiceInfo ) IsFinished ( ) bool {
return atomic . LoadInt32 ( & info . isFinishedAtomic ) != 0
2019-01-12 04:58:27 +00:00
}
// setAlive atomically stores 1 in isAliveAtomic when b is true and 0 when b
// is false.
func (info *ServiceInfo) setAlive(b bool) {
	if b {
		atomic.StoreInt32(&info.isAliveAtomic, 1)
		return
	}
	atomic.StoreInt32(&info.isAliveAtomic, 0)
}
// IsAlive reports whether the alive flag (isAliveAtomic) is non-zero.
func (info *ServiceInfo) IsAlive() bool {
	flag := atomic.LoadInt32(&info.isAliveAtomic)
	return flag != 0
}
// logTimeout returns true, after logging at verbosity 3, when err is a
// net.Error whose Timeout method reports true. Any other error (including
// nil) yields false without logging.
func logTimeout(err error) bool {
	netErr, isNetErr := err.(net.Error)
	if !isNetErr || !netErr.Timeout() {
		return false
	}
	klog.V(3).Infof("connection to endpoint closed due to inactivity")
	return true
}
// ProxySocketFunc is a function which constructs a ProxySocket from a protocol, ip, and port.
// The Proxier calls it with its listen IP and the proxy port allocated for a
// service; a non-nil error aborts starting the proxy for that service port.
type ProxySocketFunc func ( protocol v1 . Protocol , ip net . IP , port int ) ( ProxySocket , error )
2019-08-30 18:33:25 +00:00
// numBurstSyncs is the burst value handed to the bounded-frequency runner that
// drives syncProxyRules (see createProxier).
const numBurstSyncs int = 2
// serviceChange is the collapsed, not-yet-applied delta for one service:
// the state before the first queued event and the state after the latest one.
type serviceChange struct {
// current is the most recently observed service object, or nil if deleted.
current * v1 . Service
// previous is the service object from before the first queued event, or nil
// if the service was newly added.
previous * v1 . Service
}
// asyncRunnerInterface is the subset of the async runner used by the Proxier;
// it is abstracted as an interface for testing.
type asyncRunnerInterface interface {
// Run requests a sync; the Proxier calls it from Sync and whenever a
// service change is queued.
Run ( )
// Loop is called from SyncLoop with the Proxier's stop channel.
Loop ( <- chan struct { } )
}
2019-01-12 04:58:27 +00:00
// Proxier is a simple userspace proxy for connections between a local proxy
// port and the services that provide the actual implementations. It keeps one
// listening ProxySocket per service port and the iptables portal rules that
// redirect service traffic to those sockets.
type Proxier struct {
2019-09-27 21:51:53 +00:00
// EndpointSlice support has not been added for this proxier yet.
config . NoopEndpointSliceHandler
2019-12-12 01:27:03 +00:00
// TODO(imroc): implement node handler for userspace proxier.
config . NoopNodeHandler
2019-09-27 21:51:53 +00:00
2019-01-12 04:58:27 +00:00
// loadBalancer receives endpoint events and is handed to each socket's ProxyLoop.
loadBalancer LoadBalancer
mu sync . Mutex // protects serviceMap
// serviceMap holds the ServiceInfo of every service port currently proxied.
serviceMap map [ proxy . ServicePortName ] * ServiceInfo
// syncPeriod and minSyncPeriod are passed to the bounded-frequency sync runner.
syncPeriod time . Duration
2019-08-30 18:33:25 +00:00
minSyncPeriod time . Duration
2019-01-12 04:58:27 +00:00
// udpIdleTimeout is used as the Timeout of every ServiceInfo created by mergeService.
udpIdleTimeout time . Duration
// portMapMutex presumably guards portMap; its users are not visible here — verify.
portMapMutex sync . Mutex
portMap map [ portMapKey ] * portMapValue
// listenIP is the address proxy sockets are created on and the proxy IP
// passed when opening portals.
listenIP net . IP
iptables iptables . Interface
// hostIP is listenIP, or an address chosen from the host's interfaces when
// listenIP is the unspecified address (see NewCustomProxier).
hostIP net . IP
2020-03-26 21:07:15 +00:00
// localAddrs is the set of local addresses, refreshed on every syncProxyRules.
localAddrs netutils . IPSet
2019-01-12 04:58:27 +00:00
// proxyPorts allocates and releases the local ports used by proxy sockets.
proxyPorts PortAllocator
// makeProxySocket constructs the ProxySocket for a new service port.
makeProxySocket ProxySocketFunc
// exec is used to run conntrack when clearing stale UDP entries.
exec utilexec . Interface
2019-08-30 18:33:25 +00:00
// endpointsSynced and servicesSynced are set to 1 when the corresponding
// objects are synced after startup. This is used to avoid updating iptables
// with some partial data after kube-proxy restart.
// initialized is set to 1 once both of them are set.
endpointsSynced int32
servicesSynced int32
initialized int32
// protects serviceChanges
serviceChangesLock sync . Mutex
serviceChanges map [ types . NamespacedName ] * serviceChange // map of service changes
syncRunner asyncRunnerInterface // governs calls to syncProxyRules
// stopChan is handed to syncRunner.Loop and closed by shutdown.
stopChan chan struct { }
2019-01-12 04:58:27 +00:00
}
2019-09-27 21:51:53 +00:00
// Compile-time assertion that *Proxier implements proxy.Provider.
var _ proxy . Provider = & Proxier { }
2019-01-12 04:58:27 +00:00
// portMapKey is the key type of Proxier.portMap. The ip has to be a string
// because slices (and net.IP is a byte slice) can't be map keys.
type portMapKey struct {
// ip is the textual form of the IP address.
ip string
// port is the port number on ip.
port int
// protocol is the protocol of the port.
protocol v1 . Protocol
}
// String renders the key as "<host:port>/<protocol>", using net.JoinHostPort
// for the host:port part.
func (k *portMapKey) String() string {
	hostPort := net.JoinHostPort(k.ip, strconv.Itoa(k.port))
	return fmt.Sprintf("%s/%s", hostPort, k.protocol)
}
// portMapValue is the value type of Proxier.portMap.
type portMapValue struct {
// owner is the service port recorded for the entry.
owner proxy . ServicePortName
// socket is the closable handle stored with the entry; only its Close method
// is required.
socket interface {
Close ( ) error
}
}
var (
// ErrProxyOnLocalhost is returned by NewProxier and NewCustomProxier if the
// user requests a proxier on the loopback address (listenIP equal to
// localhostIPv4 or localhostIPv6). Callers may compare against it to detect
// that they provided invalid input.
ErrProxyOnLocalhost = fmt . Errorf ( "cannot proxy on localhost" )
)
// NewProxier returns a new Proxier given a LoadBalancer and an address on
// which to listen. Because of the iptables logic, it is assumed that there
// is only a single Proxier active on a machine. An error is returned if the
// proxier cannot be started due to an invalid listenIP (loopback) or if
// iptables fails to update or acquire the initial lock. Once a proxier is
// created, it keeps iptables up to date in the background and does not
// terminate if a particular iptables call fails.
//
// It is NewCustomProxier with the default socket constructor, newProxySocket.
func NewProxier(loadBalancer LoadBalancer, listenIP net.IP, iptables iptables.Interface, exec utilexec.Interface, pr utilnet.PortRange, syncPeriod, minSyncPeriod, udpIdleTimeout time.Duration, nodePortAddresses []string) (*Proxier, error) {
	return NewCustomProxier(
		loadBalancer,
		listenIP,
		iptables,
		exec,
		pr,
		syncPeriod,
		minSyncPeriod,
		udpIdleTimeout,
		nodePortAddresses,
		newProxySocket,
	)
}
// NewCustomProxier functions similarly to NewProxier, returning a new Proxier
// for the given LoadBalancer and address. The new proxier is constructed using
// the ProxySocket constructor provided, however, instead of constructing the
// default ProxySockets.
func NewCustomProxier ( loadBalancer LoadBalancer , listenIP net . IP , iptables iptables . Interface , exec utilexec . Interface , pr utilnet . PortRange , syncPeriod , minSyncPeriod , udpIdleTimeout time . Duration , nodePortAddresses [ ] string , makeProxySocket ProxySocketFunc ) ( * Proxier , error ) {
if listenIP . Equal ( localhostIPv4 ) || listenIP . Equal ( localhostIPv6 ) {
return nil , ErrProxyOnLocalhost
}
// If listenIP is given, assume that is the intended host IP. Otherwise
// try to find a suitable host IP address from network interfaces.
var err error
hostIP := listenIP
if hostIP . Equal ( net . IPv4zero ) || hostIP . Equal ( net . IPv6zero ) {
hostIP , err = utilnet . ChooseHostInterface ( )
if err != nil {
return nil , fmt . Errorf ( "failed to select a host interface: %v" , err )
}
}
err = setRLimit ( 64 * 1000 )
if err != nil {
2019-08-30 18:33:25 +00:00
if ! libcontainersystem . RunningInUserNS ( ) {
2019-03-29 00:03:05 +00:00
return nil , fmt . Errorf ( "failed to set open file handler limit to 64000: %v" , err )
}
2020-08-10 17:43:49 +00:00
klog . Errorf ( "failed to set open file handler limit to 64000: %v (running in UserNS, ignoring the error)" , err )
2019-01-12 04:58:27 +00:00
}
proxyPorts := newPortAllocator ( pr )
klog . V ( 2 ) . Infof ( "Setting proxy IP to %v and initializing iptables" , hostIP )
return createProxier ( loadBalancer , listenIP , iptables , exec , hostIP , proxyPorts , syncPeriod , minSyncPeriod , udpIdleTimeout , makeProxySocket )
}
func createProxier ( loadBalancer LoadBalancer , listenIP net . IP , iptables iptables . Interface , exec utilexec . Interface , hostIP net . IP , proxyPorts PortAllocator , syncPeriod , minSyncPeriod , udpIdleTimeout time . Duration , makeProxySocket ProxySocketFunc ) ( * Proxier , error ) {
// convenient to pass nil for tests..
if proxyPorts == nil {
proxyPorts = newPortAllocator ( utilnet . PortRange { } )
}
// Set up the iptables foundations we need.
if err := iptablesInit ( iptables ) ; err != nil {
return nil , fmt . Errorf ( "failed to initialize iptables: %v" , err )
}
// Flush old iptables rules (since the bound ports will be invalid after a restart).
// When OnUpdate() is first called, the rules will be recreated.
if err := iptablesFlush ( iptables ) ; err != nil {
return nil , fmt . Errorf ( "failed to flush iptables: %v" , err )
}
2019-08-30 18:33:25 +00:00
proxier := & Proxier {
loadBalancer : loadBalancer ,
serviceMap : make ( map [ proxy . ServicePortName ] * ServiceInfo ) ,
serviceChanges : make ( map [ types . NamespacedName ] * serviceChange ) ,
portMap : make ( map [ portMapKey ] * portMapValue ) ,
syncPeriod : syncPeriod ,
2019-01-12 04:58:27 +00:00
minSyncPeriod : minSyncPeriod ,
udpIdleTimeout : udpIdleTimeout ,
listenIP : listenIP ,
iptables : iptables ,
hostIP : hostIP ,
proxyPorts : proxyPorts ,
makeProxySocket : makeProxySocket ,
exec : exec ,
2019-08-30 18:33:25 +00:00
stopChan : make ( chan struct { } ) ,
}
klog . V ( 3 ) . Infof ( "minSyncPeriod: %v, syncPeriod: %v, burstSyncs: %d" , minSyncPeriod , syncPeriod , numBurstSyncs )
proxier . syncRunner = async . NewBoundedFrequencyRunner ( "userspace-proxy-sync-runner" , proxier . syncProxyRules , minSyncPeriod , syncPeriod , numBurstSyncs )
return proxier , nil
2019-01-12 04:58:27 +00:00
}
// CleanupLeftovers removes all iptables rules and chains created by the Proxier.
// It returns true if an error was encountered. Errors are logged; "not found"
// errors are ignored because they mean there was nothing to clean up.
func CleanupLeftovers(ipt iptables.Interface) (encounteredError bool) {
	// NOTE: Warning, this needs to be kept in sync with the userspace Proxier,
	// we want to ensure we remove all of the iptables rules it creates.
	// Currently they are all in iptablesInit()
	clusterIPArgs := []string{"-m", "comment", "--comment", "handle ClusterIPs; NOTE: this must be before the NodePort rules"}
	nodePortArgs := []string{
		"-m", "addrtype", "--dst-type", "LOCAL",
		"-m", "comment", "--comment", "handle service NodePorts; NOTE: this must be the last rule in the chain",
	}
	nonLocalArgs := []string{"-m", "comment", "--comment", "Ensure that non-local NodePort traffic can flow"}

	// Delete Rules first, then Flush and Delete Chains
	jumpRules := []struct {
		table  iptables.Table
		chain  iptables.Chain
		match  []string
		target iptables.Chain
	}{
		{iptables.TableNAT, iptables.ChainOutput, clusterIPArgs, iptablesHostPortalChain},
		{iptables.TableNAT, iptables.ChainPrerouting, clusterIPArgs, iptablesContainerPortalChain},
		{iptables.TableNAT, iptables.ChainOutput, nodePortArgs, iptablesHostNodePortChain},
		{iptables.TableNAT, iptables.ChainPrerouting, nodePortArgs, iptablesContainerNodePortChain},
		{iptables.TableFilter, iptables.ChainInput, nonLocalArgs, iptablesNonLocalNodePortChain},
	}
	for _, rule := range jumpRules {
		// Copy the match arguments so the shared slices are never appended to in place.
		ruleArgs := make([]string, 0, len(rule.match)+2)
		ruleArgs = append(ruleArgs, rule.match...)
		ruleArgs = append(ruleArgs, "-j", string(rule.target))

		err := ipt.DeleteRule(rule.table, rule.chain, ruleArgs...)
		if err != nil && !iptables.IsNotFoundError(err) {
			klog.Errorf("Error removing userspace rule: %v", err)
			encounteredError = true
		}
	}

	// flush and delete chains.
	chainsByTable := map[iptables.Table][]iptables.Chain{
		iptables.TableNAT:    {iptablesContainerPortalChain, iptablesHostPortalChain, iptablesHostNodePortChain, iptablesContainerNodePortChain},
		iptables.TableFilter: {iptablesNonLocalNodePortChain},
	}
	for table, chains := range chainsByTable {
		for _, chain := range chains {
			// flush chain, then if successful delete, delete will fail if flush fails.
			if err := ipt.FlushChain(table, chain); err != nil {
				if !iptables.IsNotFoundError(err) {
					klog.Errorf("Error flushing userspace chain: %v", err)
					encounteredError = true
				}
				continue
			}
			if err := ipt.DeleteChain(table, chain); err != nil && !iptables.IsNotFoundError(err) {
				klog.Errorf("Error deleting userspace chain: %v", err)
				encounteredError = true
			}
		}
	}
	return encounteredError
}
2019-08-30 18:33:25 +00:00
// shutdown stops the proxy of every service in serviceMap, runs the sticky
// session cleanup, and closes stopChan so that the sync loop returns. It holds
// proxier.mu for the whole operation. Used from testcases.
func (proxier *Proxier) shutdown() {
	proxier.mu.Lock()
	defer proxier.mu.Unlock()

	// stopProxy removes the entry from serviceMap; deleting while ranging is
	// permitted in Go. Its error is intentionally not inspected here.
	for name, svcInfo := range proxier.serviceMap {
		proxier.stopProxy(name, svcInfo)
	}
	proxier.cleanupStaleStickySessions()
	close(proxier.stopChan)
}
// isInitialized reports whether the initialized flag has been set, which
// happens once both services and endpoints have been synced.
func (proxier *Proxier) isInitialized() bool {
	flag := atomic.LoadInt32(&proxier.initialized)
	return flag > 0
}
// Sync is called to synchronize the proxier state to iptables as soon as possible.
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) Sync ( ) {
2019-08-30 18:33:25 +00:00
proxier . syncRunner . Run ( )
}
func ( proxier * Proxier ) syncProxyRules ( ) {
start := time . Now ( )
defer func ( ) {
2020-08-10 17:43:49 +00:00
klog . V ( 4 ) . Infof ( "userspace syncProxyRules took %v" , time . Since ( start ) )
2019-08-30 18:33:25 +00:00
} ( )
// don't sync rules till we've received services and endpoints
if ! proxier . isInitialized ( ) {
klog . V ( 2 ) . Info ( "Not syncing userspace proxy until Services and Endpoints have been received from master" )
return
}
2019-01-12 04:58:27 +00:00
if err := iptablesInit ( proxier . iptables ) ; err != nil {
klog . Errorf ( "Failed to ensure iptables: %v" , err )
}
2019-08-30 18:33:25 +00:00
proxier . serviceChangesLock . Lock ( )
changes := proxier . serviceChanges
proxier . serviceChanges = make ( map [ types . NamespacedName ] * serviceChange )
proxier . serviceChangesLock . Unlock ( )
proxier . mu . Lock ( )
defer proxier . mu . Unlock ( )
2020-08-10 17:43:49 +00:00
klog . V ( 4 ) . Infof ( "userspace proxy: processing %d service events" , len ( changes ) )
2019-08-30 18:33:25 +00:00
for _ , change := range changes {
existingPorts := proxier . mergeService ( change . current )
proxier . unmergeService ( change . previous , existingPorts )
}
2020-03-26 21:07:15 +00:00
localAddrs , err := utilproxy . GetLocalAddrs ( )
if err != nil {
klog . Errorf ( "Failed to get local addresses during proxy sync: %s, assuming IPs are not local" , err )
} else if len ( localAddrs ) == 0 {
klog . Warning ( "No local addresses were found, assuming all external IPs are not local" )
}
localAddrSet := netutils . IPSet { }
localAddrSet . Insert ( localAddrs ... )
proxier . localAddrs = localAddrSet
2019-01-12 04:58:27 +00:00
proxier . ensurePortals ( )
proxier . cleanupStaleStickySessions ( )
}
// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
func ( proxier * Proxier ) SyncLoop ( ) {
2019-08-30 18:33:25 +00:00
proxier . syncRunner . Loop ( proxier . stopChan )
2019-01-12 04:58:27 +00:00
}
// ensurePortals makes sure portals exist for every service in serviceMap.
// Failures are logged per service and do not stop the remaining services
// from being processed.
func (proxier *Proxier) ensurePortals() {
	// NB: This does not remove rules that should not be present.
	for svcName, svcInfo := range proxier.serviceMap {
		if err := proxier.openPortal(svcName, svcInfo); err != nil {
			klog.Errorf("Failed to ensure portal for %q: %v", svcName, err)
		}
	}
}
// cleanupStaleStickySessions asks the load balancer to clean up stale sticky
// session records for every service currently in serviceMap.
func (proxier *Proxier) cleanupStaleStickySessions() {
	for svcName := range proxier.serviceMap {
		proxier.loadBalancer.CleanupStaleStickySessions(svcName)
	}
}
// stopProxy removes service from serviceMap, marks info as no longer alive,
// closes its socket and releases the socket's listen port back to the port
// allocator. The error from closing the socket is returned; the port is
// released regardless.
func (proxier *Proxier) stopProxy(service proxy.ServicePortName, info *ServiceInfo) error {
	delete(proxier.serviceMap, service)
	info.setAlive(false)

	closeErr := info.socket.Close()
	proxier.proxyPorts.Release(info.socket.ListenPort())
	return closeErr
}
// getServiceInfo looks up service in serviceMap while holding proxier.mu and
// returns the entry together with a flag telling whether it was present.
func (proxier *Proxier) getServiceInfo(service proxy.ServicePortName) (*ServiceInfo, bool) {
	proxier.mu.Lock()
	svcInfo, found := proxier.serviceMap[service]
	proxier.mu.Unlock()
	return svcInfo, found
}
2019-08-30 18:33:25 +00:00
// addServiceOnPortInternal starts listening for a new service, returning the ServiceInfo.
2019-01-12 04:58:27 +00:00
// Pass proxyPort=0 to allocate a random port. The timeout only applies to UDP
// connections, for now.
2019-08-30 18:33:25 +00:00
func ( proxier * Proxier ) addServiceOnPortInternal ( service proxy . ServicePortName , protocol v1 . Protocol , proxyPort int , timeout time . Duration ) ( * ServiceInfo , error ) {
2019-01-12 04:58:27 +00:00
sock , err := proxier . makeProxySocket ( protocol , proxier . listenIP , proxyPort )
if err != nil {
return nil , err
}
_ , portStr , err := net . SplitHostPort ( sock . Addr ( ) . String ( ) )
if err != nil {
sock . Close ( )
return nil , err
}
portNum , err := strconv . Atoi ( portStr )
if err != nil {
sock . Close ( )
return nil , err
}
si := & ServiceInfo {
Timeout : timeout ,
ActiveClients : newClientCache ( ) ,
isAliveAtomic : 1 ,
proxyPort : portNum ,
protocol : protocol ,
socket : sock ,
sessionAffinityType : v1 . ServiceAffinityNone , // default
}
2019-08-30 18:33:25 +00:00
proxier . serviceMap [ service ] = si
2019-01-12 04:58:27 +00:00
klog . V ( 2 ) . Infof ( "Proxying for service %q on %s port %d" , service , protocol , portNum )
2020-12-01 01:06:26 +00:00
go func ( ) {
2019-01-12 04:58:27 +00:00
defer runtime . HandleCrash ( )
sock . ProxyLoop ( service , si , proxier . loadBalancer )
2020-12-01 01:06:26 +00:00
} ( )
2019-01-12 04:58:27 +00:00
return si , nil
}
2019-08-30 18:33:25 +00:00
// cleanupPortalAndProxy closes the portal of serviceName and then stops its
// proxy.
//
// The proxy is stopped even when closing the portal fails: callers do not
// retry, so returning early would leave the listening socket and its
// allocated proxy port in use, and the stale entry in serviceMap. If either
// step fails the corresponding error is returned; if both fail, a single
// error describing both is returned.
func (proxier *Proxier) cleanupPortalAndProxy(serviceName proxy.ServicePortName, info *ServiceInfo) error {
	portalErr := proxier.closePortal(serviceName, info)
	proxyErr := proxier.stopProxy(serviceName, info)

	switch {
	case portalErr != nil && proxyErr != nil:
		return fmt.Errorf("Failed to close portal for %q: %v; failed to stop service %q: %v", serviceName, portalErr, serviceName, proxyErr)
	case portalErr != nil:
		return fmt.Errorf("Failed to close portal for %q: %v", serviceName, portalErr)
	case proxyErr != nil:
		return fmt.Errorf("Failed to stop service %q: %v", serviceName, proxyErr)
	}
	return nil
}
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) mergeService ( service * v1 . Service ) sets . String {
if service == nil {
return nil
}
2020-08-10 17:43:49 +00:00
if utilproxy . ShouldSkipService ( service ) {
2019-01-12 04:58:27 +00:00
return nil
}
existingPorts := sets . NewString ( )
2020-08-10 17:43:49 +00:00
svcName := types . NamespacedName { Namespace : service . Namespace , Name : service . Name }
2019-01-12 04:58:27 +00:00
for i := range service . Spec . Ports {
servicePort := & service . Spec . Ports [ i ]
serviceName := proxy . ServicePortName { NamespacedName : svcName , Port : servicePort . Name }
existingPorts . Insert ( servicePort . Name )
2019-08-30 18:33:25 +00:00
info , exists := proxier . serviceMap [ serviceName ]
2019-01-12 04:58:27 +00:00
// TODO: check health of the socket? What if ProxyLoop exited?
if exists && sameConfig ( info , service , servicePort ) {
// Nothing changed.
continue
}
if exists {
klog . V ( 4 ) . Infof ( "Something changed for service %q: stopping it" , serviceName )
2019-08-30 18:33:25 +00:00
if err := proxier . cleanupPortalAndProxy ( serviceName , info ) ; err != nil {
klog . Error ( err )
2019-01-12 04:58:27 +00:00
}
2020-12-01 01:06:26 +00:00
info . setFinished ( )
2019-01-12 04:58:27 +00:00
}
proxyPort , err := proxier . proxyPorts . AllocateNext ( )
if err != nil {
klog . Errorf ( "failed to allocate proxy port for service %q: %v" , serviceName , err )
continue
}
serviceIP := net . ParseIP ( service . Spec . ClusterIP )
klog . V ( 1 ) . Infof ( "Adding new service %q at %s/%s" , serviceName , net . JoinHostPort ( serviceIP . String ( ) , strconv . Itoa ( int ( servicePort . Port ) ) ) , servicePort . Protocol )
2019-08-30 18:33:25 +00:00
info , err = proxier . addServiceOnPortInternal ( serviceName , servicePort . Protocol , proxyPort , proxier . udpIdleTimeout )
2019-01-12 04:58:27 +00:00
if err != nil {
klog . Errorf ( "Failed to start proxy for %q: %v" , serviceName , err )
continue
}
info . portal . ip = serviceIP
info . portal . port = int ( servicePort . Port )
info . externalIPs = service . Spec . ExternalIPs
// Deep-copy in case the service instance changes
info . loadBalancerStatus = * service . Status . LoadBalancer . DeepCopy ( )
info . nodePort = int ( servicePort . NodePort )
info . sessionAffinityType = service . Spec . SessionAffinity
// Kube-apiserver side guarantees SessionAffinityConfig won't be nil when session affinity type is ClientIP
if service . Spec . SessionAffinity == v1 . ServiceAffinityClientIP {
info . stickyMaxAgeSeconds = int ( * service . Spec . SessionAffinityConfig . ClientIP . TimeoutSeconds )
}
klog . V ( 4 ) . Infof ( "info: %#v" , info )
if err := proxier . openPortal ( serviceName , info ) ; err != nil {
klog . Errorf ( "Failed to open portal for %q: %v" , serviceName , err )
}
proxier . loadBalancer . NewService ( serviceName , info . sessionAffinityType , info . stickyMaxAgeSeconds )
2020-12-01 01:06:26 +00:00
info . setStarted ( )
2019-01-12 04:58:27 +00:00
}
return existingPorts
}
func ( proxier * Proxier ) unmergeService ( service * v1 . Service , existingPorts sets . String ) {
if service == nil {
return
}
2020-08-10 17:43:49 +00:00
if utilproxy . ShouldSkipService ( service ) {
2019-01-12 04:58:27 +00:00
return
}
staleUDPServices := sets . NewString ( )
2020-08-10 17:43:49 +00:00
svcName := types . NamespacedName { Namespace : service . Namespace , Name : service . Name }
2019-01-12 04:58:27 +00:00
for i := range service . Spec . Ports {
servicePort := & service . Spec . Ports [ i ]
if existingPorts . Has ( servicePort . Name ) {
continue
}
serviceName := proxy . ServicePortName { NamespacedName : svcName , Port : servicePort . Name }
klog . V ( 1 ) . Infof ( "Stopping service %q" , serviceName )
info , exists := proxier . serviceMap [ serviceName ]
if ! exists {
klog . Errorf ( "Service %q is being removed but doesn't exist" , serviceName )
continue
}
if proxier . serviceMap [ serviceName ] . protocol == v1 . ProtocolUDP {
staleUDPServices . Insert ( proxier . serviceMap [ serviceName ] . portal . ip . String ( ) )
}
2019-08-30 18:33:25 +00:00
if err := proxier . cleanupPortalAndProxy ( serviceName , info ) ; err != nil {
klog . Error ( err )
2019-01-12 04:58:27 +00:00
}
proxier . loadBalancer . DeleteService ( serviceName )
2020-12-01 01:06:26 +00:00
info . setFinished ( )
2019-01-12 04:58:27 +00:00
}
for _ , svcIP := range staleUDPServices . UnsortedList ( ) {
if err := conntrack . ClearEntriesForIP ( proxier . exec , svcIP , v1 . ProtocolUDP ) ; err != nil {
klog . Errorf ( "Failed to delete stale service IP %s connections, error: %v" , svcIP , err )
}
}
}
2019-08-30 18:33:25 +00:00
// serviceChange queues a service event for the next sync. Events for the same
// service are collapsed into one serviceChange: previous is taken from the
// first queued event and current from the latest. If the collapsed change has
// no effect it is dropped; otherwise, once the proxier is initialized, a sync
// is requested. detail is only used for logging. At least one of previous and
// current must be non-nil.
func (proxier *Proxier) serviceChange(previous, current *v1.Service, detail string) {
	// Name the service after whichever object is available, preferring current.
	ref := current
	if ref == nil {
		ref = previous
	}
	svcName := types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}
	klog.V(4).Infof("userspace proxy: %s for %s", detail, svcName)

	proxier.serviceChangesLock.Lock()
	defer proxier.serviceChangesLock.Unlock()

	queued, tracked := proxier.serviceChanges[svcName]
	if !tracked {
		// change.previous is only set for new changes. We must keep
		// the oldest service info (or nil) because correct unmerging
		// depends on the next update/del after a merge, not subsequent
		// updates.
		queued = &serviceChange{previous: previous}
		proxier.serviceChanges[svcName] = queued
	}

	// Always use the most current service (or nil) as change.current
	queued.current = current

	if reflect.DeepEqual(queued.previous, queued.current) {
		// collapsed change had no effect
		delete(proxier.serviceChanges, svcName)
		return
	}
	if proxier.isInitialized() {
		// change will have an effect, ask the proxy to sync
		proxier.syncRunner.Run()
	}
}
2020-08-10 17:43:49 +00:00
// OnServiceAdd is called whenever creation of new service object
// is observed.
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) OnServiceAdd ( service * v1 . Service ) {
2019-08-30 18:33:25 +00:00
proxier . serviceChange ( nil , service , "OnServiceAdd" )
2019-01-12 04:58:27 +00:00
}
2020-08-10 17:43:49 +00:00
// OnServiceUpdate is called whenever modification of an existing
// service object is observed.
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) OnServiceUpdate ( oldService , service * v1 . Service ) {
2019-08-30 18:33:25 +00:00
proxier . serviceChange ( oldService , service , "OnServiceUpdate" )
2019-01-12 04:58:27 +00:00
}
2020-08-10 17:43:49 +00:00
// OnServiceDelete is called whenever deletion of an existing service
// object is observed.
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) OnServiceDelete ( service * v1 . Service ) {
2019-08-30 18:33:25 +00:00
proxier . serviceChange ( service , nil , "OnServiceDelete" )
2019-01-12 04:58:27 +00:00
}
2020-08-10 17:43:49 +00:00
// OnServiceSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
2019-01-12 04:58:27 +00:00
func ( proxier * Proxier ) OnServiceSynced ( ) {
2019-08-30 18:33:25 +00:00
klog . V ( 2 ) . Infof ( "userspace OnServiceSynced" )
// Mark services as initialized and (if endpoints are already
// initialized) the entire proxy as initialized
atomic . StoreInt32 ( & proxier . servicesSynced , 1 )
if atomic . LoadInt32 ( & proxier . endpointsSynced ) > 0 {
atomic . StoreInt32 ( & proxier . initialized , 1 )
}
// Must sync from a goroutine to avoid blocking the
// service event handler on startup with large numbers
// of initial objects
go proxier . syncProxyRules ( )
}
2020-08-10 17:43:49 +00:00
// OnEndpointsAdd is called whenever creation of new endpoints object
// is observed.
2019-08-30 18:33:25 +00:00
func ( proxier * Proxier ) OnEndpointsAdd ( endpoints * v1 . Endpoints ) {
proxier . loadBalancer . OnEndpointsAdd ( endpoints )
}
2020-08-10 17:43:49 +00:00
// OnEndpointsUpdate is called whenever modification of an existing
// endpoints object is observed.
2019-08-30 18:33:25 +00:00
func ( proxier * Proxier ) OnEndpointsUpdate ( oldEndpoints , endpoints * v1 . Endpoints ) {
proxier . loadBalancer . OnEndpointsUpdate ( oldEndpoints , endpoints )
}
2020-08-10 17:43:49 +00:00
// OnEndpointsDelete is called whenever deletion of an existing endpoints
// object is observed.
2019-08-30 18:33:25 +00:00
func ( proxier * Proxier ) OnEndpointsDelete ( endpoints * v1 . Endpoints ) {
proxier . loadBalancer . OnEndpointsDelete ( endpoints )
}
2020-08-10 17:43:49 +00:00
// OnEndpointsSynced is called once all the initial event handlers were
// called and the state is fully propagated to local cache.
2019-08-30 18:33:25 +00:00
func ( proxier * Proxier ) OnEndpointsSynced ( ) {
klog . V ( 2 ) . Infof ( "userspace OnEndpointsSynced" )
proxier . loadBalancer . OnEndpointsSynced ( )
// Mark endpoints as initialized and (if services are already
// initialized) the entire proxy as initialized
atomic . StoreInt32 ( & proxier . endpointsSynced , 1 )
if atomic . LoadInt32 ( & proxier . servicesSynced ) > 0 {
atomic . StoreInt32 ( & proxier . initialized , 1 )
}
// Must sync from a goroutine to avoid blocking the
// service event handler on startup with large numbers
// of initial objects
go proxier . syncProxyRules ( )
2019-01-12 04:58:27 +00:00
}
func sameConfig ( info * ServiceInfo , service * v1 . Service , port * v1 . ServicePort ) bool {
if info . protocol != port . Protocol || info . portal . port != int ( port . Port ) || info . nodePort != int ( port . NodePort ) {
return false
}
if ! info . portal . ip . Equal ( net . ParseIP ( service . Spec . ClusterIP ) ) {
return false
}
if ! ipsEqual ( info . externalIPs , service . Spec . ExternalIPs ) {
return false
}
2019-12-12 01:27:03 +00:00
if ! servicehelper . LoadBalancerStatusEqual ( & info . loadBalancerStatus , & service . Status . LoadBalancer ) {
2019-01-12 04:58:27 +00:00
return false
}
if info . sessionAffinityType != service . Spec . SessionAffinity {
return false
}
return true
}
// ipsEqual reports whether lhs and rhs contain the same strings in the same
// order. The comparison is purely textual; a nil slice equals an empty one.
func ipsEqual(lhs, rhs []string) bool {
	if len(lhs) != len(rhs) {
		return false
	}
	for idx, left := range lhs {
		if left != rhs[idx] {
			return false
		}
	}
	return true
}
// openPortal opens every portal belonging to a service: the cluster-IP
// portal, one portal per external IP (marked external), one per
// load-balancer ingress entry that has an IP, and finally the node port if
// one is set. It stops at and returns the first error encountered.
func (proxier *Proxier) openPortal(service proxy.ServicePortName, info *ServiceInfo) error {
	// All portals of a service share protocol, proxy IP and proxy port.
	openOne := func(p portal) error {
		return proxier.openOnePortal(p, info.protocol, proxier.listenIP, info.proxyPort, service)
	}

	if err := openOne(info.portal); err != nil {
		return err
	}

	for _, publicIP := range info.externalIPs {
		external := portal{ip: net.ParseIP(publicIP), port: info.portal.port, isExternal: true}
		if err := openOne(external); err != nil {
			return err
		}
	}

	for _, ingress := range info.loadBalancerStatus.Ingress {
		if ingress.IP == "" {
			continue
		}
		lbPortal := portal{ip: net.ParseIP(ingress.IP), port: info.portal.port, isExternal: false}
		if err := openOne(lbPortal); err != nil {
			return err
		}
	}

	if info.nodePort == 0 {
		return nil
	}
	if err := proxier.openNodePort(info.nodePort, info.protocol, proxier.listenIP, info.proxyPort, service); err != nil {
		return err
	}
	return nil
}
func ( proxier * Proxier ) openOnePortal ( portal portal , protocol v1 . Protocol , proxyIP net . IP , proxyPort int , name proxy . ServicePortName ) error {
2020-03-26 21:07:15 +00:00
if proxier . localAddrs . Len ( ) > 0 && proxier . localAddrs . Has ( portal . ip ) {
2019-01-12 04:58:27 +00:00
err := proxier . claimNodePort ( portal . ip , portal . port , protocol , name )
if err != nil {
return err
}
}
// Handle traffic from containers.
args := proxier . iptablesContainerPortalArgs ( portal . ip , portal . isExternal , false , portal . port , protocol , proxyIP , proxyPort , name )
portalAddress := net . JoinHostPort ( portal . ip . String ( ) , strconv . Itoa ( portal . port ) )
existed , err := proxier . iptables . EnsureRule ( iptables . Append , iptables . TableNAT , iptablesContainerPortalChain , args ... )
if err != nil {
klog . Errorf ( "Failed to install iptables %s rule for service %q, args:%v" , iptablesContainerPortalChain , name , args )
return err
}
if ! existed {
klog . V ( 3 ) . Infof ( "Opened iptables from-containers portal for service %q on %s %s" , name , protocol , portalAddress )
}
if portal . isExternal {
args := proxier . iptablesContainerPortalArgs ( portal . ip , false , true , portal . port , protocol , proxyIP , proxyPort , name )
existed , err := proxier . iptables . EnsureRule ( iptables . Append , iptables . TableNAT , iptablesContainerPortalChain , args ... )
if err != nil {
klog . Errorf ( "Failed to install iptables %s rule that opens service %q for local traffic, args:%v" , iptablesContainerPortalChain , name , args )
return err
}
if ! existed {
klog . V ( 3 ) . Infof ( "Opened iptables from-containers portal for service %q on %s %s for local traffic" , name , protocol , portalAddress )
}
args = proxier . iptablesHostPortalArgs ( portal . ip , true , portal . port , protocol , proxyIP , proxyPort , name )
existed , err = proxier . iptables . EnsureRule ( iptables . Append , iptables . TableNAT , iptablesHostPortalChain , args ... )
if err != nil {
klog . Errorf ( "Failed to install iptables %s rule for service %q for dst-local traffic" , iptablesHostPortalChain , name )
return err
}
if ! existed {
klog . V ( 3 ) . Infof ( "Opened iptables from-host portal for service %q on %s %s for dst-local traffic" , name , protocol , portalAddress )
}
return nil
}
// Handle traffic from the host.
args = proxier . iptablesHostPortalArgs ( portal . ip , false , portal . port , protocol , proxyIP , proxyPort , name )
existed , err = proxier . iptables . EnsureRule ( iptables . Append , iptables . TableNAT , iptablesHostPortalChain , args ... )
if err != nil {
klog . Errorf ( "Failed to install iptables %s rule for service %q" , iptablesHostPortalChain , name )
return err
}
if ! existed {
klog . V ( 3 ) . Infof ( "Opened iptables from-host portal for service %q on %s %s" , name , protocol , portalAddress )
}
return nil
}
// claimNodePort records owner as the holder of the (ip, port, protocol)
// triple in proxier.portMap, opening a proxy socket on it to keep the port
// occupied. It is idempotent: claiming again with the same owner succeeds
// without doing anything. A claim by a different owner, or a failure to open
// the socket, is returned as an error.
func (proxier *Proxier) claimNodePort(ip net.IP, port int, protocol v1.Protocol, owner proxy.ServicePortName) error {
	proxier.portMapMutex.Lock()
	defer proxier.portMapMutex.Unlock()

	// TODO: We could pre-populate some reserved ports into portMap and/or blacklist some well-known ports

	key := portMapKey{ip: ip.String(), port: port, protocol: protocol}
	if current, claimed := proxier.portMap[key]; claimed {
		if current.owner == owner {
			// Same owner asking again: nothing to do.
			return nil
		}
		return fmt.Errorf("Port conflict detected on port %s. %v vs %v", key.String(), owner, current)
	}

	// Hold the actual port open, even though iptables does the redirecting.
	// This ensures that a) the port is safe to take and b) it stays that way.
	// NOTE: A real listen()ing socket should not be necessary - bind() ought
	// to be enough - but there seems to be no way to e2e test without it.
	// Tools like 'ss' and 'netstat' do not show sockets that are bind()ed but
	// not listen()ed, and at least the default debian netcat has no way to
	// avoid about 10 seconds of retries.
	socket, err := proxier.makeProxySocket(protocol, ip, port)
	if err != nil {
		return fmt.Errorf("can't open node port for %s: %v", key.String(), err)
	}
	proxier.portMap[key] = &portMapValue{owner: owner, socket: socket}
	klog.V(2).Infof("Claimed local port %s", key.String())
	return nil
}
// releaseNodePort drops owner's claim on the (ip, port, protocol) triple:
// the entry is removed from proxier.portMap and its socket is closed.
// Releasing a port that nobody has claimed is tolerated (it is logged and
// nil is returned), which keeps cleanup after a failed allocation simple.
// An error is returned when the port is claimed by a different owner.
func (proxier *Proxier) releaseNodePort(ip net.IP, port int, protocol v1.Protocol, owner proxy.ServicePortName) error {
	proxier.portMapMutex.Lock()
	defer proxier.portMapMutex.Unlock()

	key := portMapKey{ip: ip.String(), port: port, protocol: protocol}
	held, claimed := proxier.portMap[key]
	switch {
	case !claimed:
		// We tolerate this, it happens if we are cleaning up a failed allocation
		klog.Infof("Ignoring release on unowned port: %v", key)
		return nil
	case held.owner != owner:
		return fmt.Errorf("Port conflict detected on port %v (unowned unlock). %v vs %v", key, owner, held)
	}

	delete(proxier.portMap, key)
	held.socket.Close()
	return nil
}
// openNodePort claims nodePort on all addresses (nil IP) for the service
// called name and then ensures three iptables rules for it, in this order:
// a from-container rule and a from-host rule in the NAT table, and a rule in
// the filter table's non-local NodePort chain. It returns the first error
// encountered.
func (proxier *Proxier) openNodePort(nodePort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, name proxy.ServicePortName) error {
	// TODO: Do we want to allow containers to access public services? Probably yes.
	// TODO: We could refactor this to be the same code as portal, but with IP == nil
	if err := proxier.claimNodePort(nil, nodePort, protocol, name); err != nil {
		return err
	}

	// Handle traffic from containers.
	containerArgs := proxier.iptablesContainerPortalArgs(nil, false, false, nodePort, protocol, proxyIP, proxyPort, name)
	if existed, err := proxier.iptables.EnsureRule(iptables.Append, iptables.TableNAT, iptablesContainerNodePortChain, containerArgs...); err != nil {
		klog.Errorf("Failed to install iptables %s rule for service %q", iptablesContainerNodePortChain, name)
		return err
	} else if !existed {
		klog.Infof("Opened iptables from-containers public port for service %q on %s port %d", name, protocol, nodePort)
	}

	// Handle traffic from the host.
	hostArgs := proxier.iptablesHostNodePortArgs(nodePort, protocol, proxyIP, proxyPort, name)
	if existed, err := proxier.iptables.EnsureRule(iptables.Append, iptables.TableNAT, iptablesHostNodePortChain, hostArgs...); err != nil {
		klog.Errorf("Failed to install iptables %s rule for service %q", iptablesHostNodePortChain, name)
		return err
	} else if !existed {
		klog.Infof("Opened iptables from-host public port for service %q on %s port %d", name, protocol, nodePort)
	}

	// Handle traffic that is not local to the host (filter table).
	nonLocalArgs := proxier.iptablesNonLocalNodePortArgs(nodePort, protocol, proxyIP, proxyPort, name)
	if existed, err := proxier.iptables.EnsureRule(iptables.Append, iptables.TableFilter, iptablesNonLocalNodePortChain, nonLocalArgs...); err != nil {
		klog.Errorf("Failed to install iptables %s rule for service %q", iptablesNonLocalNodePortChain, name)
		return err
	} else if !existed {
		klog.Infof("Opened iptables from-non-local public port for service %q on %s port %d", name, protocol, nodePort)
	}

	return nil
}
// closePortal removes every portal that openPortal would have installed for
// the service described by info: the primary portal, one external portal per
// entry in info.externalIPs, one per load-balancer ingress that carries an
// IP, and the node port when one is set. Unlike openPortal it does not stop
// at the first failure; all errors are gathered and returned as one
// aggregate.
func (proxier *Proxier) closePortal(service proxy.ServicePortName, info *ServiceInfo) error {
	// Collect errors and report them all at the end.
	errs := proxier.closeOnePortal(info.portal, info.protocol, proxier.listenIP, info.proxyPort, service)

	for _, publicIP := range info.externalIPs {
		externalPortal := portal{ip: net.ParseIP(publicIP), port: info.portal.port, isExternal: true}
		errs = append(errs, proxier.closeOnePortal(externalPortal, info.protocol, proxier.listenIP, info.proxyPort, service)...)
	}

	for _, ingress := range info.loadBalancerStatus.Ingress {
		if ingress.IP == "" {
			continue
		}
		ingressPortal := portal{ip: net.ParseIP(ingress.IP), port: info.portal.port, isExternal: false}
		errs = append(errs, proxier.closeOnePortal(ingressPortal, info.protocol, proxier.listenIP, info.proxyPort, service)...)
	}

	if info.nodePort != 0 {
		errs = append(errs, proxier.closeNodePort(info.nodePort, info.protocol, proxier.listenIP, info.proxyPort, service)...)
	}

	if len(errs) > 0 {
		klog.Errorf("Some errors closing iptables portals for service %q", service)
	} else {
		klog.V(3).Infof("Closed iptables portals for service %q", service)
	}
	return utilerrors.NewAggregate(errs)
}
func ( proxier * Proxier ) closeOnePortal ( portal portal , protocol v1 . Protocol , proxyIP net . IP , proxyPort int , name proxy . ServicePortName ) [ ] error {
el := [ ] error { }
2020-03-26 21:07:15 +00:00
if proxier . localAddrs . Len ( ) > 0 && proxier . localAddrs . Has ( portal . ip ) {
2019-01-12 04:58:27 +00:00
if err := proxier . releaseNodePort ( portal . ip , portal . port , protocol , name ) ; err != nil {
el = append ( el , err )
}
}
// Handle traffic from containers.
args := proxier . iptablesContainerPortalArgs ( portal . ip , portal . isExternal , false , portal . port , protocol , proxyIP , proxyPort , name )
if err := proxier . iptables . DeleteRule ( iptables . TableNAT , iptablesContainerPortalChain , args ... ) ; err != nil {
klog . Errorf ( "Failed to delete iptables %s rule for service %q" , iptablesContainerPortalChain , name )
el = append ( el , err )
}
if portal . isExternal {
args := proxier . iptablesContainerPortalArgs ( portal . ip , false , true , portal . port , protocol , proxyIP , proxyPort , name )
if err := proxier . iptables . DeleteRule ( iptables . TableNAT , iptablesContainerPortalChain , args ... ) ; err != nil {
klog . Errorf ( "Failed to delete iptables %s rule for service %q" , iptablesContainerPortalChain , name )
el = append ( el , err )
}
args = proxier . iptablesHostPortalArgs ( portal . ip , true , portal . port , protocol , proxyIP , proxyPort , name )
if err := proxier . iptables . DeleteRule ( iptables . TableNAT , iptablesHostPortalChain , args ... ) ; err != nil {
klog . Errorf ( "Failed to delete iptables %s rule for service %q" , iptablesHostPortalChain , name )
el = append ( el , err )
}
return el
}
// Handle traffic from the host (portalIP is not external).
args = proxier . iptablesHostPortalArgs ( portal . ip , false , portal . port , protocol , proxyIP , proxyPort , name )
if err := proxier . iptables . DeleteRule ( iptables . TableNAT , iptablesHostPortalChain , args ... ) ; err != nil {
klog . Errorf ( "Failed to delete iptables %s rule for service %q" , iptablesHostPortalChain , name )
el = append ( el , err )
}
return el
}
// closeNodePort deletes the three iptables rules that openNodePort ensures
// for nodePort (from-container, from-host and non-local) and then releases
// the claim on the port. It keeps going after a failure and returns every
// error encountered; the returned slice is empty (never nil) on success.
func (proxier *Proxier) closeNodePort(nodePort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, name proxy.ServicePortName) []error {
	el := []error{}

	// noteFailure logs and records a rule-deletion error for the given chain.
	noteFailure := func(chain iptables.Chain, err error) {
		if err == nil {
			return
		}
		klog.Errorf("Failed to delete iptables %s rule for service %q", chain, name)
		el = append(el, err)
	}

	// Handle traffic from containers.
	args := proxier.iptablesContainerPortalArgs(nil, false, false, nodePort, protocol, proxyIP, proxyPort, name)
	noteFailure(iptablesContainerNodePortChain, proxier.iptables.DeleteRule(iptables.TableNAT, iptablesContainerNodePortChain, args...))

	// Handle traffic from the host.
	args = proxier.iptablesHostNodePortArgs(nodePort, protocol, proxyIP, proxyPort, name)
	noteFailure(iptablesHostNodePortChain, proxier.iptables.DeleteRule(iptables.TableNAT, iptablesHostNodePortChain, args...))

	// Handle traffic not local to the host
	args = proxier.iptablesNonLocalNodePortArgs(nodePort, protocol, proxyIP, proxyPort, name)
	noteFailure(iptablesNonLocalNodePortChain, proxier.iptables.DeleteRule(iptables.TableFilter, iptablesNonLocalNodePortChain, args...))

	// A failed release is recorded but, as before, not logged here.
	if err := proxier.releaseNodePort(nil, nodePort, protocol, name); err != nil {
		el = append(el, err)
	}

	return el
}
// Names of the iptables chains managed by this proxier.
var (
	// Portal chains. See comments in the *PortalArgs() functions for some
	// details about why we use two chains for portals.
	iptablesContainerPortalChain iptables.Chain = "KUBE-PORTALS-CONTAINER"
	iptablesHostPortalChain      iptables.Chain = "KUBE-PORTALS-HOST"

	// Chains for NodePort services
	iptablesContainerNodePortChain iptables.Chain = "KUBE-NODEPORT-CONTAINER"
	iptablesHostNodePortChain      iptables.Chain = "KUBE-NODEPORT-HOST"
	iptablesNonLocalNodePortChain  iptables.Chain = "KUBE-NODEPORT-NON-LOCAL"
)
// iptablesInit makes sure the chains used by this proxier exist and that the
// built-in chains jump to them. It can safely be called periodically. The
// first error encountered is returned.
func iptablesInit(ipt iptables.Interface) error {
	// TODO: There is almost certainly room for optimization here. E.g. If
	// we knew the service-cluster-ip-range CIDR we could fast-track outbound packets not
	// destined for a service. There's probably more, help wanted.

	// Danger - order of these rules matters here:
	//
	// We match portal rules first, then NodePort rules. For NodePort rules, we filter primarily on --dst-type LOCAL,
	// because we want to listen on all local addresses, but don't match internet traffic with the same dst port number.
	//
	// There is one complication (per thockin):
	// -m addrtype --dst-type LOCAL is what we want except that it is broken (by intent without foresight to our usecase)
	// on at least GCE. Specifically, GCE machines have a daemon which learns what external IPs are forwarded to that
	// machine, and configure a local route for that IP, making a match for --dst-type LOCAL when we don't want it to.
	// Removing the route gives correct behavior until the daemon recreates it.
	// Killing the daemon is an option, but means that any non-kubernetes use of the machine with external IP will be broken.
	//
	// This applies to IPs on GCE that are actually from a load-balancer; they will be categorized as LOCAL.
	// _If_ the chains were in the wrong order, and the LB traffic had dst-port == a NodePort on some other service,
	// the NodePort would take priority (incorrectly).
	// This is unlikely (and would only affect outgoing traffic from the cluster to the load balancer, which seems
	// doubly-unlikely), but we need to be careful to keep the rules in the right order.

	// jumpRule returns a fresh slice holding match followed by "-j <target>".
	jumpRule := func(match []string, target iptables.Chain) []string {
		rule := make([]string, 0, len(match)+2)
		rule = append(rule, match...)
		return append(rule, "-j", string(target))
	}

	var err error

	// Portal chains: prepended, so they are consulted before the NodePort chains.
	portalMatch := []string{ /* service-cluster-ip-range matching could go here */
		"-m", "comment", "--comment", "handle ClusterIPs; NOTE: this must be before the NodePort rules",
	}
	if _, err = ipt.EnsureChain(iptables.TableNAT, iptablesContainerPortalChain); err != nil {
		return err
	}
	if _, err = ipt.EnsureRule(iptables.Prepend, iptables.TableNAT, iptables.ChainPrerouting, jumpRule(portalMatch, iptablesContainerPortalChain)...); err != nil {
		return err
	}
	if _, err = ipt.EnsureChain(iptables.TableNAT, iptablesHostPortalChain); err != nil {
		return err
	}
	if _, err = ipt.EnsureRule(iptables.Prepend, iptables.TableNAT, iptables.ChainOutput, jumpRule(portalMatch, iptablesHostPortalChain)...); err != nil {
		return err
	}

	// This set of rules matches broadly (addrtype & destination port), and therefore must come after the portal rules
	nodePortMatch := []string{
		"-m", "addrtype", "--dst-type", "LOCAL",
		"-m", "comment", "--comment", "handle service NodePorts; NOTE: this must be the last rule in the chain",
	}
	if _, err = ipt.EnsureChain(iptables.TableNAT, iptablesContainerNodePortChain); err != nil {
		return err
	}
	if _, err = ipt.EnsureRule(iptables.Append, iptables.TableNAT, iptables.ChainPrerouting, jumpRule(nodePortMatch, iptablesContainerNodePortChain)...); err != nil {
		return err
	}
	if _, err = ipt.EnsureChain(iptables.TableNAT, iptablesHostNodePortChain); err != nil {
		return err
	}
	if _, err = ipt.EnsureRule(iptables.Append, iptables.TableNAT, iptables.ChainOutput, jumpRule(nodePortMatch, iptablesHostNodePortChain)...); err != nil {
		return err
	}

	// Create a chain intended to explicitly allow non-local NodePort
	// traffic to work around default-deny iptables configurations
	// that would otherwise reject such traffic.
	nonLocalMatch := []string{"-m", "comment", "--comment", "Ensure that non-local NodePort traffic can flow"}
	if _, err = ipt.EnsureChain(iptables.TableFilter, iptablesNonLocalNodePortChain); err != nil {
		return err
	}
	if _, err = ipt.EnsureRule(iptables.Prepend, iptables.TableFilter, iptables.ChainInput, jumpRule(nonLocalMatch, iptablesNonLocalNodePortChain)...); err != nil {
		return err
	}

	// TODO: Verify order of rules.
	return nil
}
// iptablesFlush empties all five chains owned by this proxier: the four NAT
// chains (container/host portals and container/host node ports) followed by
// the filter-table non-local node port chain. Every chain is attempted even
// when an earlier flush fails; the failures are logged together and returned
// as one aggregate error.
func iptablesFlush(ipt iptables.Interface) error {
	el := []error{}
	collect := func(err error) {
		if err != nil {
			el = append(el, err)
		}
	}

	collect(ipt.FlushChain(iptables.TableNAT, iptablesContainerPortalChain))
	collect(ipt.FlushChain(iptables.TableNAT, iptablesHostPortalChain))
	collect(ipt.FlushChain(iptables.TableNAT, iptablesContainerNodePortChain))
	collect(ipt.FlushChain(iptables.TableNAT, iptablesHostNodePortChain))
	collect(ipt.FlushChain(iptables.TableFilter, iptablesNonLocalNodePortChain))

	if len(el) != 0 {
		klog.Errorf("Some errors flushing old iptables portals: %v", el)
	}
	return utilerrors.NewAggregate(el)
}
// Well-known addresses used by the iptables argument builders below: the
// unspecified ("any") and loopback addresses for IPv4 and IPv6.
var (
	zeroIPv4      = net.ParseIP("0.0.0.0")
	localhostIPv4 = net.ParseIP("127.0.0.1")
	zeroIPv6      = net.ParseIP("::")
	localhostIPv6 = net.ParseIP("::1")
)
// iptablesCommonPortalArgs builds the match portion of an iptables rule that
// is shared by the from-container and from-host portal rules: a comment
// holding the service name, the protocol, the destination port and,
// optionally, the destination IP (skipped when destIP is nil), a
// physical-interface match and a destination-is-local match.
func iptablesCommonPortalArgs(destIP net.IP, addPhysicalInterfaceMatch bool, addDstLocalMatch bool, destPort int, protocol v1.Protocol, service proxy.ServicePortName) []string {
	// This list needs to include all fields as they are eventually spit out
	// by iptables-save. This is because some systems do not support the
	// 'iptables -C' arg, and so fall back on parsing iptables-save output.
	// If this does not match, it will not pass the check. For example:
	// adding the /32 on the destination IP arg is not strictly required,
	// but causes this list to not match the final iptables-save output.
	// This is fragile and I hope one day we can stop supporting such old
	// iptables versions.
	lowerProto := strings.ToLower(string(protocol))
	args := []string{
		"-m", "comment",
		"--comment", service.String(),
		"-p", lowerProto,
		"-m", lowerProto,
		"--dport", strconv.Itoa(destPort),
	}

	if destIP != nil {
		args = append(args, "-d", utilproxy.ToCIDR(destIP))
	}
	if addPhysicalInterfaceMatch {
		args = append(args, "-m", "physdev", "!", "--physdev-is-in")
	}
	if addDstLocalMatch {
		args = append(args, "-m", "addrtype", "--dst-type", "LOCAL")
	}

	return args
}
// iptablesContainerPortalArgs builds the full iptables argument list for a
// from-container portal rule: the common match arguments followed by the
// jump target. The target is REDIRECT to proxyPort when proxyIP is the
// unspecified address (IPv4 or IPv6), and DNAT to proxyIP:proxyPort
// otherwise.
func (proxier *Proxier) iptablesContainerPortalArgs(destIP net.IP, addPhysicalInterfaceMatch bool, addDstLocalMatch bool, destPort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, service proxy.ServicePortName) []string {
	args := iptablesCommonPortalArgs(destIP, addPhysicalInterfaceMatch, addDstLocalMatch, destPort, protocol, service)

	// This is tricky.
	//
	// If the proxy is bound (see Proxier.listenIP) to 0.0.0.0 ("any
	// interface") we want to use REDIRECT, which sends traffic to the
	// "primary address of the incoming interface" which means the container
	// bridge, if there is one. When the response comes, it comes from that
	// same interface, so the NAT matches and the response packet is
	// correct. This matters for UDP, since there is no per-connection port
	// number.
	//
	// The alternative would be to use DNAT, except that it doesn't work
	// (empirically):
	//   * DNAT to 127.0.0.1 = Packets just disappear - this seems to be a
	//     well-known limitation of iptables.
	//   * DNAT to eth0's IP = Response packets come from the bridge, which
	//     breaks the NAT, and makes things like DNS not accept them. If
	//     this could be resolved, it would simplify all of this code.
	//
	// If the proxy is bound to a specific IP, then we have to use DNAT to
	// that IP. Unlike the previous case, this works because the proxy is
	// ONLY listening on that IP, not the bridge.
	//
	// Why would anyone bind to an address that is not inclusive of
	// localhost? Apparently some cloud environments have their public IP
	// exposed as a real network interface AND do not have firewalling. We
	// don't want to expose everything out to the world.
	//
	// Unfortunately, I don't know of any way to listen on some (N > 1)
	// interfaces but not ALL interfaces, short of doing it manually, and
	// this is simpler than that.
	//
	// If the proxy is bound to localhost only, all of this is broken. Not
	// allowed.
	if !proxyIP.Equal(zeroIPv4) && !proxyIP.Equal(zeroIPv6) {
		// TODO: Can we DNAT with IPv6?
		destination := net.JoinHostPort(proxyIP.String(), strconv.Itoa(proxyPort))
		return append(args, "-j", "DNAT", "--to-destination", destination)
	}

	// TODO: Can we REDIRECT with IPv6?
	return append(args, "-j", "REDIRECT", "--to-ports", strconv.Itoa(proxyPort))
}
// iptablesHostPortalArgs builds the full iptables argument list for a
// from-host portal rule: the common match arguments (never with the
// physical-interface match) followed by a DNAT to the proxy. When proxyIP is
// the unspecified address (IPv4 or IPv6), proxier.hostIP is used as the DNAT
// destination instead.
func (proxier *Proxier) iptablesHostPortalArgs(destIP net.IP, addDstLocalMatch bool, destPort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, service proxy.ServicePortName) []string {
	args := iptablesCommonPortalArgs(destIP, false, addDstLocalMatch, destPort, protocol, service)

	// This is tricky.
	//
	// If the proxy is bound (see Proxier.listenIP) to 0.0.0.0 ("any
	// interface") we want to do the same as from-container traffic and use
	// REDIRECT. Except that it doesn't work (empirically). REDIRECT on
	// local packets sends the traffic to localhost (special case, but it is
	// documented) but the response comes from the eth0 IP (not sure why,
	// truthfully), which makes DNS unhappy.
	//
	// So we have to use DNAT. DNAT to 127.0.0.1 can't work for the same
	// reason.
	//
	// So we do our best to find an interface that is not a loopback and
	// DNAT to that. This works (again, empirically).
	//
	// If the proxy is bound to a specific IP, then we have to use DNAT to
	// that IP. Unlike the previous case, this works because the proxy is
	// ONLY listening on that IP, not the bridge.
	//
	// If the proxy is bound to localhost only, this should work, but we
	// don't allow it for now.
	target := proxyIP
	if target.Equal(zeroIPv4) || target.Equal(zeroIPv6) {
		target = proxier.hostIP
	}

	// TODO: Can we DNAT with IPv6?
	destination := net.JoinHostPort(target.String(), strconv.Itoa(proxyPort))
	return append(args, "-j", "DNAT", "--to-destination", destination)
}
// iptablesHostNodePortArgs builds the full iptables argument list for a
// from-host public-port (NodePort) rule: a match on nodePort with no
// destination IP, followed by a DNAT to the proxy. As in
// iptablesHostPortalArgs, proxier.hostIP replaces proxyIP when proxyIP is the
// unspecified address.
// TODO: Should we just reuse iptablesHostPortalArgs?
func (proxier *Proxier) iptablesHostNodePortArgs(nodePort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, service proxy.ServicePortName) []string {
	args := iptablesCommonPortalArgs(nil, false, false, nodePort, protocol, service)

	target := proxyIP
	if target.Equal(zeroIPv4) || target.Equal(zeroIPv6) {
		target = proxier.hostIP
	}

	// TODO: Can we DNAT with IPv6?
	destination := net.JoinHostPort(target.String(), strconv.Itoa(proxyPort))
	return append(args, "-j", "DNAT", "--to-destination", destination)
}
// iptablesNonLocalNodePortArgs builds the full iptables argument list for a
// from-non-local public-port rule: it accepts NEW connections for the
// service's protocol and port.
//
// Note that the port matched is proxyPort, not nodePort; nodePort and proxyIP
// are not used. NOTE(review): presumably this is because the rule lives in
// the filter table, which sees packets after NAT has already rewritten the
// destination port - confirm before changing.
func (proxier *Proxier) iptablesNonLocalNodePortArgs(nodePort int, protocol v1.Protocol, proxyIP net.IP, proxyPort int, service proxy.ServicePortName) []string {
	match := iptablesCommonPortalArgs(nil, false, false, proxyPort, protocol, service)
	return append(match, "-m", "state", "--state", "NEW", "-j", "ACCEPT")
}
// isTooManyFDsError reports whether err's message contains "too many open
// files". The check is textual, so it also matches errors that merely wrap
// or quote such a message. A nil err yields false instead of panicking.
func isTooManyFDsError(err error) bool {
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), "too many open files")
}
// isClosedError reports whether err's message ends with "use of closed
// network connection". A nil err yields false instead of panicking.
func isClosedError(err error) bool {
	if err == nil {
		return false
	}
	// A brief discussion about handling closed error here:
	// https://code.google.com/p/go/issues/detail?id=4373#c14
	// TODO: maybe create a stoppable TCP listener that returns a StoppedError
	return strings.HasSuffix(err.Error(), "use of closed network connection")
}