/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package iptables
/*
NOTE: this needs to be tested in e2e since it uses iptables for everything.
*/
import (
"bytes"
"crypto/sha256"
"encoding/base32"
"fmt"
2015-08-15 05:15:12 +00:00
"io/ioutil"
2015-08-11 02:47:13 +00:00
"net"
2015-08-15 05:15:12 +00:00
"path"
2015-08-11 02:47:13 +00:00
"reflect"
"strconv"
"strings"
"sync"
"time"
"github.com/coreos/go-semver/semver"
2015-08-15 04:53:52 +00:00
"github.com/davecgh/go-spew/spew"
2015-08-11 02:47:13 +00:00
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/proxy"
"k8s.io/kubernetes/pkg/types"
utilexec "k8s.io/kubernetes/pkg/util/exec"
utiliptables "k8s.io/kubernetes/pkg/util/iptables"
"k8s.io/kubernetes/pkg/util/slice"
)
// NOTE: IPTABLES_MIN_VERSION is the minimum version of iptables for which we will use the Proxier
// from this package instead of the userspace Proxier.
// This is will not be enough, as the version number is somewhat unreliable,
// features are backported in various distros and this could get pretty hairy.
// However iptables-1.4.0 was released 2007-Dec-22 and appears to have every feature we use,
// so this seems prefectly reasonable for now.
2015-08-14 22:20:19 +00:00
const IPTABLES_MIN_VERSION string = "1.4.0"
2015-08-11 02:47:13 +00:00
// the services chain
2015-08-14 22:20:19 +00:00
const iptablesServicesChain utiliptables . Chain = "KUBE-SERVICES"
// the nodeports chain
const iptablesNodePortsChain utiliptables . Chain = "KUBE-NODEPORTS"
// the mark we apply to traffic needing SNAT
const iptablesMasqueradeMark = "0x4d415351"
2015-08-11 02:47:13 +00:00
2015-08-15 05:15:12 +00:00
// ShouldUseIptablesProxier returns true if we should use the iptables Proxier
// instead of the "classic" userspace Proxier. This is determined by checking
// the iptables version and for the existence of kernel features. It may return
// an error if it fails to get the itpables version without error, in which
// case it will also return false.
2015-08-11 02:47:13 +00:00
func ShouldUseIptablesProxier ( ) ( bool , error ) {
exec := utilexec . New ( )
minVersion , err := semver . NewVersion ( IPTABLES_MIN_VERSION )
if err != nil {
return false , err
}
// returns "vX.X.X", err
versionString , err := utiliptables . GetIptablesVersionString ( exec )
if err != nil {
return false , err
}
// make a semver of the part after the v in "vX.X.X"
version , err := semver . NewVersion ( versionString [ 1 : ] )
if err != nil {
return false , err
}
2015-08-15 05:15:12 +00:00
if version . LessThan ( * minVersion ) {
return false , nil
}
// Check for the required sysctls. We don't care about the value, just
// that it exists. If this Proxier is chosen, we'll iniialize it as we
// need.
_ , err = getSysctl ( sysctlRouteLocalnet )
if err != nil {
return false , err
}
return true , nil
}
// Sysctl knobs read/written by this proxier, expressed as paths relative to
// sysctlBase.
const (
	sysctlBase               = "/proc/sys"
	sysctlRouteLocalnet      = "net/ipv4/conf/all/route_localnet"
	sysctlBridgeCallIptables = "net/bridge/bridge-nf-call-iptables"
)
// getSysctl reads the named sysctl from under /proc/sys and returns its value
// as an int. On any failure it returns -1 and the error.
func getSysctl(sysctl string) (int, error) {
	raw, err := ioutil.ReadFile(path.Join(sysctlBase, sysctl))
	if err != nil {
		return -1, err
	}
	// Sysctl files carry a trailing newline; strip it before parsing.
	trimmed := strings.Trim(string(raw), " \n")
	val, err := strconv.Atoi(trimmed)
	if err != nil {
		return -1, err
	}
	return val, nil
}
func setSysctl ( sysctl string , newVal int ) error {
return ioutil . WriteFile ( path . Join ( sysctlBase , sysctl ) , [ ] byte ( strconv . Itoa ( newVal ) ) , 0640 )
2015-08-11 02:47:13 +00:00
}
// internal struct for string service information
type serviceInfo struct {
2015-08-15 03:50:29 +00:00
clusterIP net . IP
port int
protocol api . Protocol
nodePort int
2015-08-11 02:47:13 +00:00
loadBalancerStatus api . LoadBalancerStatus
sessionAffinityType api . ServiceAffinity
2015-08-15 03:50:29 +00:00
stickyMaxAgeSeconds int
2015-08-11 02:47:13 +00:00
endpoints [ ] string
// Deprecated, but required for back-compat (including e2e)
deprecatedPublicIPs [ ] string
}
// returns a new serviceInfo struct
func newServiceInfo ( service proxy . ServicePortName ) * serviceInfo {
return & serviceInfo {
sessionAffinityType : api . ServiceAffinityNone , // default
2015-08-15 03:50:29 +00:00
stickyMaxAgeSeconds : 180 , // TODO: paramaterize this in the API.
2015-08-11 02:47:13 +00:00
}
}
// Proxier is an iptables based proxy for connections between a localhost:lport
// and services that provide the actual backends.
type Proxier struct {
mu sync . Mutex // protects serviceMap
serviceMap map [ proxy . ServicePortName ] * serviceInfo
2015-08-13 00:33:05 +00:00
syncPeriod time . Duration
2015-08-11 02:47:13 +00:00
iptables utiliptables . Interface
haveReceivedServiceUpdate bool // true once we've seen an OnServiceUpdate event
haveReceivedEndpointsUpdate bool // true once we've seen an OnEndpointsUpdate event
}
// Proxier implements ProxyProvider
var _ proxy . ProxyProvider = & Proxier { }
// NewProxier returns a new Proxier given an iptables Interface instance.
// Because of the iptables logic, it is assumed that there is only a single Proxier active on a machine.
// An error will be returned if iptables fails to update or acquire the initial lock.
// Once a proxier is created, it will keep iptables up to date in the background and
// will not terminate if a particular iptables call fails.
2015-08-15 05:36:11 +00:00
func NewProxier ( ipt utiliptables . Interface , exec utilexec . Interface , syncPeriod time . Duration ) ( * Proxier , error ) {
2015-08-15 05:15:12 +00:00
// Set the route_localnet sysctl we need for
if err := setSysctl ( sysctlRouteLocalnet , 1 ) ; err != nil {
2015-08-15 05:36:11 +00:00
return nil , fmt . Errorf ( "can't set sysctl %s: %v" , sysctlRouteLocalnet , err )
}
// Load the module. It's OK if this fails (e.g. the module is not present)
// because we'll catch the error on the sysctl, which is what we actually
// care about.
exec . Command ( "modprobe" , "br-netfilter" ) . CombinedOutput ( )
if err := setSysctl ( sysctlBridgeCallIptables , 1 ) ; err != nil {
return nil , fmt . Errorf ( "can't set sysctl %s: %v" , sysctlBridgeCallIptables , err )
2015-08-15 05:15:12 +00:00
}
2015-08-15 05:36:11 +00:00
// No turning back. Remove artifacts that might still exist from the userspace Proxier.
glog . V ( 2 ) . Info ( "Tearing down userspace rules. Errors here are acceptable." )
tearDownUserspaceIptables ( ipt )
2015-08-11 02:47:13 +00:00
return & Proxier {
serviceMap : make ( map [ proxy . ServicePortName ] * serviceInfo ) ,
2015-08-13 00:33:05 +00:00
syncPeriod : syncPeriod ,
2015-08-11 02:47:13 +00:00
iptables : ipt ,
} , nil
}
// Chains from the userspace proxy.
// TODO: Remove these Chains and tearDownUserspaceIptables once the userspace
// Proxier has been removed.
var (
	iptablesContainerPortalChain   utiliptables.Chain = "KUBE-PORTALS-CONTAINER"
	iptablesHostPortalChain        utiliptables.Chain = "KUBE-PORTALS-HOST"
	iptablesContainerNodePortChain utiliptables.Chain = "KUBE-NODEPORT-CONTAINER"
	iptablesHostNodePortChain      utiliptables.Chain = "KUBE-NODEPORT-HOST"
)
// tearDownUserspaceIptables removes all iptables rules and chains created by
// the userspace Proxier. All errors are logged (not returned) because teardown
// is best-effort.
func tearDownUserspaceIptables(ipt utiliptables.Interface) {
	// NOTE: Warning, this needs to be kept in sync with the userspace Proxier,
	// we want to ensure we remove all of the iptables rules it creates.
	// Currently they are all in iptablesInit().
	// Delete Rules first, then Flush and Delete Chains.
	args := []string{"-m", "comment", "--comment", "handle ClusterIPs; NOTE: this must be before the NodePort rules"}
	if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainOutput, append(args, "-j", string(iptablesHostPortalChain))...); err != nil {
		glog.Errorf("Error removing userspace rule: %v", err)
	}
	if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPrerouting, append(args, "-j", string(iptablesContainerPortalChain))...); err != nil {
		glog.Errorf("Error removing userspace rule: %v", err)
	}
	args = []string{"-m", "addrtype", "--dst-type", "LOCAL"}
	args = append(args, "-m", "comment", "--comment", "handle service NodePorts; NOTE: this must be the last rule in the chain")
	if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainOutput, append(args, "-j", string(iptablesHostNodePortChain))...); err != nil {
		glog.Errorf("Error removing userspace rule: %v", err)
	}
	if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPrerouting, append(args, "-j", string(iptablesContainerNodePortChain))...); err != nil {
		glog.Errorf("Error removing userspace rule: %v", err)
	}
	// Flush and delete chains.
	chains := []utiliptables.Chain{iptablesContainerPortalChain, iptablesHostPortalChain, iptablesHostNodePortChain, iptablesContainerNodePortChain}
	for _, c := range chains {
		// Flush chain, then if successful delete; delete will fail if flush fails.
		if err := ipt.FlushChain(utiliptables.TableNAT, c); err != nil {
			glog.Errorf("Error flushing userspace chain: %v", err)
		} else if err := ipt.DeleteChain(utiliptables.TableNAT, c); err != nil {
			// BUGFIX: this previously logged "Error flushing", masking which
			// operation actually failed.
			glog.Errorf("Error deleting userspace chain: %v", err)
		}
	}
}
func ( proxier * Proxier ) sameConfig ( info * serviceInfo , service * api . Service , port * api . ServicePort ) bool {
2015-08-15 03:50:29 +00:00
if info . protocol != port . Protocol || info . port != port . Port || info . nodePort != port . NodePort {
2015-08-11 02:47:13 +00:00
return false
}
2015-08-15 03:50:29 +00:00
if ! info . clusterIP . Equal ( net . ParseIP ( service . Spec . ClusterIP ) ) {
2015-08-11 02:47:13 +00:00
return false
}
if ! ipsEqual ( info . deprecatedPublicIPs , service . Spec . DeprecatedPublicIPs ) {
return false
}
if ! api . LoadBalancerStatusEqual ( & info . loadBalancerStatus , & service . Status . LoadBalancer ) {
return false
}
if info . sessionAffinityType != service . Spec . SessionAffinity {
return false
}
return true
}
// ipsEqual reports whether two string slices are element-wise equal
// (same length, same order).
func ipsEqual(lhs, rhs []string) bool {
	if len(lhs) != len(rhs) {
		return false
	}
	for i, ip := range lhs {
		if rhs[i] != ip {
			return false
		}
	}
	return true
}
// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
func ( proxier * Proxier ) SyncLoop ( ) {
2015-08-13 00:33:05 +00:00
t := time . NewTicker ( proxier . syncPeriod )
2015-08-11 02:47:13 +00:00
defer t . Stop ( )
for {
<- t . C
glog . V ( 6 ) . Infof ( "Periodic sync" )
func ( ) {
proxier . mu . Lock ( )
defer proxier . mu . Unlock ( )
if err := proxier . syncProxyRules ( ) ; err != nil {
glog . Errorf ( "Failed to sync iptables rules: %v" , err )
}
} ( )
}
}
// OnServiceUpdate tracks the active set of service proxies.
// They will be synchronized using syncProxyRules()
func ( proxier * Proxier ) OnServiceUpdate ( allServices [ ] api . Service ) {
proxier . mu . Lock ( )
defer proxier . mu . Unlock ( )
proxier . haveReceivedServiceUpdate = true
activeServices := make ( map [ proxy . ServicePortName ] bool ) // use a map as a set
for i := range allServices {
service := & allServices [ i ]
// if ClusterIP is "None" or empty, skip proxying
if ! api . IsServiceIPSet ( service ) {
2015-08-15 03:50:29 +00:00
glog . V ( 3 ) . Infof ( "Skipping service %s due to clusterIP = %q" , types . NamespacedName { Namespace : service . Namespace , Name : service . Name } , service . Spec . ClusterIP )
2015-08-11 02:47:13 +00:00
continue
}
for i := range service . Spec . Ports {
servicePort := & service . Spec . Ports [ i ]
2015-08-13 15:26:43 +00:00
serviceName := proxy . ServicePortName { NamespacedName : types . NamespacedName { Namespace : service . Namespace , Name : service . Name } , Port : servicePort . Name }
2015-08-11 02:47:13 +00:00
activeServices [ serviceName ] = true
info , exists := proxier . serviceMap [ serviceName ]
if exists && proxier . sameConfig ( info , service , servicePort ) {
// Nothing changed.
continue
}
if exists {
//Something changed.
2015-08-15 04:53:52 +00:00
glog . V ( 3 ) . Infof ( "Something changed for service %q: removing it" , serviceName )
2015-08-11 02:47:13 +00:00
delete ( proxier . serviceMap , serviceName )
}
serviceIP := net . ParseIP ( service . Spec . ClusterIP )
glog . V ( 1 ) . Infof ( "Adding new service %q at %s:%d/%s" , serviceName , serviceIP , servicePort . Port , servicePort . Protocol )
info = newServiceInfo ( serviceName )
2015-08-15 03:50:29 +00:00
info . clusterIP = serviceIP
info . port = servicePort . Port
info . protocol = servicePort . Protocol
info . nodePort = servicePort . NodePort
2015-08-11 02:47:13 +00:00
info . deprecatedPublicIPs = service . Spec . DeprecatedPublicIPs
// Deep-copy in case the service instance changes
info . loadBalancerStatus = * api . LoadBalancerStatusDeepCopy ( & service . Status . LoadBalancer )
info . sessionAffinityType = service . Spec . SessionAffinity
proxier . serviceMap [ serviceName ] = info
2015-08-15 04:53:52 +00:00
glog . V ( 4 ) . Infof ( "added serviceInfo(%s): %s" , serviceName , spew . Sdump ( info ) )
2015-08-11 02:47:13 +00:00
}
}
for name , info := range proxier . serviceMap {
// Check for servicePorts that were not in this update and have no endpoints.
// This helps prevent unnecessarily removing and adding services.
if ! activeServices [ name ] && info . endpoints == nil {
glog . V ( 1 ) . Infof ( "Removing service %q" , name )
delete ( proxier . serviceMap , name )
}
}
if err := proxier . syncProxyRules ( ) ; err != nil {
glog . Errorf ( "Failed to sync iptables rules: %v" , err )
}
}
// OnEndpointsUpdate takes in a slice of updated endpoints.
func ( proxier * Proxier ) OnEndpointsUpdate ( allEndpoints [ ] api . Endpoints ) {
proxier . mu . Lock ( )
defer proxier . mu . Unlock ( )
proxier . haveReceivedEndpointsUpdate = true
registeredEndpoints := make ( map [ proxy . ServicePortName ] bool ) // use a map as a set
// Update endpoints for services.
for i := range allEndpoints {
svcEndpoints := & allEndpoints [ i ]
// We need to build a map of portname -> all ip:ports for that
// portname. Explode Endpoints.Subsets[*] into this structure.
portsToEndpoints := map [ string ] [ ] hostPortPair { }
for i := range svcEndpoints . Subsets {
ss := & svcEndpoints . Subsets [ i ]
for i := range ss . Ports {
port := & ss . Ports [ i ]
for i := range ss . Addresses {
addr := & ss . Addresses [ i ]
portsToEndpoints [ port . Name ] = append ( portsToEndpoints [ port . Name ] , hostPortPair { addr . IP , port . Port } )
}
}
}
for portname := range portsToEndpoints {
2015-08-13 15:26:43 +00:00
svcPort := proxy . ServicePortName { NamespacedName : types . NamespacedName { Namespace : svcEndpoints . Namespace , Name : svcEndpoints . Name } , Port : portname }
2015-08-11 02:47:13 +00:00
state , exists := proxier . serviceMap [ svcPort ]
if ! exists || state == nil {
state = newServiceInfo ( svcPort )
proxier . serviceMap [ svcPort ] = state
}
curEndpoints := [ ] string { }
if state != nil {
curEndpoints = state . endpoints
}
newEndpoints := flattenValidEndpoints ( portsToEndpoints [ portname ] )
if len ( curEndpoints ) != len ( newEndpoints ) || ! slicesEquiv ( slice . CopyStrings ( curEndpoints ) , newEndpoints ) {
glog . V ( 1 ) . Infof ( "Setting endpoints for %s to %+v" , svcPort , newEndpoints )
state . endpoints = newEndpoints
}
registeredEndpoints [ svcPort ] = true
}
}
// Remove endpoints missing from the update.
for service , info := range proxier . serviceMap {
// if missing from update and not already set by previous endpoints event
if _ , exists := registeredEndpoints [ service ] ; ! exists && info . endpoints != nil {
glog . V ( 2 ) . Infof ( "Removing endpoints for %s" , service )
// Set the endpoints to nil, we will check for this in OnServiceUpdate so that we
// only remove ServicePorts that have no endpoints and were not in the service update,
// that way we only remove ServicePorts that were not in both.
proxier . serviceMap [ service ] . endpoints = nil
}
}
2015-08-15 04:53:52 +00:00
if err := proxier . syncProxyRules ( ) ; err != nil {
glog . Errorf ( "Failed to sync iptables rules: %v" , err )
}
2015-08-11 02:47:13 +00:00
}
// hostPortPair carries one endpoint address; used in OnEndpointsUpdate.
type hostPortPair struct {
	host string
	port int
}

// isValidEndpoint reports whether hpp has a non-empty host and a positive port.
func isValidEndpoint(hpp *hostPortPair) bool {
	if hpp.host == "" {
		return false
	}
	return hpp.port > 0
}
// slicesEquiv tests whether two slices hold the same elements regardless of
// order. NOTE: this sorts both slices in-place.
func slicesEquiv(lhs, rhs []string) bool {
	if len(lhs) != len(rhs) {
		return false
	}
	return reflect.DeepEqual(slice.SortStrings(lhs), slice.SortStrings(rhs))
}
// flattenValidEndpoints converts valid endpoint pairs into "host:port"
// strings for easier use later; invalid pairs are skipped.
func flattenValidEndpoints(endpoints []hostPortPair) []string {
	var result []string
	for i := range endpoints {
		hpp := &endpoints[i]
		if !isValidEndpoint(hpp) {
			continue
		}
		result = append(result, net.JoinHostPort(hpp.host, strconv.Itoa(hpp.port)))
	}
	return result
}
2015-08-15 04:02:53 +00:00
// servicePortToServiceChain takes the ServicePortName for a service and
// returns the associated iptables chain. This is computed by hashing (sha256)
// then encoding to base32 and truncating with the prefix "KUBE-SVC-". We do
// this because Iptables Chain Names must be <= 28 chars long, and the longer
// they are the harder they are to read.
func servicePortToServiceChain ( s proxy . ServicePortName , protocol string ) utiliptables . Chain {
hash := sha256 . Sum256 ( [ ] byte ( s . String ( ) + protocol ) )
2015-08-11 02:47:13 +00:00
encoded := base32 . StdEncoding . EncodeToString ( hash [ : ] )
2015-08-15 04:02:53 +00:00
return utiliptables . Chain ( "KUBE-SVC-" + encoded [ : 16 ] )
2015-08-11 02:47:13 +00:00
}
2015-08-15 04:02:53 +00:00
// This is the same as servicePortToServiceChain but with the endpoint
// included.
func servicePortAndEndpointToServiceChain ( s proxy . ServicePortName , protocol string , endpoint string ) utiliptables . Chain {
hash := sha256 . Sum256 ( [ ] byte ( s . String ( ) + protocol + endpoint ) )
2015-08-11 02:47:13 +00:00
encoded := base32 . StdEncoding . EncodeToString ( hash [ : ] )
2015-08-15 04:02:53 +00:00
return utiliptables . Chain ( "KUBE-SEP-" + encoded [ : 16 ] )
2015-08-11 02:47:13 +00:00
}
// syncProxyRules is where all of the iptables-save/restore calls happen.
// The only other iptables rules are those that are setup in iptablesInit().
// It builds a complete picture of the desired NAT table state and applies it
// atomically via iptables-restore. Assumes proxier.mu is held.
func (proxier *Proxier) syncProxyRules() error {
	// don't sync rules till we've received services and endpoints
	if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate {
		glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master")
		return nil
	}
	glog.V(3).Infof("Syncing iptables rules")

	// Ensure main chains and rules are installed.
	inputChains := []utiliptables.Chain{utiliptables.ChainOutput, utiliptables.ChainPrerouting}
	// Link the services chain from OUTPUT and PREROUTING.
	for _, chain := range inputChains {
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesServicesChain); err != nil {
			return err
		}
		comment := "kubernetes service portals; must be before nodeports"
		args := []string{"-m", "comment", "--comment", comment, "-j", string(iptablesServicesChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, chain, args...); err != nil {
			return err
		}
	}
	// Link the nodeports chain; Append (vs Prepend above) keeps it after the
	// portals rule.
	for _, chain := range inputChains {
		if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesNodePortsChain); err != nil {
			return err
		}
		comment := "kubernetes service nodeports; must be after portals"
		args := []string{"-m", "comment", "--comment", comment, "-m", "addrtype", "--dst-type", "LOCAL", "-j", string(iptablesNodePortsChain)}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, chain, args...); err != nil {
			return err
		}
	}
	// Link the output rules: MASQUERADE anything carrying our mark.
	{
		comment := "kubernetes service traffic requiring SNAT"
		args := []string{"-m", "comment", "--comment", comment, "-m", "mark", "--mark", iptablesMasqueradeMark, "-j", "MASQUERADE"}
		if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
			return err
		}
	}

	// Get iptables-save output so we can check for existing chains and rules.
	// This will be a map of chain name to chain with rules as stored in
	// iptables-save/iptables-restore.
	existingChains := make(map[utiliptables.Chain]string)
	iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableNAT)
	if err != nil { // if we failed to get any rules
		glog.Errorf("Failed to execute iptable-save, syncing all rules. %s", err.Error())
	} else { // otherwise parse the output
		existingChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw)
	}

	// Two buffers: chain declarations must precede rules in restore input, so
	// they are accumulated separately and concatenated at the end.
	chainsLines := bytes.NewBuffer(nil)
	rulesLines := bytes.NewBuffer(nil)

	// Write table header.
	writeLine(chainsLines, "*nat")

	// Make sure we keep stats for the top-level chains, if they existed
	// (which they should have because we created them above).
	// Re-emitting the saved chain line preserves its counters.
	if chain, ok := existingChains[iptablesServicesChain]; ok {
		writeLine(chainsLines, chain)
	} else {
		writeLine(chainsLines, makeChainLine(iptablesServicesChain))
	}
	if chain, ok := existingChains[iptablesNodePortsChain]; ok {
		writeLine(chainsLines, chain)
	} else {
		writeLine(chainsLines, makeChainLine(iptablesNodePortsChain))
	}

	// Accumulate chains to keep.
	activeChains := make(map[utiliptables.Chain]bool) // use a map as a set

	// Build rules for each service.
	for name, info := range proxier.serviceMap {
		protocol := strings.ToLower(string(info.protocol))
		// Create the per-service chain, retaining counters if possible.
		svcChain := servicePortToServiceChain(name, protocol)
		if chain, ok := existingChains[svcChain]; ok {
			writeLine(chainsLines, chain)
		} else {
			writeLine(chainsLines, makeChainLine(svcChain))
		}
		activeChains[svcChain] = true

		// Capture the clusterIP.
		writeLine(rulesLines,
			"-A", string(iptablesServicesChain),
			"-m", "comment", "--comment", fmt.Sprintf("\"%s cluster IP\"", name.String()),
			"-m", protocol, "-p", protocol,
			"-d", fmt.Sprintf("%s/32", info.clusterIP.String()),
			"--dport", fmt.Sprintf("%d", info.port),
			"-j", string(svcChain))

		// Capture externalIPs.
		for _, externalIP := range info.deprecatedPublicIPs {
			args := []string{
				"-A", string(iptablesServicesChain),
				"-m", "comment", "--comment", fmt.Sprintf("\"%s external IP\"", name.String()),
				"-m", protocol, "-p", protocol,
				"-d", fmt.Sprintf("%s/32", externalIP),
				"--dport", fmt.Sprintf("%d", info.port),
			}
			// We have to SNAT packets from external IPs.
			writeLine(rulesLines, append(args,
				"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
			writeLine(rulesLines, append(args,
				"-j", string(svcChain))...)
		}

		// Capture load-balancer ingress.
		for _, ingress := range info.loadBalancerStatus.Ingress {
			if ingress.IP != "" {
				args := []string{
					"-A", string(iptablesServicesChain),
					"-m", "comment", "--comment", fmt.Sprintf("\"%s loadbalancer IP\"", name.String()),
					"-m", protocol, "-p", protocol,
					"-d", fmt.Sprintf("%s/32", ingress.IP),
					"--dport", fmt.Sprintf("%d", info.port),
				}
				// We have to SNAT packets from external IPs.
				writeLine(rulesLines, append(args,
					"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
				writeLine(rulesLines, append(args,
					"-j", string(svcChain))...)
			}
		}

		// Capture nodeports. If we had more than 2 rules it might be
		// worthwhile to make a new per-service chain for nodeport rules, but
		// with just 2 rules it ends up being a waste and a cognitive burden.
		if info.nodePort != 0 {
			// Nodeports need SNAT.
			writeLine(rulesLines,
				"-A", string(iptablesNodePortsChain),
				"-m", "comment", "--comment", name.String(),
				"-m", protocol, "-p", protocol,
				"--dport", fmt.Sprintf("%d", info.nodePort),
				"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))
			// Jump to the service chain.
			writeLine(rulesLines,
				"-A", string(iptablesNodePortsChain),
				"-m", "comment", "--comment", name.String(),
				"-m", protocol, "-p", protocol,
				"--dport", fmt.Sprintf("%d", info.nodePort),
				"-j", string(svcChain))
		}

		// Generate the per-endpoint chains. We do this in multiple passes so
		// we can group rules together.
		endpoints := make([]string, 0)
		endpointChains := make([]utiliptables.Chain, 0)
		for _, ep := range info.endpoints {
			endpoints = append(endpoints, ep)
			endpointChain := servicePortAndEndpointToServiceChain(name, protocol, ep)
			endpointChains = append(endpointChains, endpointChain)
			// Create the endpoint chain, retaining counters if possible.
			if chain, ok := existingChains[utiliptables.Chain(endpointChain)]; ok {
				writeLine(chainsLines, chain)
			} else {
				writeLine(chainsLines, makeChainLine(endpointChain))
			}
			activeChains[endpointChain] = true
		}

		// First write session affinity rules, if applicable.
		if info.sessionAffinityType == api.ServiceAffinityClientIP {
			for _, endpointChain := range endpointChains {
				writeLine(rulesLines,
					"-A", string(svcChain),
					"-m", "comment", "--comment", name.String(),
					"-m", "recent", "--name", string(endpointChain),
					"--rcheck", "--seconds", fmt.Sprintf("%d", info.stickyMaxAgeSeconds), "--reap",
					"-j", string(endpointChain))
			}
		}

		// Now write loadbalancing & DNAT rules.
		n := len(endpointChains)
		for i, endpointChain := range endpointChains {
			// Balancing rules in the per-service chain.
			args := []string{
				"-A", string(svcChain),
				"-m", "comment", "--comment", name.String(),
			}
			if i < (n - 1) {
				// Each rule is a probabilistic match.
				args = append(args,
					"-m", "statistic",
					"--mode", "random",
					"--probability", fmt.Sprintf("%f", 1.0/float64(n-i)))
			}
			// The final (or only if n == 1) rule is a guaranteed match.
			args = append(args, "-j", string(endpointChain))
			writeLine(rulesLines, args...)

			// Rules in the per-endpoint chain.
			args = []string{
				"-A", string(endpointChain),
				"-m", "comment", "--comment", name.String(),
			}
			// Handle traffic that loops back to the originator with SNAT.
			// Technically we only need to do this if the endpoint is on this
			// host, but we don't have that information, so we just do this for
			// all endpoints.
			// TODO: if we grow logic to get this node's pod CIDR, we can use it.
			writeLine(rulesLines, append(args,
				"-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]),
				"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
			// Update client-affinity lists.
			if info.sessionAffinityType == api.ServiceAffinityClientIP {
				args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")
			}
			// DNAT to final destination.
			args = append(args,
				"-m", protocol, "-p", protocol,
				"-j", "DNAT", "--to-destination", endpoints[i])
			writeLine(rulesLines, args...)
		}
	}

	// Delete chains no longer in use.
	for chain := range existingChains {
		if !activeChains[chain] {
			chainString := string(chain)
			if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") {
				// Ignore chains that aren't ours.
				continue
			}
			// We must (as per iptables) write a chain-line for it, which has
			// the nice effect of flushing the chain. Then we can remove the
			// chain.
			writeLine(chainsLines, existingChains[chain])
			writeLine(rulesLines, "-X", chainString)
		}
	}

	// Write the end-of-table marker.
	writeLine(rulesLines, "COMMIT")

	// Sync rules.
	// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
	lines := append(chainsLines.Bytes(), rulesLines.Bytes()...)
	glog.V(3).Infof("Syncing rules: %s", lines)
	return proxier.iptables.Restore(utiliptables.TableNAT, lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters)
}
// Join all words with spaces, terminate with newline and write to buf.
func writeLine ( buf * bytes . Buffer , words ... string ) {
buf . WriteString ( strings . Join ( words , " " ) + "\n" )
2015-08-11 02:47:13 +00:00
}
// return an iptables-save/restore formatted chain line given a Chain
func makeChainLine ( chain utiliptables . Chain ) string {
2015-08-15 03:50:29 +00:00
return fmt . Sprintf ( ":%s - [0:0]" , chain )
2015-08-11 02:47:13 +00:00
}
// getChainLines parses a table's iptables-save data to find chains in the
// table. It returns a map of iptables.Chain to string where the string is the
// chain line from the save (with counters etc).
func getChainLines(table utiliptables.Table, save []byte) map[utiliptables.Chain]string {
	chainsMap := make(map[utiliptables.Chain]string)
	lines := strings.Split(string(save), "\n")
	tablePrefix := "*" + string(table)

	// Skip ahead to the start of the requested table.
	lineNum := 0
	for ; lineNum < len(lines); lineNum++ {
		if strings.HasPrefix(strings.TrimSpace(lines[lineNum]), tablePrefix) {
			lineNum++
			break
		}
	}

	// Collect ":CHAIN ..." declarations until the table ends (COMMIT or the
	// next table header), skipping blanks and comments. The untrimmed line is
	// stored so counters etc. are preserved verbatim.
	for ; lineNum < len(lines); lineNum++ {
		line := strings.TrimSpace(lines[lineNum])
		if strings.HasPrefix(line, "COMMIT") || strings.HasPrefix(line, "*") {
			break
		}
		if len(line) == 0 || strings.HasPrefix(line, "#") {
			continue
		}
		if strings.HasPrefix(line, ":") && len(line) > 1 {
			chain := utiliptables.Chain(strings.SplitN(line[1:], " ", 2)[0])
			chainsMap[chain] = lines[lineNum]
		}
	}
	return chainsMap
}