k3s/pkg/proxy/iptables/proxier.go

741 lines
27 KiB
Go
Raw Normal View History

2015-08-11 02:47:13 +00:00
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package iptables
/*
NOTE: this needs to be tested in e2e since it uses iptables for everything.
*/
import (
"bytes"
"crypto/sha256"
"encoding/base32"
"fmt"
2015-08-15 05:15:12 +00:00
"io/ioutil"
2015-08-11 02:47:13 +00:00
"net"
2015-08-15 05:15:12 +00:00
"path"
2015-08-11 02:47:13 +00:00
"reflect"
"strconv"
"strings"
"sync"
"time"
"github.com/coreos/go-semver/semver"
"github.com/davecgh/go-spew/spew"
2015-08-11 02:47:13 +00:00
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/proxy"
"k8s.io/kubernetes/pkg/types"
utilexec "k8s.io/kubernetes/pkg/util/exec"
utiliptables "k8s.io/kubernetes/pkg/util/iptables"
"k8s.io/kubernetes/pkg/util/slice"
)
// NOTE: IPTABLES_MIN_VERSION is the minimum version of iptables for which we will use the Proxier
// from this package instead of the userspace Proxier.
// This is will not be enough, as the version number is somewhat unreliable,
// features are backported in various distros and this could get pretty hairy.
// However iptables-1.4.0 was released 2007-Dec-22 and appears to have every feature we use,
// so this seems prefectly reasonable for now.
const IPTABLES_MIN_VERSION string = "1.4.0"
2015-08-11 02:47:13 +00:00
// the services chain
const iptablesServicesChain utiliptables.Chain = "KUBE-SERVICES"
// the nodeports chain
const iptablesNodePortsChain utiliptables.Chain = "KUBE-NODEPORTS"
// the mark we apply to traffic needing SNAT
const iptablesMasqueradeMark = "0x4d415351"
2015-08-11 02:47:13 +00:00
2015-08-15 05:15:12 +00:00
// ShouldUseIptablesProxier returns true if we should use the iptables Proxier
// instead of the "classic" userspace Proxier. This is determined by checking
// the iptables version and for the existence of kernel features. It may return
// an error if it fails to get the itpables version without error, in which
// case it will also return false.
2015-08-11 02:47:13 +00:00
func ShouldUseIptablesProxier() (bool, error) {
exec := utilexec.New()
minVersion, err := semver.NewVersion(IPTABLES_MIN_VERSION)
if err != nil {
return false, err
}
// returns "vX.X.X", err
versionString, err := utiliptables.GetIptablesVersionString(exec)
if err != nil {
return false, err
}
// make a semver of the part after the v in "vX.X.X"
version, err := semver.NewVersion(versionString[1:])
if err != nil {
return false, err
}
2015-08-15 05:15:12 +00:00
if version.LessThan(*minVersion) {
return false, nil
}
// Check for the required sysctls. We don't care about the value, just
// that it exists. If this Proxier is chosen, we'll iniialize it as we
// need.
_, err = getSysctl(sysctlRouteLocalnet)
if err != nil {
return false, err
}
return true, nil
}
const sysctlBase = "/proc/sys"
const sysctlRouteLocalnet = "net/ipv4/conf/all/route_localnet"
func getSysctl(sysctl string) (int, error) {
data, err := ioutil.ReadFile(path.Join(sysctlBase, sysctl))
if err != nil {
return -1, err
}
val, err := strconv.Atoi(strings.Trim(string(data), " \n"))
if err != nil {
return -1, err
}
return val, nil
}
func setSysctl(sysctl string, newVal int) error {
return ioutil.WriteFile(path.Join(sysctlBase, sysctl), []byte(strconv.Itoa(newVal)), 0640)
2015-08-11 02:47:13 +00:00
}
// internal struct for string service information
type serviceInfo struct {
clusterIP net.IP
port int
protocol api.Protocol
nodePort int
2015-08-11 02:47:13 +00:00
loadBalancerStatus api.LoadBalancerStatus
sessionAffinityType api.ServiceAffinity
stickyMaxAgeSeconds int
2015-08-11 02:47:13 +00:00
endpoints []string
// Deprecated, but required for back-compat (including e2e)
deprecatedPublicIPs []string
}
// returns a new serviceInfo struct
func newServiceInfo(service proxy.ServicePortName) *serviceInfo {
return &serviceInfo{
sessionAffinityType: api.ServiceAffinityNone, // default
stickyMaxAgeSeconds: 180, // TODO: paramaterize this in the API.
2015-08-11 02:47:13 +00:00
}
}
// Proxier is an iptables based proxy for connections between a localhost:lport
// and services that provide the actual backends.
type Proxier struct {
mu sync.Mutex // protects serviceMap
serviceMap map[proxy.ServicePortName]*serviceInfo
syncPeriod time.Duration
2015-08-11 02:47:13 +00:00
iptables utiliptables.Interface
haveReceivedServiceUpdate bool // true once we've seen an OnServiceUpdate event
haveReceivedEndpointsUpdate bool // true once we've seen an OnEndpointsUpdate event
}
// Proxier implements ProxyProvider
var _ proxy.ProxyProvider = &Proxier{}
// NewProxier returns a new Proxier given an iptables Interface instance.
// Because of the iptables logic, it is assumed that there is only a single Proxier active on a machine.
// An error will be returned if iptables fails to update or acquire the initial lock.
// Once a proxier is created, it will keep iptables up to date in the background and
// will not terminate if a particular iptables call fails.
func NewProxier(ipt utiliptables.Interface, syncPeriod time.Duration) (*Proxier, error) {
2015-08-11 02:47:13 +00:00
glog.V(2).Info("Tearing down userspace rules. Errors here are acceptable.")
// remove iptables rules/chains from the userspace Proxier
tearDownUserspaceIptables(ipt)
2015-08-15 05:15:12 +00:00
// Set the route_localnet sysctl we need for
if err := setSysctl(sysctlRouteLocalnet, 1); err != nil {
return nil, fmt.Errorf("can't set sysctl route_localnet: %v", err)
}
2015-08-11 02:47:13 +00:00
return &Proxier{
serviceMap: make(map[proxy.ServicePortName]*serviceInfo),
syncPeriod: syncPeriod,
2015-08-11 02:47:13 +00:00
iptables: ipt,
}, nil
}
// Chains from the userspace proxy
// TODO: Remove these Chains and tearDownUserspaceIptables once the userspace Proxier has been removed.
var iptablesContainerPortalChain utiliptables.Chain = "KUBE-PORTALS-CONTAINER"
var iptablesHostPortalChain utiliptables.Chain = "KUBE-PORTALS-HOST"
var iptablesContainerNodePortChain utiliptables.Chain = "KUBE-NODEPORT-CONTAINER"
var iptablesHostNodePortChain utiliptables.Chain = "KUBE-NODEPORT-HOST"
// tearDownUserspaceIptables removes all iptables rules and chains created by the userspace Proxier
func tearDownUserspaceIptables(ipt utiliptables.Interface) {
// NOTE: Warning, this needs to be kept in sync with the userspace Proxier,
// we want to ensure we remove all of the iptables rules it creates.
// Currently they are all in iptablesInit()
// Delete Rules first, then Flush and Delete Chains
args := []string{"-m", "comment", "--comment", "handle ClusterIPs; NOTE: this must be before the NodePort rules"}
if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainOutput, append(args, "-j", string(iptablesHostPortalChain))...); err != nil {
glog.Errorf("Error removing userspace rule: %v", err)
}
if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPrerouting, append(args, "-j", string(iptablesContainerPortalChain))...); err != nil {
glog.Errorf("Error removing userspace rule: %v", err)
}
args = []string{"-m", "addrtype", "--dst-type", "LOCAL"}
args = append(args, "-m", "comment", "--comment", "handle service NodePorts; NOTE: this must be the last rule in the chain")
if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainOutput, append(args, "-j", string(iptablesHostNodePortChain))...); err != nil {
glog.Errorf("Error removing userspace rule: %v", err)
}
if err := ipt.DeleteRule(utiliptables.TableNAT, utiliptables.ChainPrerouting, append(args, "-j", string(iptablesContainerNodePortChain))...); err != nil {
glog.Errorf("Error removing userspace rule: %v", err)
}
// flush and delete chains.
chains := []utiliptables.Chain{iptablesContainerPortalChain, iptablesHostPortalChain, iptablesHostNodePortChain, iptablesContainerNodePortChain}
for _, c := range chains {
// flush chain, then if sucessful delete, delete will fail if flush fails.
if err := ipt.FlushChain(utiliptables.TableNAT, c); err != nil {
glog.Errorf("Error flushing userspace chain: %v", err)
} else {
if err = ipt.DeleteChain(utiliptables.TableNAT, c); err != nil {
glog.Errorf("Error flushing userspace chain: %v", err)
}
}
}
}
func (proxier *Proxier) sameConfig(info *serviceInfo, service *api.Service, port *api.ServicePort) bool {
if info.protocol != port.Protocol || info.port != port.Port || info.nodePort != port.NodePort {
2015-08-11 02:47:13 +00:00
return false
}
if !info.clusterIP.Equal(net.ParseIP(service.Spec.ClusterIP)) {
2015-08-11 02:47:13 +00:00
return false
}
if !ipsEqual(info.deprecatedPublicIPs, service.Spec.DeprecatedPublicIPs) {
return false
}
if !api.LoadBalancerStatusEqual(&info.loadBalancerStatus, &service.Status.LoadBalancer) {
return false
}
if info.sessionAffinityType != service.Spec.SessionAffinity {
return false
}
return true
}
func ipsEqual(lhs, rhs []string) bool {
if len(lhs) != len(rhs) {
return false
}
for i := range lhs {
if lhs[i] != rhs[i] {
return false
}
}
return true
}
// SyncLoop runs periodic work. This is expected to run as a goroutine or as the main loop of the app. It does not return.
func (proxier *Proxier) SyncLoop() {
t := time.NewTicker(proxier.syncPeriod)
2015-08-11 02:47:13 +00:00
defer t.Stop()
for {
<-t.C
glog.V(6).Infof("Periodic sync")
func() {
proxier.mu.Lock()
defer proxier.mu.Unlock()
if err := proxier.syncProxyRules(); err != nil {
glog.Errorf("Failed to sync iptables rules: %v", err)
}
}()
}
}
// OnServiceUpdate tracks the active set of service proxies.
// They will be synchronized using syncProxyRules()
func (proxier *Proxier) OnServiceUpdate(allServices []api.Service) {
proxier.mu.Lock()
defer proxier.mu.Unlock()
proxier.haveReceivedServiceUpdate = true
activeServices := make(map[proxy.ServicePortName]bool) // use a map as a set
for i := range allServices {
service := &allServices[i]
// if ClusterIP is "None" or empty, skip proxying
if !api.IsServiceIPSet(service) {
glog.V(3).Infof("Skipping service %s due to clusterIP = %q", types.NamespacedName{Namespace: service.Namespace, Name: service.Name}, service.Spec.ClusterIP)
2015-08-11 02:47:13 +00:00
continue
}
for i := range service.Spec.Ports {
servicePort := &service.Spec.Ports[i]
2015-08-13 15:26:43 +00:00
serviceName := proxy.ServicePortName{NamespacedName: types.NamespacedName{Namespace: service.Namespace, Name: service.Name}, Port: servicePort.Name}
2015-08-11 02:47:13 +00:00
activeServices[serviceName] = true
info, exists := proxier.serviceMap[serviceName]
if exists && proxier.sameConfig(info, service, servicePort) {
// Nothing changed.
continue
}
if exists {
//Something changed.
glog.V(3).Infof("Something changed for service %q: removing it", serviceName)
2015-08-11 02:47:13 +00:00
delete(proxier.serviceMap, serviceName)
}
serviceIP := net.ParseIP(service.Spec.ClusterIP)
glog.V(1).Infof("Adding new service %q at %s:%d/%s", serviceName, serviceIP, servicePort.Port, servicePort.Protocol)
info = newServiceInfo(serviceName)
info.clusterIP = serviceIP
info.port = servicePort.Port
info.protocol = servicePort.Protocol
info.nodePort = servicePort.NodePort
2015-08-11 02:47:13 +00:00
info.deprecatedPublicIPs = service.Spec.DeprecatedPublicIPs
// Deep-copy in case the service instance changes
info.loadBalancerStatus = *api.LoadBalancerStatusDeepCopy(&service.Status.LoadBalancer)
info.sessionAffinityType = service.Spec.SessionAffinity
proxier.serviceMap[serviceName] = info
glog.V(4).Infof("added serviceInfo(%s): %s", serviceName, spew.Sdump(info))
2015-08-11 02:47:13 +00:00
}
}
for name, info := range proxier.serviceMap {
// Check for servicePorts that were not in this update and have no endpoints.
// This helps prevent unnecessarily removing and adding services.
if !activeServices[name] && info.endpoints == nil {
glog.V(1).Infof("Removing service %q", name)
delete(proxier.serviceMap, name)
}
}
if err := proxier.syncProxyRules(); err != nil {
glog.Errorf("Failed to sync iptables rules: %v", err)
}
}
// OnEndpointsUpdate takes in a slice of updated endpoints.
func (proxier *Proxier) OnEndpointsUpdate(allEndpoints []api.Endpoints) {
proxier.mu.Lock()
defer proxier.mu.Unlock()
proxier.haveReceivedEndpointsUpdate = true
registeredEndpoints := make(map[proxy.ServicePortName]bool) // use a map as a set
// Update endpoints for services.
for i := range allEndpoints {
svcEndpoints := &allEndpoints[i]
// We need to build a map of portname -> all ip:ports for that
// portname. Explode Endpoints.Subsets[*] into this structure.
portsToEndpoints := map[string][]hostPortPair{}
for i := range svcEndpoints.Subsets {
ss := &svcEndpoints.Subsets[i]
for i := range ss.Ports {
port := &ss.Ports[i]
for i := range ss.Addresses {
addr := &ss.Addresses[i]
portsToEndpoints[port.Name] = append(portsToEndpoints[port.Name], hostPortPair{addr.IP, port.Port})
}
}
}
for portname := range portsToEndpoints {
2015-08-13 15:26:43 +00:00
svcPort := proxy.ServicePortName{NamespacedName: types.NamespacedName{Namespace: svcEndpoints.Namespace, Name: svcEndpoints.Name}, Port: portname}
2015-08-11 02:47:13 +00:00
state, exists := proxier.serviceMap[svcPort]
if !exists || state == nil {
state = newServiceInfo(svcPort)
proxier.serviceMap[svcPort] = state
}
curEndpoints := []string{}
if state != nil {
curEndpoints = state.endpoints
}
newEndpoints := flattenValidEndpoints(portsToEndpoints[portname])
if len(curEndpoints) != len(newEndpoints) || !slicesEquiv(slice.CopyStrings(curEndpoints), newEndpoints) {
glog.V(1).Infof("Setting endpoints for %s to %+v", svcPort, newEndpoints)
state.endpoints = newEndpoints
}
registeredEndpoints[svcPort] = true
}
}
// Remove endpoints missing from the update.
for service, info := range proxier.serviceMap {
// if missing from update and not already set by previous endpoints event
if _, exists := registeredEndpoints[service]; !exists && info.endpoints != nil {
glog.V(2).Infof("Removing endpoints for %s", service)
// Set the endpoints to nil, we will check for this in OnServiceUpdate so that we
// only remove ServicePorts that have no endpoints and were not in the service update,
// that way we only remove ServicePorts that were not in both.
proxier.serviceMap[service].endpoints = nil
}
}
if err := proxier.syncProxyRules(); err != nil {
glog.Errorf("Failed to sync iptables rules: %v", err)
}
2015-08-11 02:47:13 +00:00
}
// used in OnEndpointsUpdate
type hostPortPair struct {
host string
port int
}
func isValidEndpoint(hpp *hostPortPair) bool {
return hpp.host != "" && hpp.port > 0
}
// Tests whether two slices are equivalent. This sorts both slices in-place.
func slicesEquiv(lhs, rhs []string) bool {
if len(lhs) != len(rhs) {
return false
}
if reflect.DeepEqual(slice.SortStrings(lhs), slice.SortStrings(rhs)) {
return true
}
return false
}
func flattenValidEndpoints(endpoints []hostPortPair) []string {
// Convert Endpoint objects into strings for easier use later.
var result []string
for i := range endpoints {
hpp := &endpoints[i]
if isValidEndpoint(hpp) {
result = append(result, net.JoinHostPort(hpp.host, strconv.Itoa(hpp.port)))
}
}
return result
}
// servicePortToServiceChain takes the ServicePortName for a service and
// returns the associated iptables chain. This is computed by hashing (sha256)
// then encoding to base32 and truncating with the prefix "KUBE-SVC-". We do
// this because Iptables Chain Names must be <= 28 chars long, and the longer
// they are the harder they are to read.
func servicePortToServiceChain(s proxy.ServicePortName, protocol string) utiliptables.Chain {
hash := sha256.Sum256([]byte(s.String() + protocol))
2015-08-11 02:47:13 +00:00
encoded := base32.StdEncoding.EncodeToString(hash[:])
return utiliptables.Chain("KUBE-SVC-" + encoded[:16])
2015-08-11 02:47:13 +00:00
}
// This is the same as servicePortToServiceChain but with the endpoint
// included.
func servicePortAndEndpointToServiceChain(s proxy.ServicePortName, protocol string, endpoint string) utiliptables.Chain {
hash := sha256.Sum256([]byte(s.String() + protocol + endpoint))
2015-08-11 02:47:13 +00:00
encoded := base32.StdEncoding.EncodeToString(hash[:])
return utiliptables.Chain("KUBE-SEP-" + encoded[:16])
2015-08-11 02:47:13 +00:00
}
// This is where all of the iptables-save/restore calls happen.
// The only other iptables rules are those that are setup in iptablesInit()
// assumes proxier.mu is held
func (proxier *Proxier) syncProxyRules() error {
// don't sync rules till we've received services and endpoints
if !proxier.haveReceivedEndpointsUpdate || !proxier.haveReceivedServiceUpdate {
glog.V(2).Info("Not syncing iptables until Services and Endpoints have been received from master")
2015-08-11 02:47:13 +00:00
return nil
}
glog.V(3).Infof("Syncing iptables rules")
2015-08-11 02:47:13 +00:00
// Ensure main chains and rules are installed.
inputChains := []utiliptables.Chain{utiliptables.ChainOutput, utiliptables.ChainPrerouting}
// Link the services chain.
for _, chain := range inputChains {
if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesServicesChain); err != nil {
return err
}
comment := "kubernetes service portals; must be before nodeports"
args := []string{"-m", "comment", "--comment", comment, "-j", string(iptablesServicesChain)}
if _, err := proxier.iptables.EnsureRule(utiliptables.Prepend, utiliptables.TableNAT, chain, args...); err != nil {
return err
}
2015-08-11 02:47:13 +00:00
}
// Link the nodeports chain.
for _, chain := range inputChains {
if _, err := proxier.iptables.EnsureChain(utiliptables.TableNAT, iptablesNodePortsChain); err != nil {
return err
}
comment := "kubernetes service nodeports; must be after portals"
args := []string{"-m", "comment", "--comment", comment, "-m", "addrtype", "--dst-type", "LOCAL", "-j", string(iptablesNodePortsChain)}
if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, chain, args...); err != nil {
return err
}
2015-08-11 02:47:13 +00:00
}
// Link the output rules.
{
comment := "kubernetes service traffic requiring SNAT"
args := []string{"-m", "comment", "--comment", comment, "-m", "mark", "--mark", iptablesMasqueradeMark, "-j", "MASQUERADE"}
if _, err := proxier.iptables.EnsureRule(utiliptables.Append, utiliptables.TableNAT, utiliptables.ChainPostrouting, args...); err != nil {
return err
}
2015-08-11 02:47:13 +00:00
}
// Get iptables-save output so we can check for existing chains and rules.
// This will be a map of chain name to chain with rules as stored in iptables-save/iptables-restore
existingChains := make(map[utiliptables.Chain]string)
iptablesSaveRaw, err := proxier.iptables.Save(utiliptables.TableNAT)
if err != nil { // if we failed to get any rules
glog.Errorf("Failed to execute iptable-save, syncing all rules. %s", err.Error())
} else { // otherwise parse the output
existingChains = getChainLines(utiliptables.TableNAT, iptablesSaveRaw)
}
chainsLines := bytes.NewBuffer(nil)
rulesLines := bytes.NewBuffer(nil)
2015-08-11 02:47:13 +00:00
// Write table header.
writeLine(chainsLines, "*nat")
2015-08-11 02:47:13 +00:00
// Make sure we keep stats for the top-level chains, if they existed
// (which they should have because we created them above).
2015-08-11 02:47:13 +00:00
if chain, ok := existingChains[iptablesServicesChain]; ok {
writeLine(chainsLines, chain)
2015-08-11 02:47:13 +00:00
} else {
writeLine(chainsLines, makeChainLine(iptablesServicesChain))
}
if chain, ok := existingChains[iptablesNodePortsChain]; ok {
writeLine(chainsLines, chain)
} else {
writeLine(chainsLines, makeChainLine(iptablesNodePortsChain))
2015-08-11 02:47:13 +00:00
}
// Accumulate chains to keep.
activeChains := make(map[utiliptables.Chain]bool) // use a map as a set
2015-08-11 02:47:13 +00:00
// Build rules for each service.
2015-08-11 02:47:13 +00:00
for name, info := range proxier.serviceMap {
protocol := strings.ToLower(string(info.protocol))
// Create the per-service chain, retaining counters if possible.
svcChain := servicePortToServiceChain(name, protocol)
2015-08-11 02:47:13 +00:00
if chain, ok := existingChains[svcChain]; ok {
writeLine(chainsLines, chain)
2015-08-11 02:47:13 +00:00
} else {
writeLine(chainsLines, makeChainLine(svcChain))
2015-08-11 02:47:13 +00:00
}
activeChains[svcChain] = true
// Capture the clusterIP.
writeLine(rulesLines,
"-A", string(iptablesServicesChain),
"-m", "comment", "--comment", fmt.Sprintf("\"%s cluster IP\"", name.String()),
"-m", protocol, "-p", protocol,
"-d", fmt.Sprintf("%s/32", info.clusterIP.String()),
"--dport", fmt.Sprintf("%d", info.port),
"-j", string(svcChain))
// Capture externalIPs.
for _, externalIP := range info.deprecatedPublicIPs {
args := []string{
"-A", string(iptablesServicesChain),
"-m", "comment", "--comment", fmt.Sprintf("\"%s external IP\"", name.String()),
"-m", protocol, "-p", protocol,
"-d", fmt.Sprintf("%s/32", externalIP),
"--dport", fmt.Sprintf("%d", info.port),
}
// We have to SNAT packets from external IPs.
writeLine(rulesLines, append(args,
"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
writeLine(rulesLines, append(args,
"-j", string(svcChain))...)
2015-08-11 02:47:13 +00:00
}
// Capture load-balancer ingress.
for _, ingress := range info.loadBalancerStatus.Ingress {
if ingress.IP != "" {
args := []string{
"-A", string(iptablesServicesChain),
"-m", "comment", "--comment", fmt.Sprintf("\"%s loadbalancer IP\"", name.String()),
"-m", protocol, "-p", protocol,
"-d", fmt.Sprintf("%s/32", ingress.IP),
"--dport", fmt.Sprintf("%d", info.port),
}
// We have to SNAT packets from external IPs.
writeLine(rulesLines, append(args,
"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
writeLine(rulesLines, append(args,
"-j", string(svcChain))...)
2015-08-11 02:47:13 +00:00
}
}
2015-08-11 02:47:13 +00:00
// Capture nodeports. If we had more than 2 rules it might be
// worthwhile to make a new per-service chain for nodeport rules, but
// with just 2 rules it ends up being a waste and a cognitive burden.
if info.nodePort != 0 {
// Nodeports need SNAT.
writeLine(rulesLines,
"-A", string(iptablesNodePortsChain),
"-m", "comment", "--comment", name.String(),
"-m", protocol, "-p", protocol,
"--dport", fmt.Sprintf("%d", info.nodePort),
"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))
// Jump to the service chain.
writeLine(rulesLines,
"-A", string(iptablesNodePortsChain),
"-m", "comment", "--comment", name.String(),
"-m", protocol, "-p", protocol,
"--dport", fmt.Sprintf("%d", info.nodePort),
"-j", string(svcChain))
2015-08-11 02:47:13 +00:00
}
// Generate the per-endpoint chains. We do this in multiple passes so we
// can group rules together.
endpoints := make([]string, 0)
endpointChains := make([]utiliptables.Chain, 0)
for _, ep := range info.endpoints {
endpoints = append(endpoints, ep)
endpointChain := servicePortAndEndpointToServiceChain(name, protocol, ep)
endpointChains = append(endpointChains, endpointChain)
// Create the endpoint chain, retaining counters if possible.
if chain, ok := existingChains[utiliptables.Chain(endpointChain)]; ok {
writeLine(chainsLines, chain)
2015-08-11 02:47:13 +00:00
} else {
writeLine(chainsLines, makeChainLine(endpointChain))
2015-08-11 02:47:13 +00:00
}
activeChains[endpointChain] = true
2015-08-11 02:47:13 +00:00
}
// First write session affinity rules, if applicable.
if info.sessionAffinityType == api.ServiceAffinityClientIP {
for _, endpointChain := range endpointChains {
writeLine(rulesLines,
"-A", string(svcChain),
"-m", "comment", "--comment", name.String(),
"-m", "recent", "--name", string(endpointChain),
"--rcheck", "--seconds", fmt.Sprintf("%d", info.stickyMaxAgeSeconds), "--reap",
"-j", string(endpointChain))
}
2015-08-11 02:47:13 +00:00
}
// Now write loadbalancing & DNAT rules.
n := len(endpointChains)
for i, endpointChain := range endpointChains {
// Balancing rules in the per-service chain.
args := []string{
"-A", string(svcChain),
"-m", "comment", "--comment", name.String(),
}
if i < (n - 1) {
// Each rule is a probabilistic match.
args = append(args,
"-m", "statistic",
"--mode", "random",
"--probability", fmt.Sprintf("%f", 1.0/float64(n-i)))
}
// The final (or only if n == 1) rule is a guaranteed match.
args = append(args, "-j", string(endpointChain))
writeLine(rulesLines, args...)
// Rules in the per-endpoint chain.
args = []string{
"-A", string(endpointChain),
"-m", "comment", "--comment", name.String(),
2015-08-11 02:47:13 +00:00
}
// Handle traffic that loops back to the originator with SNAT.
// Technically we only need to do this if the endpoint is on this
// host, but we don't have that information, so we just do this for
// all endpoints.
// TODO: if we grow logic to get this node's pod CIDR, we can use it.
writeLine(rulesLines, append(args,
"-s", fmt.Sprintf("%s/32", strings.Split(endpoints[i], ":")[0]),
"-j", "MARK", "--set-xmark", fmt.Sprintf("%s/0xffffffff", iptablesMasqueradeMark))...)
// Update client-affinity lists.
if info.sessionAffinityType == api.ServiceAffinityClientIP {
args = append(args, "-m", "recent", "--name", string(endpointChain), "--set")
}
// DNAT to final destination.
args = append(args,
"-m", protocol, "-p", protocol,
"-j", "DNAT", "--to-destination", endpoints[i])
writeLine(rulesLines, args...)
2015-08-11 02:47:13 +00:00
}
}
// Delete chains no longer in use.
2015-08-11 02:47:13 +00:00
for chain := range existingChains {
if !activeChains[chain] {
chainString := string(chain)
if !strings.HasPrefix(chainString, "KUBE-SVC-") && !strings.HasPrefix(chainString, "KUBE-SEP-") {
// Ignore chains that aren't ours.
2015-08-11 02:47:13 +00:00
continue
}
// We must (as per iptables) write a chain-line for it, which has
// the nice effect of flushing the chain. Then we can remove the
// chain.
writeLine(chainsLines, existingChains[chain])
writeLine(rulesLines, "-X", chainString)
2015-08-11 02:47:13 +00:00
}
}
// Write the end-of-table marker.
writeLine(rulesLines, "COMMIT")
2015-08-11 02:47:13 +00:00
// Sync rules.
// NOTE: NoFlushTables is used so we don't flush non-kubernetes chains in the table.
lines := append(chainsLines.Bytes(), rulesLines.Bytes()...)
2015-08-11 02:47:13 +00:00
glog.V(3).Infof("Syncing rules: %s", lines)
return proxier.iptables.Restore(utiliptables.TableNAT, lines, utiliptables.NoFlushTables, utiliptables.RestoreCounters)
}
// Join all words with spaces, terminate with newline and write to buf.
func writeLine(buf *bytes.Buffer, words ...string) {
buf.WriteString(strings.Join(words, " ") + "\n")
2015-08-11 02:47:13 +00:00
}
// return an iptables-save/restore formatted chain line given a Chain
func makeChainLine(chain utiliptables.Chain) string {
return fmt.Sprintf(":%s - [0:0]", chain)
2015-08-11 02:47:13 +00:00
}
// getChainLines parses a table's iptables-save data to find chains in the table.
// It returns a map of iptables.Chain to string where the string is the chain line from the save (with counters etc).
func getChainLines(table utiliptables.Table, save []byte) map[utiliptables.Chain]string {
// get lines
lines := strings.Split(string(save), "\n")
chainsMap := make(map[utiliptables.Chain]string)
tablePrefix := "*" + string(table)
lineNum := 0
// find beginning of table
for ; lineNum < len(lines); lineNum++ {
if strings.HasPrefix(strings.TrimSpace(lines[lineNum]), tablePrefix) {
lineNum++
break
}
}
// parse table lines
for ; lineNum < len(lines); lineNum++ {
line := strings.TrimSpace(lines[lineNum])
if strings.HasPrefix(line, "COMMIT") || strings.HasPrefix(line, "*") {
break
} else if len(line) == 0 || strings.HasPrefix(line, "#") {
continue
} else if strings.HasPrefix(line, ":") && len(line) > 1 {
chain := utiliptables.Chain(strings.SplitN(line[1:], " ", 2)[0])
chainsMap[chain] = lines[lineNum]
}
}
return chainsMap
}