2015-12-16 23:31:10 +00:00
// +build linux

/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kubenet
import (
"fmt"
2016-12-29 07:39:16 +00:00
"io/ioutil"
2015-12-16 23:31:10 +00:00
"net"
2016-10-25 21:40:17 +00:00
"path/filepath"
2016-08-22 08:28:11 +00:00
"strings"
2016-04-26 20:56:46 +00:00
"sync"
2016-05-09 21:54:15 +00:00
"time"
2015-12-16 23:31:10 +00:00
2016-07-12 20:43:19 +00:00
"github.com/containernetworking/cni/libcni"
cnitypes "github.com/containernetworking/cni/pkg/types"
2017-04-10 11:18:32 +00:00
cnitypes020 "github.com/containernetworking/cni/pkg/types/020"
2015-12-16 23:31:10 +00:00
"github.com/golang/glog"
2016-05-11 20:25:14 +00:00
"github.com/vishvananda/netlink"
2017-07-20 11:44:19 +00:00
"golang.org/x/sys/unix"
2017-06-22 18:24:23 +00:00
"k8s.io/api/core/v1"
2017-01-11 14:09:48 +00:00
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilnet "k8s.io/apimachinery/pkg/util/net"
utilsets "k8s.io/apimachinery/pkg/util/sets"
2017-07-13 23:15:05 +00:00
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
2015-12-16 23:31:10 +00:00
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/network"
2016-12-29 07:39:16 +00:00
"k8s.io/kubernetes/pkg/kubelet/network/hostport"
2015-12-16 23:31:10 +00:00
"k8s.io/kubernetes/pkg/util/bandwidth"
2016-05-11 20:25:14 +00:00
utildbus "k8s.io/kubernetes/pkg/util/dbus"
2016-08-19 22:15:02 +00:00
utilebtables "k8s.io/kubernetes/pkg/util/ebtables"
2016-05-11 20:25:14 +00:00
utiliptables "k8s.io/kubernetes/pkg/util/iptables"
2016-04-11 19:06:26 +00:00
utilsysctl "k8s.io/kubernetes/pkg/util/sysctl"
2017-07-19 05:58:53 +00:00
utilexec "k8s.io/utils/exec"
2015-12-16 23:31:10 +00:00
)
const (
2016-12-29 07:39:16 +00:00
BridgeName = "cbr0"
DefaultCNIDir = "/opt/cni/bin"
2016-04-11 19:06:26 +00:00
2016-08-27 03:06:15 +00:00
sysctlBridgeCallIPTables = "net/bridge/bridge-nf-call-iptables"
2016-08-10 16:09:43 +00:00
// fallbackMTU is used if an MTU is not specified, and we cannot determine the MTU
fallbackMTU = 1460
2016-08-19 17:18:00 +00:00
2016-08-19 22:15:02 +00:00
// ebtables Chain to store dedup rules
dedupChain = utilebtables . Chain ( "KUBE-DEDUP" )
2016-10-25 21:40:17 +00:00
// defaultIPAMDir is the default location for the checkpoint files stored by host-local ipam
// https://github.com/containernetworking/cni/tree/master/plugins/ipam/host-local#backends
defaultIPAMDir = "/var/lib/cni/networks"
2015-12-16 23:31:10 +00:00
)
2018-01-23 20:03:51 +00:00
// CNI plugins required by kubenet in /opt/cni/bin or user-specified directory
2016-09-01 23:29:41 +00:00
var requiredCNIPlugins = [...]string{
	"bridge",     // "type" of the kubenet network config
	"host-local", // ipam "type" of the kubenet network config
	"loopback",   // "type" of the kubenet-loopback config
}
2015-12-16 23:31:10 +00:00
type kubenetNetworkPlugin struct {
2016-03-22 16:38:21 +00:00
network . NoopNetworkPlugin
2016-05-27 02:07:20 +00:00
host network . Host
netConfig * libcni . NetworkConfig
loConfig * libcni . NetworkConfig
cniConfig libcni . CNI
bandwidthShaper bandwidth . BandwidthShaper
mu sync . Mutex //Mutex for protecting podIPs map, netConfig, and shaper initialization
podIPs map [ kubecontainer . ContainerID ] string
2016-08-10 16:09:43 +00:00
mtu int
2016-05-27 02:07:20 +00:00
execer utilexec . Interface
nsenterPath string
2017-07-13 23:15:05 +00:00
hairpinMode kubeletconfig . HairpinMode
2017-02-03 21:26:42 +00:00
// kubenet can use either hostportSyncer and hostportManager to implement hostports
// Currently, if network host supports legacy features, hostportSyncer will be used,
// otherwise, hostportManager will be used.
2017-01-26 00:28:03 +00:00
hostportSyncer hostport . HostportSyncer
2017-02-03 21:26:42 +00:00
hostportManager hostport . HostPortManager
2016-05-27 02:07:20 +00:00
iptables utiliptables . Interface
2016-08-10 15:38:44 +00:00
sysctl utilsysctl . Interface
2016-08-19 22:15:02 +00:00
ebtables utilebtables . Interface
2018-01-23 20:03:51 +00:00
// binDirs is passed by kubelet cni-bin-dir parameter.
// kubenet will search for CNI binaries in DefaultCNIDir first, then continue to binDirs.
binDirs [ ] string
2016-06-09 17:32:28 +00:00
nonMasqueradeCIDR string
2016-08-19 22:15:02 +00:00
podCidr string
gateway net . IP
2015-12-16 23:31:10 +00:00
}
2018-01-23 20:03:51 +00:00
func NewPlugin ( networkPluginDirs [ ] string ) network . NetworkPlugin {
2016-05-11 20:25:14 +00:00
protocol := utiliptables . ProtocolIpv4
execer := utilexec . New ( )
dbus := utildbus . New ( )
2016-08-10 16:09:43 +00:00
sysctl := utilsysctl . New ( )
2016-05-11 20:25:14 +00:00
iptInterface := utiliptables . New ( execer , dbus , protocol )
2015-12-16 23:31:10 +00:00
return & kubenetNetworkPlugin {
2016-06-09 17:32:28 +00:00
podIPs : make ( map [ kubecontainer . ContainerID ] string ) ,
execer : utilexec . New ( ) ,
iptables : iptInterface ,
2016-08-10 16:09:43 +00:00
sysctl : sysctl ,
2018-01-23 20:03:51 +00:00
binDirs : append ( [ ] string { DefaultCNIDir } , networkPluginDirs ... ) ,
2017-07-05 21:50:38 +00:00
hostportSyncer : hostport . NewHostportSyncer ( iptInterface ) ,
hostportManager : hostport . NewHostportManager ( iptInterface ) ,
2016-06-09 17:32:28 +00:00
nonMasqueradeCIDR : "10.0.0.0/8" ,
2015-12-16 23:31:10 +00:00
}
}
2017-07-13 23:15:05 +00:00
func ( plugin * kubenetNetworkPlugin ) Init ( host network . Host , hairpinMode kubeletconfig . HairpinMode , nonMasqueradeCIDR string , mtu int ) error {
2015-12-16 23:31:10 +00:00
plugin . host = host
2016-03-31 22:20:04 +00:00
plugin . hairpinMode = hairpinMode
2016-06-22 19:26:11 +00:00
plugin . nonMasqueradeCIDR = nonMasqueradeCIDR
2018-01-23 20:03:51 +00:00
plugin . cniConfig = & libcni . CNIConfig { Path : plugin . binDirs }
2015-12-16 23:31:10 +00:00
2016-08-10 16:09:43 +00:00
if mtu == network . UseDefaultMTU {
if link , err := findMinMTU ( ) ; err == nil {
plugin . mtu = link . MTU
glog . V ( 5 ) . Infof ( "Using interface %s MTU %d as bridge MTU" , link . Name , link . MTU )
} else {
plugin . mtu = fallbackMTU
glog . Warningf ( "Failed to find default bridge MTU, using %d: %v" , fallbackMTU , err )
}
2015-12-16 23:31:10 +00:00
} else {
2016-08-10 16:09:43 +00:00
plugin . mtu = mtu
2015-12-16 23:31:10 +00:00
}
2016-04-11 19:06:26 +00:00
// Since this plugin uses a Linux bridge, set bridge-nf-call-iptables=1
// is necessary to ensure kube-proxy functions correctly.
//
// This will return an error on older kernel version (< 3.18) as the module
// was built-in, we simply ignore the error here. A better thing to do is
// to check the kernel version in the future.
2016-05-05 03:39:41 +00:00
plugin . execer . Command ( "modprobe" , "br-netfilter" ) . CombinedOutput ( )
2016-08-27 03:06:15 +00:00
err := plugin . sysctl . SetSysctl ( sysctlBridgeCallIPTables , 1 )
2016-05-09 21:19:41 +00:00
if err != nil {
2016-08-27 03:06:15 +00:00
glog . Warningf ( "can't set sysctl %s: %v" , sysctlBridgeCallIPTables , err )
2016-04-11 19:06:26 +00:00
}
2016-05-09 21:19:41 +00:00
plugin . loConfig , err = libcni . ConfFromBytes ( [ ] byte ( ` {
"cniVersion" : "0.1.0" ,
"name" : "kubenet-loopback" ,
"type" : "loopback"
} ` ) )
if err != nil {
return fmt . Errorf ( "Failed to generate loopback config: %v" , err )
}
2016-06-22 19:26:11 +00:00
plugin . nsenterPath , err = plugin . execer . LookPath ( "nsenter" )
if err != nil {
return fmt . Errorf ( "Failed to find nsenter binary: %v" , err )
}
2016-06-09 17:32:28 +00:00
// Need to SNAT outbound traffic from cluster
if err = plugin . ensureMasqRule ( ) ; err != nil {
return err
}
return nil
}
// TODO: move thic logic into cni bridge plugin and remove this from kubenet
func ( plugin * kubenetNetworkPlugin ) ensureMasqRule ( ) error {
2017-05-27 00:01:58 +00:00
if plugin . nonMasqueradeCIDR != "0.0.0.0/0" {
if _ , err := plugin . iptables . EnsureRule ( utiliptables . Append , utiliptables . TableNAT , utiliptables . ChainPostrouting ,
"-m" , "comment" , "--comment" , "kubenet: SNAT for outbound traffic from cluster" ,
"-m" , "addrtype" , "!" , "--dst-type" , "LOCAL" ,
"!" , "-d" , plugin . nonMasqueradeCIDR ,
"-j" , "MASQUERADE" ) ; err != nil {
return fmt . Errorf ( "Failed to ensure that %s chain %s jumps to MASQUERADE: %v" , utiliptables . TableNAT , utiliptables . ChainPostrouting , err )
}
2016-06-09 17:32:28 +00:00
}
2015-12-16 23:31:10 +00:00
return nil
}
func findMinMTU ( ) ( * net . Interface , error ) {
intfs , err := net . Interfaces ( )
if err != nil {
return nil , err
}
mtu := 999999
defIntfIndex := - 1
for i , intf := range intfs {
if ( ( intf . Flags & net . FlagUp ) != 0 ) && ( intf . Flags & ( net . FlagLoopback | net . FlagPointToPoint ) == 0 ) {
if intf . MTU < mtu {
mtu = intf . MTU
defIntfIndex = i
}
}
}
if mtu >= 999999 || mtu < 576 || defIntfIndex < 0 {
2016-03-23 00:26:50 +00:00
return nil , fmt . Errorf ( "no suitable interface: %v" , BridgeName )
2015-12-16 23:31:10 +00:00
}
return & intfs [ defIntfIndex ] , nil
}
// NET_CONFIG_TEMPLATE is the fmt template for the CNI bridge network config.
// Its verbs are filled, in order, with: the bridge name (%s), the MTU (%d),
// the interface name for "addIf" (%s), the hairpin-mode flag (%t), the ipam
// subnet (%s) and the ipam gateway address (%s).
const NET_CONFIG_TEMPLATE = `{
  "cniVersion": "0.1.0",
  "name": "kubenet",
  "type": "bridge",
  "bridge": "%s",
  "mtu": %d,
  "addIf": "%s",
  "isGateway": true,
  "ipMasq": false,
  "hairpinMode": %t,
  "ipam": {
    "type": "host-local",
    "subnet": "%s",
    "gateway": "%s",
    "routes": [
      { "dst": "0.0.0.0/0" }
    ]
  }
}`
func ( plugin * kubenetNetworkPlugin ) Event ( name string , details map [ string ] interface { } ) {
if name != network . NET_PLUGIN_EVENT_POD_CIDR_CHANGE {
return
}
2016-04-26 20:56:46 +00:00
plugin . mu . Lock ( )
defer plugin . mu . Unlock ( )
2015-12-16 23:31:10 +00:00
podCIDR , ok := details [ network . NET_PLUGIN_EVENT_POD_CIDR_CHANGE_DETAIL_CIDR ] . ( string )
if ! ok {
glog . Warningf ( "%s event didn't contain pod CIDR" , network . NET_PLUGIN_EVENT_POD_CIDR_CHANGE )
return
}
if plugin . netConfig != nil {
2016-09-20 21:44:09 +00:00
glog . Warningf ( "Ignoring subsequent pod CIDR update to %s" , podCIDR )
2015-12-16 23:31:10 +00:00
return
}
glog . V ( 5 ) . Infof ( "PodCIDR is set to %q" , podCIDR )
_ , cidr , err := net . ParseCIDR ( podCIDR )
if err == nil {
2017-07-13 23:15:05 +00:00
setHairpin := plugin . hairpinMode == kubeletconfig . HairpinVeth
2015-12-16 23:31:10 +00:00
// Set bridge address to first address in IPNet
2017-06-26 18:51:45 +00:00
cidr . IP [ len ( cidr . IP ) - 1 ] += 1
2015-12-16 23:31:10 +00:00
2016-08-10 16:09:43 +00:00
json := fmt . Sprintf ( NET_CONFIG_TEMPLATE , BridgeName , plugin . mtu , network . DefaultInterfaceName , setHairpin , podCIDR , cidr . IP . String ( ) )
2016-03-31 20:54:45 +00:00
glog . V ( 2 ) . Infof ( "CNI network config set to %v" , json )
2015-12-16 23:31:10 +00:00
plugin . netConfig , err = libcni . ConfFromBytes ( [ ] byte ( json ) )
if err == nil {
glog . V ( 5 ) . Infof ( "CNI network config:\n%s" , json )
// Ensure cbr0 has no conflicting addresses; CNI's 'bridge'
// plugin will bail out if the bridge has an unexpected one
2016-06-22 22:57:01 +00:00
plugin . clearBridgeAddressesExcept ( cidr )
2015-12-16 23:31:10 +00:00
}
2016-08-19 17:18:00 +00:00
plugin . podCidr = podCIDR
plugin . gateway = cidr . IP
2015-12-16 23:31:10 +00:00
}
if err != nil {
glog . Warningf ( "Failed to generate CNI network config: %v" , err )
}
}
2016-06-22 22:57:01 +00:00
func ( plugin * kubenetNetworkPlugin ) clearBridgeAddressesExcept ( keep * net . IPNet ) {
2015-12-16 23:31:10 +00:00
bridge , err := netlink . LinkByName ( BridgeName )
if err != nil {
return
}
2017-07-20 11:44:19 +00:00
addrs , err := netlink . AddrList ( bridge , unix . AF_INET )
2015-12-16 23:31:10 +00:00
if err != nil {
return
}
for _ , addr := range addrs {
2016-06-22 22:57:01 +00:00
if ! utilnet . IPNetEqual ( addr . IPNet , keep ) {
glog . V ( 2 ) . Infof ( "Removing old address %s from %s" , addr . IPNet . String ( ) , BridgeName )
2015-12-16 23:31:10 +00:00
netlink . AddrDel ( bridge , & addr )
}
}
}
// Name returns the plugin name, KubenetPluginName.
func (plugin *kubenetNetworkPlugin) Name() string {
	const pluginName = KubenetPluginName
	return pluginName
}
2016-04-01 17:00:05 +00:00
func ( plugin * kubenetNetworkPlugin ) Capabilities ( ) utilsets . Int {
2017-06-06 16:34:11 +00:00
return utilsets . NewInt ( )
2016-04-01 17:00:05 +00:00
}
2016-10-29 00:11:05 +00:00
// setup sets up networking through CNI using the given ns/name and sandbox ID.
// TODO: Don't pass the pod to this method, it only needs it for bandwidth
// shaping and hostport management.
2017-02-16 19:41:11 +00:00
func ( plugin * kubenetNetworkPlugin ) setup ( namespace string , name string , id kubecontainer . ContainerID , pod * v1 . Pod , annotations map [ string ] string ) error {
2017-11-07 15:37:20 +00:00
// Disable DAD so we skip the kernel delay on bringing up new interfaces.
if err := plugin . disableContainerDAD ( id ) ; err != nil {
glog . V ( 3 ) . Infof ( "Failed to disable DAD in container: %v" , err )
}
2016-05-09 21:19:41 +00:00
// Bring up container loopback interface
if _ , err := plugin . addContainerToNetwork ( plugin . loConfig , "lo" , namespace , name , id ) ; err != nil {
return err
2015-12-16 23:31:10 +00:00
}
2016-05-09 21:19:41 +00:00
// Hook container up with our bridge
2017-04-10 11:18:32 +00:00
resT , err := plugin . addContainerToNetwork ( plugin . netConfig , network . DefaultInterfaceName , namespace , name , id )
2015-12-16 23:31:10 +00:00
if err != nil {
2016-04-26 20:56:46 +00:00
return err
2015-12-16 23:31:10 +00:00
}
2017-04-10 11:18:32 +00:00
// Coerce the CNI result version
res , err := cnitypes020 . GetResult ( resT )
if err != nil {
return fmt . Errorf ( "unable to understand network config: %v" , err )
}
2016-05-27 23:25:14 +00:00
if res . IP4 == nil {
2016-05-09 21:19:41 +00:00
return fmt . Errorf ( "CNI plugin reported no IPv4 address for container %v." , id )
}
2016-05-27 23:25:14 +00:00
ip4 := res . IP4 . IP . IP . To4 ( )
if ip4 == nil {
return fmt . Errorf ( "CNI plugin reported an invalid IPv4 address for container %v: %+v." , id , res . IP4 )
}
2015-12-16 23:31:10 +00:00
2016-03-31 22:20:04 +00:00
// Put the container bridge into promiscuous mode to force it to accept hairpin packets.
// TODO: Remove this once the kernel bug (#20096) is fixed.
2017-07-13 23:15:05 +00:00
if plugin . hairpinMode == kubeletconfig . PromiscuousBridge {
2017-03-25 07:58:36 +00:00
link , err := netlink . LinkByName ( BridgeName )
if err != nil {
return fmt . Errorf ( "failed to lookup %q: %v" , BridgeName , err )
}
2017-12-11 02:39:57 +00:00
if link . Attrs ( ) . Promisc != 1 {
// promiscuous mode is not on, then turn it on.
err := netlink . SetPromiscOn ( link )
if err != nil {
return fmt . Errorf ( "Error setting promiscuous mode on %s: %v" , BridgeName , err )
}
}
2017-03-25 07:58:36 +00:00
2016-08-19 22:15:02 +00:00
// configure the ebtables rules to eliminate duplicate packets by best effort
2017-03-25 07:58:36 +00:00
plugin . syncEbtablesDedupRules ( link . Attrs ( ) . HardwareAddr )
2016-03-31 22:20:04 +00:00
}
2016-10-29 00:11:05 +00:00
plugin . podIPs [ id ] = ip4 . String ( )
2017-02-16 19:41:11 +00:00
// The first SetUpPod call creates the bridge; get a shaper for the sake of initialization
// TODO: replace with CNI traffic shaper plugin
shaper := plugin . shaper ( )
ingress , egress , err := bandwidth . ExtractPodBandwidthResources ( annotations )
if err != nil {
return fmt . Errorf ( "Error reading pod bandwidth annotations: %v" , err )
}
if egress != nil || ingress != nil {
if err := shaper . ReconcileCIDR ( fmt . Sprintf ( "%s/32" , ip4 . String ( ) ) , egress , ingress ) ; err != nil {
return fmt . Errorf ( "Failed to add pod to shaper: %v" , err )
}
}
2016-10-29 00:11:05 +00:00
// The host can choose to not support "legacy" features. The remote
// shim doesn't support it (#35457), but the kubelet does.
2017-02-03 21:26:42 +00:00
if plugin . host . SupportsLegacyFeatures ( ) {
2017-02-17 21:51:11 +00:00
// Open any hostport the pod's containers want
activePodPortMappings , err := plugin . getPodPortMappings ( )
2017-02-03 21:26:42 +00:00
if err != nil {
return err
}
2016-06-23 00:52:12 +00:00
2017-03-20 15:50:49 +00:00
newPodPortMapping := hostport . ConstructPodPortMapping ( pod , ip4 )
2017-02-17 21:51:11 +00:00
if err := plugin . hostportSyncer . OpenPodHostportsAndSync ( newPodPortMapping , BridgeName , activePodPortMappings ) ; err != nil {
2017-02-03 21:26:42 +00:00
return err
}
} else {
2017-02-16 19:41:11 +00:00
// TODO: replace with CNI port-forwarding plugin
2017-02-03 21:26:42 +00:00
portMappings , err := plugin . host . GetPodPortMappings ( id . ID )
if err != nil {
return err
}
if portMappings != nil && len ( portMappings ) > 0 {
if err := plugin . hostportManager . Add ( id . ID , & hostport . PodPortMapping {
Namespace : namespace ,
Name : name ,
PortMappings : portMappings ,
IP : ip4 ,
HostNetwork : false ,
} , BridgeName ) ; err != nil {
return err
}
}
2016-06-09 17:32:28 +00:00
}
2016-06-13 22:19:04 +00:00
return nil
2015-12-16 23:31:10 +00:00
}
2017-02-16 19:37:54 +00:00
func ( plugin * kubenetNetworkPlugin ) SetUpPod ( namespace string , name string , id kubecontainer . ContainerID , annotations map [ string ] string ) error {
2016-05-09 21:54:15 +00:00
plugin . mu . Lock ( )
defer plugin . mu . Unlock ( )
start := time . Now ( )
defer func ( ) {
2016-06-13 22:19:04 +00:00
glog . V ( 4 ) . Infof ( "SetUpPod took %v for %s/%s" , time . Since ( start ) , namespace , name )
2016-05-09 21:54:15 +00:00
} ( )
2016-10-29 00:11:05 +00:00
// TODO: Entire pod object only required for bw shaping and hostport.
2016-06-13 22:19:04 +00:00
pod , ok := plugin . host . GetPodByName ( namespace , name )
if ! ok {
return fmt . Errorf ( "pod %q cannot be found" , name )
2015-12-16 23:31:10 +00:00
}
2016-06-13 22:19:04 +00:00
if err := plugin . Status ( ) ; err != nil {
return fmt . Errorf ( "Kubenet cannot SetUpPod: %v" , err )
}
2017-02-16 19:41:11 +00:00
if err := plugin . setup ( namespace , name , id , pod , annotations ) ; err != nil {
2016-06-13 22:19:04 +00:00
// Make sure everything gets cleaned up on errors
podIP , _ := plugin . podIPs [ id ]
if err := plugin . teardown ( namespace , name , id , podIP ) ; err != nil {
// Not a hard error or warning
glog . V ( 4 ) . Infof ( "Failed to clean up %s/%s after SetUpPod failure: %v" , namespace , name , err )
}
2016-10-25 21:40:17 +00:00
2016-10-29 00:11:05 +00:00
// TODO(#34278): Figure out if we need IP GC through the cri.
// The cri should always send us teardown events for stale sandboxes,
// this obviates the need for GC in the common case, for kubenet.
if plugin . host . SupportsLegacyFeatures ( ) {
// TODO: Remove this hack once we've figured out how to retrieve the netns
// of an exited container. Currently, restarting docker will leak a bunch of
// ips. This will exhaust available ip space unless we cleanup old ips. At the
// same time we don't want to try GC'ing them periodically as that could lead
// to a performance regression in starting pods. So on each setup failure, try
// GC on the assumption that the kubelet is going to retry pod creation, and
// when it does, there will be ips.
plugin . ipamGarbageCollection ( )
}
2016-06-13 22:19:04 +00:00
return err
}
// Need to SNAT outbound traffic from cluster
if err := plugin . ensureMasqRule ( ) ; err != nil {
glog . Errorf ( "Failed to ensure MASQ rule: %v" , err )
}
return nil
}
// Tears down as much of a pod's network as it can even if errors occur. Returns
// an aggregate error composed of all errors encountered during the teardown.
func ( plugin * kubenetNetworkPlugin ) teardown ( namespace string , name string , id kubecontainer . ContainerID , podIP string ) error {
errList := [ ] error { }
if podIP != "" {
2016-05-27 01:47:22 +00:00
glog . V ( 5 ) . Infof ( "Removing pod IP %s from shaper" , podIP )
2015-12-16 23:31:10 +00:00
// shaper wants /32
2016-05-27 02:07:20 +00:00
if err := plugin . shaper ( ) . Reset ( fmt . Sprintf ( "%s/32" , podIP ) ) ; err != nil {
2016-05-27 02:42:56 +00:00
// Possible bandwidth shaping wasn't enabled for this pod anyways
glog . V ( 4 ) . Infof ( "Failed to remove pod IP %s from shaper: %v" , podIP , err )
2015-12-16 23:31:10 +00:00
}
2016-06-13 22:19:04 +00:00
delete ( plugin . podIPs , id )
2015-12-16 23:31:10 +00:00
}
2016-06-13 22:19:04 +00:00
2016-05-09 21:19:41 +00:00
if err := plugin . delContainerFromNetwork ( plugin . netConfig , network . DefaultInterfaceName , namespace , name , id ) ; err != nil {
2016-05-24 21:18:28 +00:00
// This is to prevent returning error when TearDownPod is called twice on the same pod. This helps to reduce event pollution.
2016-06-13 22:19:04 +00:00
if podIP != "" {
2016-05-24 21:18:28 +00:00
glog . Warningf ( "Failed to delete container from kubenet: %v" , err )
2016-06-13 22:19:04 +00:00
} else {
errList = append ( errList , err )
2016-05-24 21:18:28 +00:00
}
2015-12-16 23:31:10 +00:00
}
2016-10-29 00:11:05 +00:00
// The host can choose to not support "legacy" features. The remote
// shim doesn't support it (#35457), but the kubelet does.
2017-02-03 21:26:42 +00:00
if plugin . host . SupportsLegacyFeatures ( ) {
2017-02-17 21:51:11 +00:00
activePodPortMapping , err := plugin . getPodPortMappings ( )
2017-02-03 21:26:42 +00:00
if err == nil {
err = plugin . hostportSyncer . SyncHostports ( BridgeName , activePodPortMapping )
}
if err != nil {
errList = append ( errList , err )
}
} else {
portMappings , err := plugin . host . GetPodPortMappings ( id . ID )
if err != nil {
errList = append ( errList , err )
} else if portMappings != nil && len ( portMappings ) > 0 {
if err = plugin . hostportManager . Remove ( id . ID , & hostport . PodPortMapping {
Namespace : namespace ,
Name : name ,
PortMappings : portMappings ,
HostNetwork : false ,
} ) ; err != nil {
errList = append ( errList , err )
}
}
2016-06-13 22:19:04 +00:00
}
return utilerrors . NewAggregate ( errList )
}
func ( plugin * kubenetNetworkPlugin ) TearDownPod ( namespace string , name string , id kubecontainer . ContainerID ) error {
plugin . mu . Lock ( )
defer plugin . mu . Unlock ( )
start := time . Now ( )
defer func ( ) {
glog . V ( 4 ) . Infof ( "TearDownPod took %v for %s/%s" , time . Since ( start ) , namespace , name )
} ( )
if plugin . netConfig == nil {
return fmt . Errorf ( "Kubenet needs a PodCIDR to tear down pods" )
}
// no cached IP is Ok during teardown
podIP , _ := plugin . podIPs [ id ]
if err := plugin . teardown ( namespace , name , id , podIP ) ; err != nil {
return err
}
2016-06-09 17:32:28 +00:00
// Need to SNAT outbound traffic from cluster
if err := plugin . ensureMasqRule ( ) ; err != nil {
glog . Errorf ( "Failed to ensure MASQ rule: %v" , err )
}
2016-06-07 02:45:46 +00:00
2016-06-13 22:19:04 +00:00
return nil
2015-12-16 23:31:10 +00:00
}
// TODO: Use the addToNetwork function to obtain the IP of the Pod. That will assume idempotent ADD call to the plugin.
// Also fix the runtime's call to Status function to be done only in the case that the IP is lost, no need to do periodic calls
2016-04-26 23:10:07 +00:00
func ( plugin * kubenetNetworkPlugin ) GetPodNetworkStatus ( namespace string , name string , id kubecontainer . ContainerID ) ( * network . PodNetworkStatus , error ) {
2016-04-26 20:56:46 +00:00
plugin . mu . Lock ( )
defer plugin . mu . Unlock ( )
2016-05-05 03:39:41 +00:00
// Assuming the ip of pod does not change. Try to retrieve ip from kubenet map first.
2016-05-27 01:47:22 +00:00
if podIP , ok := plugin . podIPs [ id ] ; ok {
return & network . PodNetworkStatus { IP : net . ParseIP ( podIP ) } , nil
2016-05-05 03:39:41 +00:00
}
2016-05-03 00:49:02 +00:00
2016-10-29 00:01:06 +00:00
netnsPath , err := plugin . host . GetNetNS ( id . ID )
2016-05-05 03:39:41 +00:00
if err != nil {
return nil , fmt . Errorf ( "Kubenet failed to retrieve network namespace path: %v" , err )
}
2017-04-27 02:23:03 +00:00
if netnsPath == "" {
return nil , fmt . Errorf ( "Cannot find the network namespace, skipping pod network status for container %q" , id )
}
2016-06-22 19:26:11 +00:00
ip , err := network . GetPodIP ( plugin . execer , plugin . nsenterPath , netnsPath , network . DefaultInterfaceName )
2016-05-05 03:39:41 +00:00
if err != nil {
2016-06-21 21:58:30 +00:00
return nil , err
2016-05-05 03:39:41 +00:00
}
2016-06-21 21:58:30 +00:00
2016-05-27 01:47:22 +00:00
plugin . podIPs [ id ] = ip . String ( )
2015-12-16 23:31:10 +00:00
return & network . PodNetworkStatus { IP : ip } , nil
}
2016-04-26 23:10:07 +00:00
func ( plugin * kubenetNetworkPlugin ) Status ( ) error {
2016-04-22 22:23:03 +00:00
// Can't set up pods if we don't have a PodCIDR yet
if plugin . netConfig == nil {
return fmt . Errorf ( "Kubenet does not have netConfig. This is most likely due to lack of PodCIDR" )
}
2016-09-01 23:29:41 +00:00
2018-01-23 20:03:51 +00:00
if ! plugin . checkRequiredCNIPlugins ( ) {
2018-01-23 20:04:38 +00:00
return fmt . Errorf ( "could not locate kubenet required CNI plugins %v at %q" , requiredCNIPlugins , plugin . binDirs )
2016-09-01 23:29:41 +00:00
}
2016-04-22 22:23:03 +00:00
return nil
}
2018-01-23 20:03:51 +00:00
// checkRequiredCNIPlugins returns if all kubenet required cni plugins can be found at /opt/cni/bin or user specified NetworkPluginDir.
func ( plugin * kubenetNetworkPlugin ) checkRequiredCNIPlugins ( ) bool {
for _ , dir := range plugin . binDirs {
if plugin . checkRequiredCNIPluginsInOneDir ( dir ) {
return true
}
2016-09-01 23:29:41 +00:00
}
return false
}
2018-01-23 20:03:51 +00:00
// checkRequiredCNIPluginsInOneDir returns true if all required cni plugins are placed in dir
func ( plugin * kubenetNetworkPlugin ) checkRequiredCNIPluginsInOneDir ( dir string ) bool {
2016-09-02 22:36:30 +00:00
files , err := ioutil . ReadDir ( dir )
2016-09-01 23:29:41 +00:00
if err != nil {
return false
}
for _ , cniPlugin := range requiredCNIPlugins {
found := false
2016-09-02 22:36:30 +00:00
for _ , file := range files {
if strings . TrimSpace ( file . Name ( ) ) == cniPlugin {
2016-09-01 23:29:41 +00:00
found = true
break
}
}
if ! found {
return false
}
}
return true
}
2016-10-25 21:40:17 +00:00
// getNonExitedPods returns a list of pods that have at least one running container.
func ( plugin * kubenetNetworkPlugin ) getNonExitedPods ( ) ( [ ] * kubecontainer . Pod , error ) {
ret := [ ] * kubecontainer . Pod { }
2016-08-24 23:21:03 +00:00
pods , err := plugin . host . GetRuntime ( ) . GetPods ( true )
2016-06-07 02:45:46 +00:00
if err != nil {
return nil , fmt . Errorf ( "Failed to retrieve pods from runtime: %v" , err )
}
for _ , p := range pods {
2016-08-24 23:21:03 +00:00
if podIsExited ( p ) {
continue
}
2016-10-25 21:40:17 +00:00
ret = append ( ret , p )
}
return ret , nil
}
2016-08-24 23:21:03 +00:00
2017-02-17 21:51:11 +00:00
func ( plugin * kubenetNetworkPlugin ) getPodPortMappings ( ) ( [ ] * hostport . PodPortMapping , error ) {
2016-10-25 21:40:17 +00:00
pods , err := plugin . getNonExitedPods ( )
if err != nil {
return nil , err
}
2016-12-29 21:30:34 +00:00
activePodPortMappings := make ( [ ] * hostport . PodPortMapping , 0 )
2016-10-25 21:40:17 +00:00
for _ , p := range pods {
2016-06-22 14:44:33 +00:00
containerID , err := plugin . host . GetRuntime ( ) . GetPodContainerID ( p )
if err != nil {
continue
}
ipString , ok := plugin . podIPs [ containerID ]
if ! ok {
continue
}
podIP := net . ParseIP ( ipString )
if podIP == nil {
continue
}
if pod , ok := plugin . host . GetPodByName ( p . Namespace , p . Name ) ; ok {
2017-03-20 15:50:49 +00:00
activePodPortMappings = append ( activePodPortMappings , hostport . ConstructPodPortMapping ( pod , podIP ) )
2016-12-29 21:30:34 +00:00
}
}
return activePodPortMappings , nil
}
2016-10-25 21:40:17 +00:00
// ipamGarbageCollection will release unused IP.
// kubenet uses the CNI bridge plugin, which stores allocated ips on file. Each
// file created under defaultIPAMDir has the format: ip/container-hash. So this
// routine looks for hashes that are not reported by the currently running docker,
// and invokes DelNetwork on each one. Note that this will only work for the
// current CNI bridge plugin, because we have no way of finding the NetNs.
//
// Everything here is best effort: failures are logged and never returned. A
// checkpoint file that cannot be read is skipped, so that an unknown (empty)
// container ID is never passed to DelNetwork.
func (plugin *kubenetNetworkPlugin) ipamGarbageCollection() {
	glog.V(2).Infof("Starting IP garbage collection")

	ipamDir := filepath.Join(defaultIPAMDir, KubenetPluginName)
	files, err := ioutil.ReadDir(ipamDir)
	if err != nil {
		glog.Errorf("Failed to list files in %q: %v", ipamDir, err)
		return
	}

	// gather containerIDs for allocated ips
	ipContainerIdMap := make(map[string]string)
	for _, file := range files {
		// skip non checkpoint file
		if ip := net.ParseIP(file.Name()); ip == nil {
			continue
		}

		checkpointPath := filepath.Join(ipamDir, file.Name())
		content, err := ioutil.ReadFile(checkpointPath)
		if err != nil {
			// Without the file content the owning container is unknown;
			// leave this IP alone rather than releasing it blindly.
			glog.Errorf("Failed to read file %q: %v", checkpointPath, err)
			continue
		}
		ipContainerIdMap[file.Name()] = strings.TrimSpace(string(content))
	}

	// gather infra container IDs of current running Pods
	runningContainerIDs := utilsets.String{}
	pods, err := plugin.getNonExitedPods()
	if err != nil {
		glog.Errorf("Failed to get pods: %v", err)
		return
	}
	for _, pod := range pods {
		containerID, err := plugin.host.GetRuntime().GetPodContainerID(pod)
		if err != nil {
			glog.Warningf("Failed to get infra containerID of %q/%q: %v", pod.Namespace, pod.Name, err)
			continue
		}
		runningContainerIDs.Insert(strings.TrimSpace(containerID.ID))
	}

	// release leaked ips
	for ip, containerID := range ipContainerIdMap {
		// if the container is not running, release IP
		if runningContainerIDs.Has(containerID) {
			continue
		}

		// CNI requires all config to be presented, although only containerID is needed in this case
		rt := &libcni.RuntimeConf{
			ContainerID: containerID,
			IfName:      network.DefaultInterfaceName,
			// TODO: How do we find the NetNs of an exited container? docker inspect
			// doesn't show us the pid, so we probably need to checkpoint
			NetNS: "",
		}

		glog.V(2).Infof("Releasing IP %q allocated to %q.", ip, containerID)
		// CNI bridge plugin should try to release IP and then return
		if err := plugin.cniConfig.DelNetwork(plugin.netConfig, rt); err != nil {
			glog.Errorf("Error while releasing IP: %v", err)
		}
	}
}
2016-08-24 23:21:03 +00:00
// podIsExited returns true if the pod is exited (all containers inside are exited).
func podIsExited ( p * kubecontainer . Pod ) bool {
for _ , c := range p . Containers {
if c . State != kubecontainer . ContainerStateExited {
return false
}
}
for _ , c := range p . Sandboxes {
if c . State != kubecontainer . ContainerStateExited {
return false
}
}
return true
2016-06-07 02:45:46 +00:00
}
2017-03-30 22:33:58 +00:00
func ( plugin * kubenetNetworkPlugin ) buildCNIRuntimeConf ( ifName string , id kubecontainer . ContainerID , needNetNs bool ) ( * libcni . RuntimeConf , error ) {
2016-10-29 00:01:06 +00:00
netnsPath , err := plugin . host . GetNetNS ( id . ID )
2017-03-30 22:33:58 +00:00
if needNetNs && err != nil {
2017-01-11 23:40:01 +00:00
glog . Errorf ( "Kubenet failed to retrieve network namespace path: %v" , err )
2016-05-09 21:19:41 +00:00
}
2015-12-16 23:31:10 +00:00
return & libcni . RuntimeConf {
2016-05-09 21:19:41 +00:00
ContainerID : id . ID ,
NetNS : netnsPath ,
IfName : ifName ,
} , nil
2015-12-16 23:31:10 +00:00
}
2016-04-26 20:56:46 +00:00
2017-04-10 11:18:32 +00:00
func ( plugin * kubenetNetworkPlugin ) addContainerToNetwork ( config * libcni . NetworkConfig , ifName , namespace , name string , id kubecontainer . ContainerID ) ( cnitypes . Result , error ) {
2017-03-30 22:33:58 +00:00
rt , err := plugin . buildCNIRuntimeConf ( ifName , id , true )
2016-04-26 20:56:46 +00:00
if err != nil {
2016-05-09 21:19:41 +00:00
return nil , fmt . Errorf ( "Error building CNI config: %v" , err )
2016-04-26 20:56:46 +00:00
}
2016-05-09 21:19:41 +00:00
glog . V ( 3 ) . Infof ( "Adding %s/%s to '%s' with CNI '%s' plugin and runtime: %+v" , namespace , name , config . Network . Name , config . Network . Type , rt )
2017-10-30 15:55:26 +00:00
// The network plugin can take up to 3 seconds to execute,
// so yield the lock while it runs.
plugin . mu . Unlock ( )
2016-05-09 21:19:41 +00:00
res , err := plugin . cniConfig . AddNetwork ( config , rt )
2017-10-30 15:55:26 +00:00
plugin . mu . Lock ( )
2016-05-09 21:19:41 +00:00
if err != nil {
return nil , fmt . Errorf ( "Error adding container to network: %v" , err )
}
return res , nil
2016-04-26 20:56:46 +00:00
}
2016-05-09 21:19:41 +00:00
func ( plugin * kubenetNetworkPlugin ) delContainerFromNetwork ( config * libcni . NetworkConfig , ifName , namespace , name string , id kubecontainer . ContainerID ) error {
2017-03-30 22:33:58 +00:00
rt , err := plugin . buildCNIRuntimeConf ( ifName , id , false )
2016-05-09 21:19:41 +00:00
if err != nil {
return fmt . Errorf ( "Error building CNI config: %v" , err )
}
glog . V ( 3 ) . Infof ( "Removing %s/%s from '%s' with CNI '%s' plugin and runtime: %+v" , namespace , name , config . Network . Name , config . Network . Type , rt )
2017-12-29 03:07:20 +00:00
err = plugin . cniConfig . DelNetwork ( config , rt )
// The pod may not get deleted successfully at the first time.
// Ignore "no such file or directory" error in case the network has already been deleted in previous attempts.
if err != nil && ! strings . Contains ( err . Error ( ) , "no such file or directory" ) {
2016-04-26 20:56:46 +00:00
return fmt . Errorf ( "Error removing container from network: %v" , err )
}
return nil
}
2016-05-05 03:39:41 +00:00
2016-05-27 02:07:20 +00:00
// shaper returns the plugin's bandwidth shaper. On first use it creates a TC
// shaper for BridgeName, stores it on the plugin and reconciles the interface;
// later calls return the stored shaper unchanged.
// This function should only be called while holding the `plugin.mu` lock
func (plugin *kubenetNetworkPlugin) shaper() bandwidth.BandwidthShaper {
	if plugin.bandwidthShaper != nil {
		return plugin.bandwidthShaper
	}
	plugin.bandwidthShaper = bandwidth.NewTCShaper(BridgeName)
	plugin.bandwidthShaper.ReconcileInterface()
	return plugin.bandwidthShaper
}
2016-08-19 17:18:00 +00:00
2016-08-19 22:15:02 +00:00
// syncEbtablesDedupRules ensures the ebtables dedup rules are installed for
// the hardware address macAddr.
//
// On the first call (plugin.ebtables is nil) it creates the ebtables handle
// and flushes the dedup chain. It then ensures, in the filter table:
//   - the dedup chain exists and the OUTPUT chain jumps to it;
//   - IPv4 frames with source MAC macAddr leaving through veth+ whose source
//     IP is the gateway are accepted;
//   - IPv4 frames with source MAC macAddr leaving through veth+ whose source
//     IP is in the pod CIDR are dropped.
//
// Nothing is returned: every failure is logged together with its underlying
// error, and all failures except the initial flush stop the sync early.
//
//TODO: make this into a goroutine and rectify the dedup rules periodically
func (plugin *kubenetNetworkPlugin) syncEbtablesDedupRules(macAddr net.HardwareAddr) {
	if plugin.ebtables == nil {
		plugin.ebtables = utilebtables.New(plugin.execer)
		glog.V(3).Infof("Flushing dedup chain")
		if err := plugin.ebtables.FlushChain(utilebtables.TableFilter, dedupChain); err != nil {
			glog.Errorf("Failed to flush dedup chain: %v", err)
		}
	}
	_, err := plugin.ebtables.GetVersion()
	if err != nil {
		glog.Warningf("Failed to get ebtables version. Skip syncing ebtables dedup rules: %v", err)
		return
	}

	glog.V(3).Infof("Filtering packets with ebtables on mac address: %v, gateway: %v, pod CIDR: %v", macAddr.String(), plugin.gateway.String(), plugin.podCidr)
	_, err = plugin.ebtables.EnsureChain(utilebtables.TableFilter, dedupChain)
	if err != nil {
		glog.Errorf("Failed to ensure %v chain %v: %v", utilebtables.TableFilter, dedupChain, err)
		return
	}

	_, err = plugin.ebtables.EnsureRule(utilebtables.Append, utilebtables.TableFilter, utilebtables.ChainOutput, "-j", string(dedupChain))
	if err != nil {
		glog.Errorf("Failed to ensure %v chain %v jump to %v chain: %v", utilebtables.TableFilter, utilebtables.ChainOutput, dedupChain, err)
		return
	}

	// Match arguments shared by the ACCEPT and DROP rules below.
	commonArgs := []string{"-p", "IPv4", "-s", macAddr.String(), "-o", "veth+"}
	// The ACCEPT rule is prepended so it is evaluated before the DROP rule.
	_, err = plugin.ebtables.EnsureRule(utilebtables.Prepend, utilebtables.TableFilter, dedupChain, append(commonArgs, "--ip-src", plugin.gateway.String(), "-j", "ACCEPT")...)
	if err != nil {
		glog.Errorf("Failed to ensure packets from cbr0 gateway to be accepted: %v", err)
		return
	}
	_, err = plugin.ebtables.EnsureRule(utilebtables.Append, utilebtables.TableFilter, dedupChain, append(commonArgs, "--ip-src", plugin.podCidr, "-j", "DROP")...)
	if err != nil {
		glog.Errorf("Failed to ensure packets from podCidr but has mac address of cbr0 to get dropped: %v", err)
		return
	}
}
2017-11-07 15:37:20 +00:00
// disableContainerDAD disables duplicate address detection (DAD) in the
// container identified by id.
//
// DAD has a negative effect on pod creation latency, since we have to wait
// a second or more for the addresses to leave the "tentative" state. Since
// we're sure there won't be an address conflict (since we manage them
// manually), this is safe. See issue 54651.
//
// This sets net.ipv6.conf.default.dad_transmits to 0 by running sysctl through
// nsenter in the container's network namespace. It must be run *before* the
// CNI plugins are run.
//
// An error is returned when the sysctl binary cannot be found, the network
// namespace cannot be determined or is empty, the sysctl key cannot be read,
// or the write itself fails.
func (plugin *kubenetNetworkPlugin) disableContainerDAD(id kubecontainer.ContainerID) error {
	const dadSysctl = "net/ipv6/conf/default/dad_transmits"

	sysctlPath, lookErr := plugin.execer.LookPath("sysctl")
	if lookErr != nil {
		return fmt.Errorf("Could not find sysctl binary: %s", lookErr)
	}

	nsPath, nsErr := plugin.host.GetNetNS(id.ID)
	switch {
	case nsErr != nil:
		return fmt.Errorf("Failed to get netns: %v", nsErr)
	case nsPath == "":
		return fmt.Errorf("Pod has no network namespace")
	}

	// If the sysctl can't be read, ipv6 is taken to be disabled; report that
	// to the caller rather than attempting the write.
	if _, readErr := plugin.sysctl.GetSysctl(dadSysctl); readErr != nil {
		return fmt.Errorf("Ipv6 not enabled: %v", readErr)
	}

	cmd := plugin.execer.Command(plugin.nsenterPath,
		"--net="+nsPath, "-F", "--",
		sysctlPath, "-w", dadSysctl+"=0",
	)
	out, runErr := cmd.CombinedOutput()
	if runErr != nil {
		return fmt.Errorf("Failed to write sysctl: output: %s error: %s", out, runErr)
	}
	return nil
}