mirror of https://github.com/k3s-io/k3s
187 lines
6.6 KiB
Go
187 lines
6.6 KiB
Go
|
// Copyright 2015 flannel authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
// +build !windows
|
||
|
|
||
|
package vxlan
|
||
|
|
||
|
// Some design notes and history:
|
||
|
// VXLAN encapsulates L2 packets (though flannel is L3 only so don't expect to be able to send L2 packets across hosts)
|
||
|
// The first versions of vxlan for flannel registered the flannel daemon as a handler for both "L2" and "L3" misses
|
||
|
// - When a container sends a packet to a new IP address on the flannel network (but on a different host) this generates
|
||
|
// an L2 miss (i.e. an ARP lookup)
|
||
|
// - The flannel daemon knows which flannel host the packet is destined for so it can supply the VTEP MAC to use.
|
||
|
// This is stored in the ARP table (with a timeout) to avoid constantly looking it up.
|
||
|
// - The packet can then be encapsulated but the host needs to know where to send it. This creates another callout from
|
||
|
// the kernel vxlan code to the flannel daemon to get the public IP that should be used for that VTEP (this gets called
|
||
|
// an L3 miss). The L2/L3 miss hooks are registered when the vxlan device is created. At the same time a device route
|
||
|
// is created to the whole flannel network so that non-local traffic is sent over the vxlan device.
|
||
|
//
|
||
|
// In this scheme the scaling of table entries (per host) is:
|
||
|
// - 1 route (for the configured network out the vxlan device)
|
||
|
// - One arp entry for each remote container that this host has recently contacted
|
||
|
// - One FDB entry for each remote host
|
||
|
//
|
||
|
// The second version of flannel vxlan removed the need for the L3MISS callout. When a new remote host is found (either
|
||
|
// during startup or when it's created), flannel simply adds the required entries so that no further lookup/callout is required.
|
||
|
//
|
||
|
//
|
||
|
// The latest version of the vxlan backend removes the need for the L2MISS too, which means that the flannel daemon is not
|
||
|
// listening for any netlink messages anymore. This improves reliability (no problems with timeouts if
|
||
|
// flannel crashes or restarts) and simplifies upgrades.
|
||
|
//
|
||
|
// How it works:
|
||
|
// Create the vxlan device but don't register for any L2MISS or L3MISS messages
|
||
|
// Then, as each remote host is discovered (either on startup or when they are added), do the following
|
||
|
// 1) Create routing table entry for the remote subnet. It goes via the vxlan device but also specifies a next hop (of the remote flannel host).
|
||
|
// 2) Create a static ARP entry for the remote flannel host IP address (and the VTEP MAC)
|
||
|
// 3) Create an FDB entry with the VTEP MAC and the public IP of the remote flannel daemon.
|
||
|
//
|
||
|
// In this scheme the scaling of table entries is linear to the number of remote hosts - 1 route, 1 arp entry and 1 FDB entry per host
|
||
|
//
|
||
|
// In this newest scheme, there is also the option of skipping the use of vxlan for hosts that are on the same subnet,
|
||
|
// this is called "directRouting"
|
||
|
|
||
|
import (
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"net"
|
||
|
"sync"
|
||
|
|
||
|
"golang.org/x/net/context"
|
||
|
|
||
|
"github.com/flannel-io/flannel/backend"
|
||
|
"github.com/flannel-io/flannel/pkg/ip"
|
||
|
"github.com/flannel-io/flannel/subnet"
|
||
|
log "k8s.io/klog"
|
||
|
)
|
||
|
|
||
|
func init() {
|
||
|
backend.Register("vxlan", New)
|
||
|
}
|
||
|
|
||
|
// defaultVNI is the VXLAN network identifier used by RegisterNetwork when the
// backend configuration does not supply a VNI of its own.
const defaultVNI = 1
|
||
|
|
||
|
type VXLANBackend struct {
|
||
|
subnetMgr subnet.Manager
|
||
|
extIface *backend.ExternalInterface
|
||
|
}
|
||
|
|
||
|
func New(sm subnet.Manager, extIface *backend.ExternalInterface) (backend.Backend, error) {
|
||
|
backend := &VXLANBackend{
|
||
|
subnetMgr: sm,
|
||
|
extIface: extIface,
|
||
|
}
|
||
|
|
||
|
return backend, nil
|
||
|
}
|
||
|
|
||
|
func newSubnetAttrs(publicIP net.IP, vnid uint16, mac net.HardwareAddr) (*subnet.LeaseAttrs, error) {
|
||
|
data, err := json.Marshal(&vxlanLeaseAttrs{
|
||
|
VNI: vnid,
|
||
|
VtepMAC: hardwareAddr(mac)})
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return &subnet.LeaseAttrs{
|
||
|
PublicIP: ip.FromIP(publicIP),
|
||
|
BackendType: "vxlan",
|
||
|
BackendData: json.RawMessage(data),
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup, config *subnet.Config) (backend.Network, error) {
|
||
|
// Parse our configuration
|
||
|
cfg := struct {
|
||
|
VNI int
|
||
|
Port int
|
||
|
GBP bool
|
||
|
Learning bool
|
||
|
DirectRouting bool
|
||
|
}{
|
||
|
VNI: defaultVNI,
|
||
|
}
|
||
|
|
||
|
if len(config.Backend) > 0 {
|
||
|
if err := json.Unmarshal(config.Backend, &cfg); err != nil {
|
||
|
return nil, fmt.Errorf("error decoding VXLAN backend config: %v", err)
|
||
|
}
|
||
|
}
|
||
|
log.Infof("VXLAN config: VNI=%d Port=%d GBP=%v Learning=%v DirectRouting=%v", cfg.VNI, cfg.Port, cfg.GBP, cfg.Learning, cfg.DirectRouting)
|
||
|
|
||
|
devAttrs := vxlanDeviceAttrs{
|
||
|
vni: uint32(cfg.VNI),
|
||
|
name: fmt.Sprintf("flannel.%v", cfg.VNI),
|
||
|
vtepIndex: be.extIface.Iface.Index,
|
||
|
vtepAddr: be.extIface.IfaceAddr,
|
||
|
vtepPort: cfg.Port,
|
||
|
gbp: cfg.GBP,
|
||
|
learning: cfg.Learning,
|
||
|
}
|
||
|
|
||
|
dev, err := newVXLANDevice(&devAttrs)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
dev.directRouting = cfg.DirectRouting
|
||
|
|
||
|
subnetAttrs, err := newSubnetAttrs(be.extIface.ExtAddr, uint16(cfg.VNI), dev.MACAddr())
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
lease, err := be.subnetMgr.AcquireLease(ctx, subnetAttrs)
|
||
|
switch err {
|
||
|
case nil:
|
||
|
case context.Canceled, context.DeadlineExceeded:
|
||
|
return nil, err
|
||
|
default:
|
||
|
return nil, fmt.Errorf("failed to acquire lease: %v", err)
|
||
|
}
|
||
|
|
||
|
// Ensure that the device has a /32 address so that no broadcast routes are created.
|
||
|
// This IP is just used as a source address for host to workload traffic (so
|
||
|
// the return path for the traffic has an address on the flannel network to use as the destination)
|
||
|
if err := dev.Configure(ip.IP4Net{IP: lease.Subnet.IP, PrefixLen: 32}, config.Network); err != nil {
|
||
|
return nil, fmt.Errorf("failed to configure interface %s: %s", dev.link.Attrs().Name, err)
|
||
|
}
|
||
|
|
||
|
return newNetwork(be.subnetMgr, be.extIface, dev, ip.IP4Net{}, lease)
|
||
|
}
|
||
|
|
||
|
// hardwareAddr is a net.HardwareAddr that can be marshalled to and
// unmarshalled from JSON as a quoted string.
type hardwareAddr net.HardwareAddr

// MarshalJSON encodes the address as a JSON string holding the textual form
// produced by net.HardwareAddr.String.
func (hw hardwareAddr) MarshalJSON() ([]byte, error) {
	return json.Marshal(net.HardwareAddr(hw).String())
}

// UnmarshalJSON decodes a double-quoted MAC address into hw. Input that is
// not wrapped in double quotes is rejected with a generic error; text that
// net.ParseMAC cannot parse yields that parser's error. hw is left untouched
// on failure.
func (hw *hardwareAddr) UnmarshalJSON(bytes []byte) error {
	n := len(bytes)
	quoted := n >= 2 && bytes[0] == '"' && bytes[n-1] == '"'
	if !quoted {
		return fmt.Errorf("error parsing hardware addr")
	}

	// Strip the surrounding quotes before handing the text to the parser.
	parsed, err := net.ParseMAC(string(bytes[1 : n-1]))
	if err != nil {
		return err
	}

	*hw = hardwareAddr(parsed)
	return nil
}
|