mirror of https://github.com/k3s-io/k3s
428 lines
14 KiB
Go
428 lines
14 KiB
Go
|
package systemd
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"fmt"
|
||
|
"math"
|
||
|
"os"
|
||
|
"regexp"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
|
||
|
dbus "github.com/godbus/dbus/v5"
|
||
|
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
|
||
|
"github.com/opencontainers/runc/libcontainer/configs"
|
||
|
"github.com/pkg/errors"
|
||
|
"github.com/sirupsen/logrus"
|
||
|
)
|
||
|
|
||
|
var (
|
||
|
connOnce sync.Once
|
||
|
connDbus *systemdDbus.Conn
|
||
|
connErr error
|
||
|
|
||
|
versionOnce sync.Once
|
||
|
version int
|
||
|
versionErr error
|
||
|
|
||
|
isRunningSystemdOnce sync.Once
|
||
|
isRunningSystemd bool
|
||
|
)
|
||
|
|
||
|
// NOTE: This function comes from package github.com/coreos/go-systemd/util
|
||
|
// It was borrowed here to avoid a dependency on cgo.
|
||
|
//
|
||
|
// IsRunningSystemd checks whether the host was booted with systemd as its init
|
||
|
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
|
||
|
// checks whether /run/systemd/system/ exists and is a directory.
|
||
|
// http://www.freedesktop.org/software/systemd/man/sd_booted.html
|
||
|
func IsRunningSystemd() bool {
|
||
|
isRunningSystemdOnce.Do(func() {
|
||
|
fi, err := os.Lstat("/run/systemd/system")
|
||
|
isRunningSystemd = err == nil && fi.IsDir()
|
||
|
})
|
||
|
return isRunningSystemd
|
||
|
}
|
||
|
|
||
|
// ExpandSlice converts a systemd slice unit name into the cgroup path of that
// slice. systemd encodes the slice hierarchy with `-`, so every dash-separated
// prefix of the name becomes one path element: test-a-b.slice expands to
// /test.slice/test-a.slice/test-a-b.slice. The root slice "-.slice" expands
// to "/".
//
// An error is returned when the name does not end in ".slice", contains a
// path separator, or has an empty component (for example test--a.slice,
// -test.slice or a bare ".slice").
func ExpandSlice(slice string) (string, error) {
	const ext = ".slice"

	// Every rejection produces the same error, so build it in one place.
	invalid := func() (string, error) {
		return "", fmt.Errorf("invalid slice name: %s", slice)
	}

	// The name must carry the ".slice" suffix.
	if len(slice) < len(ext) || !strings.HasSuffix(slice, ext) {
		return invalid()
	}
	// Path separators are not allowed anywhere in the name.
	if strings.Contains(slice, "/") {
		return invalid()
	}

	base := slice[:len(slice)-len(ext)]
	// "-.slice" is the root slice.
	if base == "-" {
		return "/", nil
	}

	var (
		expanded strings.Builder
		parent   string // dash-joined name of the components seen so far
	)
	for _, part := range strings.Split(base, "-") {
		// An empty component means a leading, trailing or doubled dash
		// (or an empty base name).
		if part == "" {
			return invalid()
		}
		parent += part
		expanded.WriteString("/" + parent + ext)
		parent += "-"
	}
	return expanded.String(), nil
}
|
||
|
|
||
|
func groupPrefix(ruleType configs.DeviceType) (string, error) {
|
||
|
switch ruleType {
|
||
|
case configs.BlockDevice:
|
||
|
return "block-", nil
|
||
|
case configs.CharDevice:
|
||
|
return "char-", nil
|
||
|
default:
|
||
|
return "", errors.Errorf("device type %v has no group prefix", ruleType)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// findDeviceGroup tries to find the device group name (as listed in
|
||
|
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
|
||
|
// given (type, major) combination. If more than one device group exists, an
|
||
|
// arbitrary one is chosen.
|
||
|
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
|
||
|
fh, err := os.Open("/proc/devices")
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
defer fh.Close()
|
||
|
|
||
|
prefix, err := groupPrefix(ruleType)
|
||
|
if err != nil {
|
||
|
return "", err
|
||
|
}
|
||
|
|
||
|
scanner := bufio.NewScanner(fh)
|
||
|
var currentType configs.DeviceType
|
||
|
for scanner.Scan() {
|
||
|
// We need to strip spaces because the first number is column-aligned.
|
||
|
line := strings.TrimSpace(scanner.Text())
|
||
|
|
||
|
// Handle the "header" lines.
|
||
|
switch line {
|
||
|
case "Block devices:":
|
||
|
currentType = configs.BlockDevice
|
||
|
continue
|
||
|
case "Character devices:":
|
||
|
currentType = configs.CharDevice
|
||
|
continue
|
||
|
case "":
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// Skip lines unrelated to our type.
|
||
|
if currentType != ruleType {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// Parse out the (major, name).
|
||
|
var (
|
||
|
currMajor int64
|
||
|
currName string
|
||
|
)
|
||
|
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
|
||
|
if err == nil {
|
||
|
err = errors.Errorf("wrong number of fields")
|
||
|
}
|
||
|
return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
|
||
|
}
|
||
|
|
||
|
if currMajor == ruleMajor {
|
||
|
return prefix + currName, nil
|
||
|
}
|
||
|
}
|
||
|
if err := scanner.Err(); err != nil {
|
||
|
return "", errors.Wrap(err, "reading /proc/devices")
|
||
|
}
|
||
|
// Couldn't find the device group.
|
||
|
return "", nil
|
||
|
}
|
||
|
|
||
|
// generateDeviceProperties takes the configured device rules and generates a
|
||
|
// corresponding set of systemd properties to configure the devices correctly.
|
||
|
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
|
||
|
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
|
||
|
// to represent it in Go.
|
||
|
type deviceAllowEntry struct {
|
||
|
Path string
|
||
|
Perms string
|
||
|
}
|
||
|
|
||
|
properties := []systemdDbus.Property{
|
||
|
// Always run in the strictest white-list mode.
|
||
|
newProp("DevicePolicy", "strict"),
|
||
|
// Empty the DeviceAllow array before filling it.
|
||
|
newProp("DeviceAllow", []deviceAllowEntry{}),
|
||
|
}
|
||
|
|
||
|
// Figure out the set of rules.
|
||
|
configEmu := &devices.Emulator{}
|
||
|
for _, rule := range rules {
|
||
|
if err := configEmu.Apply(*rule); err != nil {
|
||
|
return nil, errors.Wrap(err, "apply rule for systemd")
|
||
|
}
|
||
|
}
|
||
|
// systemd doesn't support blacklists. So we log a warning, and tell
|
||
|
// systemd to act as a deny-all whitelist. This ruleset will be replaced
|
||
|
// with our normal fallback code. This may result in spurrious errors, but
|
||
|
// the only other option is to error out here.
|
||
|
if configEmu.IsBlacklist() {
|
||
|
// However, if we're dealing with an allow-all rule then we can do it.
|
||
|
if configEmu.IsAllowAll() {
|
||
|
return []systemdDbus.Property{
|
||
|
// Run in white-list mode by setting to "auto" and removing all
|
||
|
// DeviceAllow rules.
|
||
|
newProp("DevicePolicy", "auto"),
|
||
|
newProp("DeviceAllow", []deviceAllowEntry{}),
|
||
|
}, nil
|
||
|
}
|
||
|
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
|
||
|
return properties, nil
|
||
|
}
|
||
|
|
||
|
// Now generate the set of rules we actually need to apply. Unlike the
|
||
|
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
|
||
|
// whitelist which is the default for devices.Emulator.
|
||
|
baseEmu := &devices.Emulator{}
|
||
|
finalRules, err := baseEmu.Transition(configEmu)
|
||
|
if err != nil {
|
||
|
return nil, errors.Wrap(err, "get simplified rules for systemd")
|
||
|
}
|
||
|
var deviceAllowList []deviceAllowEntry
|
||
|
for _, rule := range finalRules {
|
||
|
if !rule.Allow {
|
||
|
// Should never happen.
|
||
|
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
|
||
|
}
|
||
|
switch rule.Type {
|
||
|
case configs.BlockDevice, configs.CharDevice:
|
||
|
default:
|
||
|
// Should never happen.
|
||
|
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
|
||
|
}
|
||
|
|
||
|
entry := deviceAllowEntry{
|
||
|
Perms: string(rule.Permissions),
|
||
|
}
|
||
|
|
||
|
// systemd has a fairly odd (though understandable) syntax here, and
|
||
|
// because of the OCI configuration format we have to do quite a bit of
|
||
|
// trickery to convert things:
|
||
|
//
|
||
|
// * Concrete rules with non-wildcard major/minor numbers have to use
|
||
|
// /dev/{block,char} paths. This is slightly odd because it means
|
||
|
// that we cannot add whitelist rules for devices that don't exist,
|
||
|
// but there's not too much we can do about that.
|
||
|
//
|
||
|
// However, path globbing is not support for path-based rules so we
|
||
|
// need to handle wildcards in some other manner.
|
||
|
//
|
||
|
// * Wildcard-minor rules have to specify a "device group name" (the
|
||
|
// second column in /proc/devices).
|
||
|
//
|
||
|
// * Wildcard (major and minor) rules can just specify a glob with the
|
||
|
// type ("char-*" or "block-*").
|
||
|
//
|
||
|
// The only type of rule we can't handle is wildcard-major rules, and
|
||
|
// so we'll give a warning in that case (note that the fallback code
|
||
|
// will insert any rules systemd couldn't handle). What amazing fun.
|
||
|
|
||
|
if rule.Major == configs.Wildcard {
|
||
|
// "_ *:n _" rules aren't supported by systemd.
|
||
|
if rule.Minor != configs.Wildcard {
|
||
|
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
// "_ *:* _" rules just wildcard everything.
|
||
|
prefix, err := groupPrefix(rule.Type)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
entry.Path = prefix + "*"
|
||
|
} else if rule.Minor == configs.Wildcard {
|
||
|
// "_ n:* _" rules require a device group from /proc/devices.
|
||
|
group, err := findDeviceGroup(rule.Type, rule.Major)
|
||
|
if err != nil {
|
||
|
return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
|
||
|
}
|
||
|
if group == "" {
|
||
|
// Couldn't find a group.
|
||
|
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
|
||
|
continue
|
||
|
}
|
||
|
entry.Path = group
|
||
|
} else {
|
||
|
// "_ n:m _" rules are just a path in /dev/{block,char}/.
|
||
|
switch rule.Type {
|
||
|
case configs.BlockDevice:
|
||
|
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
|
||
|
case configs.CharDevice:
|
||
|
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
|
||
|
}
|
||
|
}
|
||
|
deviceAllowList = append(deviceAllowList, entry)
|
||
|
}
|
||
|
|
||
|
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
|
||
|
return properties, nil
|
||
|
}
|
||
|
|
||
|
// getDbusConnection lazy initializes systemd dbus connection
|
||
|
// and returns it
|
||
|
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
|
||
|
connOnce.Do(func() {
|
||
|
if rootless {
|
||
|
connDbus, connErr = NewUserSystemdDbus()
|
||
|
} else {
|
||
|
connDbus, connErr = systemdDbus.New()
|
||
|
}
|
||
|
})
|
||
|
return connDbus, connErr
|
||
|
}
|
||
|
|
||
|
func newProp(name string, units interface{}) systemdDbus.Property {
|
||
|
return systemdDbus.Property{
|
||
|
Name: name,
|
||
|
Value: dbus.MakeVariant(units),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func getUnitName(c *configs.Cgroup) string {
|
||
|
// by default, we create a scope unless the user explicitly asks for a slice.
|
||
|
if !strings.HasSuffix(c.Name, ".slice") {
|
||
|
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
|
||
|
}
|
||
|
return c.Name
|
||
|
}
|
||
|
|
||
|
// isUnitExists returns true if the error is that a systemd unit already exists.
|
||
|
func isUnitExists(err error) bool {
|
||
|
if err != nil {
|
||
|
if dbusError, ok := err.(dbus.Error); ok {
|
||
|
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
|
||
|
statusChan := make(chan string, 1)
|
||
|
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
|
||
|
select {
|
||
|
case s := <-statusChan:
|
||
|
close(statusChan)
|
||
|
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
|
||
|
if s != "done" {
|
||
|
dbusConnection.ResetFailedUnit(unitName)
|
||
|
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
|
||
|
}
|
||
|
case <-time.After(time.Second):
|
||
|
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
|
||
|
}
|
||
|
} else if !isUnitExists(err) {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
|
||
|
statusChan := make(chan string, 1)
|
||
|
if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil {
|
||
|
select {
|
||
|
case s := <-statusChan:
|
||
|
close(statusChan)
|
||
|
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
|
||
|
if s != "done" {
|
||
|
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
|
||
|
}
|
||
|
case <-time.After(time.Second):
|
||
|
logrus.Warnf("Timed out while waiting for StopUnit(%s) completion signal from dbus. Continuing...", unitName)
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func systemdVersion(conn *systemdDbus.Conn) (int, error) {
|
||
|
versionOnce.Do(func() {
|
||
|
version = -1
|
||
|
verStr, err := conn.GetManagerProperty("Version")
|
||
|
if err != nil {
|
||
|
versionErr = err
|
||
|
return
|
||
|
}
|
||
|
|
||
|
version, versionErr = systemdVersionAtoi(verStr)
|
||
|
return
|
||
|
})
|
||
|
|
||
|
return version, versionErr
|
||
|
}
|
||
|
|
||
|
// systemdVersionRe matches an optional leading `"`, an optional `v`, and
// captures the first run of decimal digits that follows. It is compiled once
// at package initialization rather than on every call.
var systemdVersionRe = regexp.MustCompile(`"?v?([0-9]+)`)

// systemdVersionAtoi extracts the major systemd version from verStr.
//
// verStr should be of the form:
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32"
// (the surrounding quotes may be part of the input), and for all of these the
// result is 245: the optional `"v` prefix is skipped and the first integer
// found is returned.
//
// If verStr contains no digits, or the number does not fit in an int, the
// result is 0 and a non-nil error.
func systemdVersionAtoi(verStr string) (int, error) {
	matches := systemdVersionRe.FindStringSubmatch(verStr)
	if len(matches) < 2 {
		return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
	}
	ver, err := strconv.Atoi(matches[1])
	if err != nil {
		// Atoi returns a clamped value on overflow; do not hand that back
		// alongside the error.
		return 0, fmt.Errorf("can't parse version %s: %w", verStr, err)
	}
	return ver, nil
}
|
||
|
|
||
|
func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
|
||
|
if period != 0 {
|
||
|
// systemd only supports CPUQuotaPeriodUSec since v242
|
||
|
sdVer, err := systemdVersion(conn)
|
||
|
if err != nil {
|
||
|
logrus.Warnf("systemdVersion: %s", err)
|
||
|
} else if sdVer >= 242 {
|
||
|
*properties = append(*properties,
|
||
|
newProp("CPUQuotaPeriodUSec", period))
|
||
|
}
|
||
|
}
|
||
|
if quota != 0 || period != 0 {
|
||
|
// corresponds to USEC_INFINITY in systemd
|
||
|
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
|
||
|
if quota > 0 {
|
||
|
if period == 0 {
|
||
|
// assume the default kernel value of 100000 us (100 ms), same for v1 and v2.
|
||
|
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
|
||
|
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
|
||
|
period = 100000
|
||
|
}
|
||
|
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
|
||
|
// (integer percentage of CPU) internally. This means that if a fractional percent of
|
||
|
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
|
||
|
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
|
||
|
cpuQuotaPerSecUSec = uint64(quota*1000000) / period
|
||
|
if cpuQuotaPerSecUSec%10000 != 0 {
|
||
|
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
|
||
|
}
|
||
|
}
|
||
|
*properties = append(*properties,
|
||
|
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
|
||
|
}
|
||
|
}
|