package link import ( "crypto/rand" "errors" "fmt" "os" "path/filepath" "runtime" "github.com/cilium/ebpf" "github.com/cilium/ebpf/internal" "github.com/cilium/ebpf/internal/unix" ) var ( kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events") ) // Kprobe attaches the given eBPF program to a perf event that fires when the // given kernel symbol starts executing. See /proc/kallsyms for available // symbols. For example, printk(): // // Kprobe("printk") // // The resulting Link must be Closed during program shutdown to avoid leaking // system resources. func Kprobe(symbol string, prog *ebpf.Program) (Link, error) { k, err := kprobe(symbol, prog, false) if err != nil { return nil, err } err = k.attach(prog) if err != nil { k.Close() return nil, err } return k, nil } // Kretprobe attaches the given eBPF program to a perf event that fires right // before the given kernel symbol exits, with the function stack left intact. // See /proc/kallsyms for available symbols. For example, printk(): // // Kretprobe("printk") // // The resulting Link must be Closed during program shutdown to avoid leaking // system resources. func Kretprobe(symbol string, prog *ebpf.Program) (Link, error) { k, err := kprobe(symbol, prog, true) if err != nil { return nil, err } err = k.attach(prog) if err != nil { k.Close() return nil, err } return k, nil } // kprobe opens a perf event on the given symbol and attaches prog to it. // If ret is true, create a kretprobe. func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) { if symbol == "" { return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput) } if prog == nil { return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) } if !rgxTraceEvent.MatchString(symbol) { return nil, fmt.Errorf("symbol '%s' must be alphanumeric or underscore: %w", symbol, errInvalidInput) } if prog.Type() != ebpf.Kprobe { return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput) } // Use kprobe PMU if the kernel has it available. tp, err := pmuKprobe(symbol, ret) if err == nil { return tp, nil } if err != nil && !errors.Is(err, ErrNotSupported) { return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err) } // Use tracefs if kprobe PMU is missing. tp, err = tracefsKprobe(symbol, ret) if err != nil { return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err) } return tp, nil } // pmuKprobe opens a perf event based on a Performance Monitoring Unit. // Requires at least 4.17 (e12f03d7031a "perf/core: Implement the // 'perf_kprobe' PMU"). // Returns ErrNotSupported if the kernel doesn't support perf_kprobe PMU, // or os.ErrNotExist if the given symbol does not exist in the kernel. func pmuKprobe(symbol string, ret bool) (*perfEvent, error) { // Getting the PMU type will fail if the kernel doesn't support // the perf_kprobe PMU. et, err := getPMUEventType("kprobe") if err != nil { return nil, err } // Create a pointer to a NUL-terminated string for the kernel. sp, err := unsafeStringPtr(symbol) if err != nil { return nil, err } // TODO: Parse the position of the bit from /sys/bus/event_source/devices/%s/format/retprobe. config := 0 if ret { config = 1 } attr := unix.PerfEventAttr{ Type: uint32(et), // PMU event type read from sysfs Ext1: uint64(uintptr(sp)), // Kernel symbol to trace Config: uint64(config), // perf_kprobe PMU treats config as flags } fd, err := unix.PerfEventOpen(&attr, perfAllThreads, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL // when trying to create a kretprobe for a missing symbol. Make sure ENOENT // is returned to the caller. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { return nil, fmt.Errorf("symbol '%s' not found: %w", symbol, os.ErrNotExist) } if err != nil { return nil, fmt.Errorf("opening perf event: %w", err) } // Ensure the string pointer is not collected before PerfEventOpen returns. runtime.KeepAlive(sp) // Kernel has perf_kprobe PMU available, initialize perf event. return &perfEvent{ fd: internal.NewFD(uint32(fd)), pmuID: et, name: symbol, ret: ret, progType: ebpf.Kprobe, }, nil } // tracefsKprobe creates a trace event by writing an entry to /kprobe_events. // A new trace event group name is generated on every call to support creating // multiple trace events for the same kernel symbol. A perf event is then opened // on the newly-created trace event and returned to the caller. func tracefsKprobe(symbol string, ret bool) (*perfEvent, error) { // Generate a random string for each trace event we attempt to create. // This value is used as the 'group' token in tracefs to allow creating // multiple kprobe trace events with the same name. group, err := randomGroup("ebpf") if err != nil { return nil, fmt.Errorf("randomizing group name: %w", err) } // Before attempting to create a trace event through tracefs, // check if an event with the same group and name already exists. // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate // entry, so we need to rely on reads for detecting uniqueness. _, err = getTraceEventID(group, symbol) if err == nil { return nil, fmt.Errorf("trace event already exists: %s/%s", group, symbol) } // The read is expected to fail with ErrNotSupported due to a non-existing event. if err != nil && !errors.Is(err, ErrNotSupported) { return nil, fmt.Errorf("checking trace event %s/%s: %w", group, symbol, err) } // Create the kprobe trace event using tracefs. if err := createTraceFSKprobeEvent(group, symbol, ret); err != nil { return nil, fmt.Errorf("creating kprobe event on tracefs: %w", err) } // Get the newly-created trace event's id. tid, err := getTraceEventID(group, symbol) if err != nil { return nil, fmt.Errorf("getting trace event id: %w", err) } // Kprobes are ephemeral tracepoints and share the same perf event type. fd, err := openTracepointPerfEvent(tid) if err != nil { return nil, err } return &perfEvent{ fd: fd, group: group, name: symbol, ret: ret, tracefsID: tid, progType: ebpf.Kprobe, // kernel only allows attaching kprobe programs to kprobe events }, nil } // createTraceFSKprobeEvent creates a new ephemeral trace event by writing to // /kprobe_events. Returns ErrNotSupported if symbol is not a valid // kernel symbol, or if it is not traceable with kprobes. func createTraceFSKprobeEvent(group, symbol string, ret bool) error { // Open the kprobe_events file in tracefs. f, err := os.OpenFile(kprobeEventsPath, os.O_APPEND|os.O_WRONLY, 0666) if err != nil { return fmt.Errorf("error opening kprobe_events: %w", err) } defer f.Close() // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt): // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe // -:[GRP/]EVENT : Clear a probe // // Some examples: // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy // p:ebpf_5678/p_my_kprobe __x64_sys_execve // // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the // kernel default to NR_CPUS. This is desired in most eBPF cases since // subsampling or rate limiting logic can be more accurately implemented in // the eBPF program itself. See Documentation/kprobes.txt for more details. pe := fmt.Sprintf("%s:%s/%s %s", kprobePrefix(ret), group, symbol, symbol) _, err = f.WriteString(pe) // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL // when trying to create a kretprobe for a missing symbol. Make sure ENOENT // is returned to the caller. if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { return fmt.Errorf("kernel symbol %s not found: %w", symbol, os.ErrNotExist) } if err != nil { return fmt.Errorf("writing '%s' to kprobe_events: %w", pe, err) } return nil } // closeTraceFSKprobeEvent removes the kprobe with the given group, symbol and kind // from /kprobe_events. func closeTraceFSKprobeEvent(group, symbol string) error { f, err := os.OpenFile(kprobeEventsPath, os.O_APPEND|os.O_WRONLY, 0666) if err != nil { return fmt.Errorf("error opening kprobe_events: %w", err) } defer f.Close() // See kprobe_events syntax above. Kprobe type does not need to be specified // for removals. pe := fmt.Sprintf("-:%s/%s", group, symbol) if _, err = f.WriteString(pe); err != nil { return fmt.Errorf("writing '%s' to kprobe_events: %w", pe, err) } return nil } // randomGroup generates a pseudorandom string for use as a tracefs group name. // Returns an error when the output string would exceed 63 characters (kernel // limitation), when rand.Read() fails or when prefix contains characters not // allowed by rgxTraceEvent. func randomGroup(prefix string) (string, error) { if !rgxTraceEvent.MatchString(prefix) { return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput) } b := make([]byte, 8) if _, err := rand.Read(b); err != nil { return "", fmt.Errorf("reading random bytes: %w", err) } group := fmt.Sprintf("%s_%x", prefix, b) if len(group) > 63 { return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput) } return group, nil } func kprobePrefix(ret bool) string { if ret { return "r" } return "p" }