mirror of https://github.com/k3s-io/k3s
462 lines
14 KiB
Go
462 lines
14 KiB
Go
// +build linux
|
|
// +build cgo
|
|
|
|
package shared
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"sync"
|
|
"sync/atomic"
|
|
"unsafe"
|
|
|
|
"golang.org/x/sys/unix"
|
|
|
|
"github.com/lxc/lxd/shared/logger"
|
|
)
|
|
|
|
/*
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE 1
|
|
#endif
|
|
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <grp.h>
|
|
#include <limits.h>
|
|
#include <poll.h>
|
|
#include <pty.h>
|
|
#include <pwd.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/types.h>
|
|
#include <sys/un.h>
|
|
|
|
#define ABSTRACT_UNIX_SOCK_LEN sizeof(((struct sockaddr_un *)0)->sun_path)
|
|
|
|
// This is an adaption from https://codereview.appspot.com/4589049, to be
|
|
// included in the stdlib with the stdlib's license.
|
|
|
|
void configure_pty(int fd) {
|
|
struct termios term_settings;
|
|
struct winsize win;
|
|
|
|
if (tcgetattr(fd, &term_settings) < 0) {
|
|
fprintf(stderr, "Failed to get settings: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
term_settings.c_iflag |= IMAXBEL;
|
|
term_settings.c_iflag |= IUTF8;
|
|
term_settings.c_iflag |= BRKINT;
|
|
term_settings.c_iflag |= IXANY;
|
|
|
|
term_settings.c_cflag |= HUPCL;
|
|
|
|
if (tcsetattr(fd, TCSANOW, &term_settings) < 0) {
|
|
fprintf(stderr, "Failed to set settings: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
if (ioctl(fd, TIOCGWINSZ, &win) < 0) {
|
|
fprintf(stderr, "Failed to get the terminal size: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
win.ws_col = 80;
|
|
win.ws_row = 25;
|
|
|
|
if (ioctl(fd, TIOCSWINSZ, &win) < 0) {
|
|
fprintf(stderr, "Failed to set the terminal size: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
if (fcntl(fd, F_SETFD, FD_CLOEXEC) < 0) {
|
|
fprintf(stderr, "Failed to set FD_CLOEXEC: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
void create_pty(int *master, int *slave, uid_t uid, gid_t gid) {
|
|
if (openpty(master, slave, NULL, NULL, NULL) < 0) {
|
|
fprintf(stderr, "Failed to openpty: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
configure_pty(*master);
|
|
configure_pty(*slave);
|
|
|
|
if (fchown(*slave, uid, gid) < 0) {
|
|
fprintf(stderr, "Warning: error chowning pty to container root\n");
|
|
fprintf(stderr, "Continuing...\n");
|
|
}
|
|
}
|
|
|
|
void create_pipe(int *master, int *slave) {
|
|
int pipefd[2];
|
|
|
|
if (pipe2(pipefd, O_CLOEXEC) < 0) {
|
|
fprintf(stderr, "Failed to create a pipe: %s\n", strerror(errno));
|
|
return;
|
|
}
|
|
|
|
*master = pipefd[0];
|
|
*slave = pipefd[1];
|
|
}
|
|
|
|
int get_poll_revents(int lfd, int timeout, int flags, int *revents, int *saved_errno)
|
|
{
|
|
int ret;
|
|
struct pollfd pfd = {lfd, flags, 0};
|
|
|
|
again:
|
|
ret = poll(&pfd, 1, timeout);
|
|
if (ret < 0) {
|
|
if (errno == EINTR)
|
|
goto again;
|
|
|
|
*saved_errno = errno;
|
|
fprintf(stderr, "Failed to poll() on file descriptor.\n");
|
|
return -1;
|
|
}
|
|
|
|
*revents = pfd.revents;
|
|
|
|
return ret;
|
|
}
|
|
*/
|
|
import "C"
|
|
|
|
const ABSTRACT_UNIX_SOCK_LEN int = C.ABSTRACT_UNIX_SOCK_LEN
|
|
|
|
const POLLIN int = C.POLLIN
|
|
const POLLPRI int = C.POLLPRI
|
|
const POLLNVAL int = C.POLLNVAL
|
|
const POLLERR int = C.POLLERR
|
|
const POLLHUP int = C.POLLHUP
|
|
const POLLRDHUP int = C.POLLRDHUP
|
|
|
|
func GetPollRevents(fd int, timeout int, flags int) (int, int, error) {
|
|
var err error
|
|
revents := C.int(0)
|
|
saved_errno := C.int(0)
|
|
|
|
ret := C.get_poll_revents(C.int(fd), C.int(timeout), C.int(flags), &revents, &saved_errno)
|
|
if int(ret) < 0 {
|
|
err = unix.Errno(saved_errno)
|
|
}
|
|
|
|
return int(ret), int(revents), err
|
|
}
|
|
|
|
func OpenPty(uid, gid int64) (master *os.File, slave *os.File, err error) {
|
|
fd_master := C.int(-1)
|
|
fd_slave := C.int(-1)
|
|
rootUid := C.uid_t(uid)
|
|
rootGid := C.gid_t(gid)
|
|
|
|
C.create_pty(&fd_master, &fd_slave, rootUid, rootGid)
|
|
|
|
if fd_master == -1 || fd_slave == -1 {
|
|
return nil, nil, errors.New("Failed to create a new pts pair")
|
|
}
|
|
|
|
master = os.NewFile(uintptr(fd_master), "master")
|
|
slave = os.NewFile(uintptr(fd_slave), "slave")
|
|
|
|
return master, slave, nil
|
|
}
|
|
|
|
func Pipe() (master *os.File, slave *os.File, err error) {
|
|
fd_master := C.int(-1)
|
|
fd_slave := C.int(-1)
|
|
|
|
C.create_pipe(&fd_master, &fd_slave)
|
|
|
|
if fd_master == -1 || fd_slave == -1 {
|
|
return nil, nil, errors.New("Failed to create a new pipe")
|
|
}
|
|
|
|
master = os.NewFile(uintptr(fd_master), "master")
|
|
slave = os.NewFile(uintptr(fd_slave), "slave")
|
|
|
|
return master, slave, nil
|
|
}
|
|
|
|
// UserId is an adaption from https://codereview.appspot.com/4589049.
|
|
func UserId(name string) (int, error) {
|
|
var pw C.struct_passwd
|
|
var result *C.struct_passwd
|
|
|
|
bufSize := C.sysconf(C._SC_GETPW_R_SIZE_MAX)
|
|
if bufSize < 0 {
|
|
bufSize = 4096
|
|
}
|
|
|
|
buf := C.malloc(C.size_t(bufSize))
|
|
if buf == nil {
|
|
return -1, fmt.Errorf("allocation failed")
|
|
}
|
|
defer C.free(buf)
|
|
|
|
cname := C.CString(name)
|
|
defer C.free(unsafe.Pointer(cname))
|
|
|
|
again:
|
|
rv, errno := C.getpwnam_r(cname,
|
|
&pw,
|
|
(*C.char)(buf),
|
|
C.size_t(bufSize),
|
|
&result)
|
|
if rv < 0 {
|
|
// OOM killer will take care of us if we end up doing this too
|
|
// often.
|
|
if errno == unix.ERANGE {
|
|
bufSize *= 2
|
|
tmp := C.realloc(buf, C.size_t(bufSize))
|
|
if tmp == nil {
|
|
return -1, fmt.Errorf("allocation failed")
|
|
}
|
|
buf = tmp
|
|
goto again
|
|
}
|
|
return -1, fmt.Errorf("failed user lookup: %s", unix.Errno(rv))
|
|
}
|
|
|
|
if result == nil {
|
|
return -1, fmt.Errorf("unknown user %s", name)
|
|
}
|
|
|
|
return int(C.int(result.pw_uid)), nil
|
|
}
|
|
|
|
// GroupId is an adaption from https://codereview.appspot.com/4589049.
|
|
func GroupId(name string) (int, error) {
|
|
var grp C.struct_group
|
|
var result *C.struct_group
|
|
|
|
bufSize := C.sysconf(C._SC_GETGR_R_SIZE_MAX)
|
|
if bufSize < 0 {
|
|
bufSize = 4096
|
|
}
|
|
|
|
buf := C.malloc(C.size_t(bufSize))
|
|
if buf == nil {
|
|
return -1, fmt.Errorf("allocation failed")
|
|
}
|
|
|
|
cname := C.CString(name)
|
|
defer C.free(unsafe.Pointer(cname))
|
|
|
|
again:
|
|
rv, errno := C.getgrnam_r(cname,
|
|
&grp,
|
|
(*C.char)(buf),
|
|
C.size_t(bufSize),
|
|
&result)
|
|
if rv != 0 {
|
|
// OOM killer will take care of us if we end up doing this too
|
|
// often.
|
|
if errno == unix.ERANGE {
|
|
bufSize *= 2
|
|
tmp := C.realloc(buf, C.size_t(bufSize))
|
|
if tmp == nil {
|
|
return -1, fmt.Errorf("allocation failed")
|
|
}
|
|
buf = tmp
|
|
goto again
|
|
}
|
|
|
|
C.free(buf)
|
|
return -1, fmt.Errorf("failed group lookup: %s", unix.Errno(rv))
|
|
}
|
|
C.free(buf)
|
|
|
|
if result == nil {
|
|
return -1, fmt.Errorf("unknown group %s", name)
|
|
}
|
|
|
|
return int(C.int(result.gr_gid)), nil
|
|
}
|
|
|
|
// Extensively commented directly in the code. Please leave the comments!
|
|
// Looking at this in a couple of months noone will know why and how this works
|
|
// anymore.
|
|
func ExecReaderToChannel(r io.Reader, bufferSize int, exited <-chan bool, fd int) <-chan []byte {
|
|
if bufferSize <= (128 * 1024) {
|
|
bufferSize = (128 * 1024)
|
|
}
|
|
|
|
ch := make(chan ([]byte))
|
|
|
|
// Takes care that the closeChannel() function is exactly executed once.
|
|
// This allows us to avoid using a mutex.
|
|
var once sync.Once
|
|
closeChannel := func() {
|
|
close(ch)
|
|
}
|
|
|
|
// [1]: This function has just one job: Dealing with the case where we
|
|
// are running an interactive shell session where we put a process in
|
|
// the background that does hold stdin/stdout open, but does not
|
|
// generate any output at all. This case cannot be dealt with in the
|
|
// following function call. Here's why: Assume the above case, now the
|
|
// attached child (the shell in this example) exits. This will not
|
|
// generate any poll() event: We won't get POLLHUP because the
|
|
// background process is holding stdin/stdout open and noone is writing
|
|
// to it. So we effectively block on GetPollRevents() in the function
|
|
// below. Hence, we use another go routine here who's only job is to
|
|
// handle that case: When we detect that the child has exited we check
|
|
// whether a POLLIN or POLLHUP event has been generated. If not, we know
|
|
// that there's nothing buffered on stdout and exit.
|
|
var attachedChildIsDead int32 = 0
|
|
go func() {
|
|
<-exited
|
|
|
|
atomic.StoreInt32(&attachedChildIsDead, 1)
|
|
|
|
ret, revents, err := GetPollRevents(fd, 0, (POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP | POLLNVAL))
|
|
if ret < 0 {
|
|
logger.Errorf("Failed to poll(POLLIN | POLLPRI | POLLHUP | POLLRDHUP) on file descriptor: %s.", err)
|
|
} else if ret > 0 {
|
|
if (revents & POLLERR) > 0 {
|
|
logger.Warnf("Detected poll(POLLERR) event.")
|
|
} else if (revents & POLLNVAL) > 0 {
|
|
logger.Warnf("Detected poll(POLLNVAL) event.")
|
|
}
|
|
} else if ret == 0 {
|
|
logger.Debugf("No data in stdout: exiting.")
|
|
once.Do(closeChannel)
|
|
return
|
|
}
|
|
}()
|
|
|
|
go func() {
|
|
readSize := (128 * 1024)
|
|
offset := 0
|
|
buf := make([]byte, bufferSize)
|
|
avoidAtomicLoad := false
|
|
|
|
defer once.Do(closeChannel)
|
|
for {
|
|
nr := 0
|
|
var err error
|
|
|
|
ret, revents, err := GetPollRevents(fd, -1, (POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP | POLLNVAL))
|
|
if ret < 0 {
|
|
// This condition is only reached in cases where we are massively f*cked since we even handle
|
|
// EINTR in the underlying C wrapper around poll(). So let's exit here.
|
|
logger.Errorf("Failed to poll(POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP) on file descriptor: %s. Exiting.", err)
|
|
return
|
|
}
|
|
|
|
// [2]: If the process exits before all its data has been read by us and no other process holds stdin or
|
|
// stdout open, then we will observe a (POLLHUP | POLLRDHUP | POLLIN) event. This means, we need to
|
|
// keep on reading from the pty file descriptor until we get a simple POLLHUP back.
|
|
both := ((revents & (POLLIN | POLLPRI)) > 0) && ((revents & (POLLHUP | POLLRDHUP)) > 0)
|
|
if both {
|
|
logger.Debugf("Detected poll(POLLIN | POLLPRI | POLLHUP | POLLRDHUP) event.")
|
|
read := buf[offset : offset+readSize]
|
|
nr, err = r.Read(read)
|
|
}
|
|
|
|
if (revents & POLLERR) > 0 {
|
|
logger.Warnf("Detected poll(POLLERR) event: exiting.")
|
|
return
|
|
} else if (revents & POLLNVAL) > 0 {
|
|
logger.Warnf("Detected poll(POLLNVAL) event: exiting.")
|
|
return
|
|
}
|
|
|
|
if ((revents & (POLLIN | POLLPRI)) > 0) && !both {
|
|
// This might appear unintuitive at first but is actually a nice trick: Assume we are running
|
|
// a shell session in a container and put a process in the background that is writing to
|
|
// stdout. Now assume the attached process (aka the shell in this example) exits because we
|
|
// used Ctrl+D to send EOF or something. If no other process would be holding stdout open we
|
|
// would expect to observe either a (POLLHUP | POLLRDHUP | POLLIN | POLLPRI) event if there
|
|
// is still data buffered from the previous process or a simple (POLLHUP | POLLRDHUP) if
|
|
// no data is buffered. The fact that we only observe a (POLLIN | POLLPRI) event means that
|
|
// another process is holding stdout open and is writing to it.
|
|
// One counter argument that can be leveraged is (brauner looks at tycho :))
|
|
// "Hey, you need to write at least one additional tty buffer to make sure that
|
|
// everything that the attached child has written is actually shown."
|
|
// The answer to that is:
|
|
// "This case can only happen if the process has exited and has left data in stdout which
|
|
// would generate a (POLLIN | POLLPRI | POLLHUP | POLLRDHUP) event and this case is already
|
|
// handled and triggers another codepath. (See [2].)"
|
|
if avoidAtomicLoad || atomic.LoadInt32(&attachedChildIsDead) == 1 {
|
|
avoidAtomicLoad = true
|
|
// Handle race between atomic.StorInt32() in the go routine
|
|
// explained in [1] and atomic.LoadInt32() in the go routine
|
|
// here:
|
|
// We need to check for (POLLHUP | POLLRDHUP) here again since we might
|
|
// still be handling a pure POLLIN event from a write prior to the childs
|
|
// exit. But the child might have exited right before and performed
|
|
// atomic.StoreInt32() to update attachedChildIsDead before we
|
|
// performed our atomic.LoadInt32(). This means we accidentally hit this
|
|
// codepath and are misinformed about the available poll() events. So we
|
|
// need to perform a non-blocking poll() again to exclude that case:
|
|
//
|
|
// - If we detect no (POLLHUP | POLLRDHUP) event we know the child
|
|
// has already exited but someone else is holding stdin/stdout open and
|
|
// writing to it.
|
|
// Note that his case should only ever be triggered in situations like
|
|
// running a shell and doing stuff like:
|
|
// > ./lxc exec xen1 -- bash
|
|
// root@xen1:~# yes &
|
|
// .
|
|
// .
|
|
// .
|
|
// now send Ctrl+D or type "exit". By the time the Ctrl+D/exit event is
|
|
// triggered, we will have read all of the childs data it has written to
|
|
// stdout and so we can assume that anything that comes now belongs to
|
|
// the process that is holding stdin/stdout open.
|
|
//
|
|
// - If we detect a (POLLHUP | POLLRDHUP) event we know that we've
|
|
// hit this codepath on accident caused by the race between
|
|
// atomic.StoreInt32() in the go routine explained in [1] and
|
|
// atomic.LoadInt32() in this go routine. So the next call to
|
|
// GetPollRevents() will either return
|
|
// (POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP)
|
|
// or (POLLHUP | POLLRDHUP). Both will trigger another codepath (See [2].)
|
|
// that takes care that all data of the child that is buffered in
|
|
// stdout is written out.
|
|
ret, revents, err := GetPollRevents(fd, 0, (POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP | POLLNVAL))
|
|
if ret < 0 {
|
|
logger.Errorf("Failed to poll(POLLIN | POLLPRI | POLLERR | POLLHUP | POLLRDHUP) on file descriptor: %s. Exiting.", err)
|
|
return
|
|
} else if (revents & (POLLHUP | POLLRDHUP | POLLERR | POLLNVAL)) == 0 {
|
|
logger.Debugf("Exiting but background processes are still running.")
|
|
return
|
|
}
|
|
}
|
|
read := buf[offset : offset+readSize]
|
|
nr, err = r.Read(read)
|
|
}
|
|
|
|
// The attached process has exited and we have read all data that may have
|
|
// been buffered.
|
|
if ((revents & (POLLHUP | POLLRDHUP)) > 0) && !both {
|
|
logger.Debugf("Detected poll(POLLHUP) event: exiting.")
|
|
return
|
|
}
|
|
|
|
offset += nr
|
|
if offset > 0 && (offset+readSize >= bufferSize || err != nil) {
|
|
ch <- buf[0:offset]
|
|
offset = 0
|
|
buf = make([]byte, bufferSize)
|
|
}
|
|
}
|
|
}()
|
|
|
|
return ch
|
|
}
|