consul/agent/proxyprocess/daemon_test.go

package proxyprocess

import (
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/consul/sdk/testutil/retry"
	"github.com/hashicorp/go-uuid"
	"github.com/stretchr/testify/require"
)

func TestDaemon_impl(t *testing.T) {
	var _ Proxy = new(Daemon)
}

func TestDaemonStartStop(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")
	uuid, err := uuid.GenerateUUID()
	require.NoError(err)

	cmd, destroy := helperProcess("start-stop", path)
	defer destroy()

	d := &Daemon{
		Command:    cmd,
		ProxyID:    "tubes",
		ProxyToken: uuid,
		Logger:     testLogger,
	}
	require.NoError(d.Start())
	defer d.Stop()

	// Wait for the file to exist
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Verify that the contents of the file is the token. This verifies
	// that we properly passed the token as an env var.
	data, err := ioutil.ReadFile(path)
	require.NoError(err)
	require.Equal("tubes:"+uuid, string(data))

	// Stop the process
	require.NoError(d.Stop())

	// File should no longer exist.
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if os.IsNotExist(err) {
			return
		}

		// err might be nil here but that's okay
		r.Fatalf("should not exist: %s", err)
	})
}

func TestDaemonRestart(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()
	path := filepath.Join(td, "file")

	cmd, destroy := helperProcess("restart", path)
	defer destroy()

	d := &Daemon{
		Command: cmd,
		Logger:  testLogger,
	}
	require.NoError(d.Start())
	defer d.Stop()

	// Wait for the file to exist. We save the func so we can reuse the test.
	waitFile := func() {
		retry.Run(t, func(r *retry.R) {
			_, err := os.Stat(path)
			if err == nil {
				return
			}
			r.Fatalf("error waiting for path: %s", err)
		})
	}
	waitFile()

	// Delete the file
	require.NoError(os.Remove(path))

	// File should re-appear because the process is restart
	waitFile()
}

func TestDaemonLaunchesNewProcessGroup(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")
	pidPath := filepath.Join(td, "child.pid")

	// Start the parent process wrapping a start-stop test. The parent is acting
	// as our "agent". We need an extra indirection to be able to kill the "agent"
	// and still be running the test process.
	parentCmd, destroy := helperProcess("parent", pidPath, "start-stop", path)
	defer destroy()

	// We MUST run this as a separate process group otherwise the Kill below will
	// kill this test process (and possibly your shell/editor that launched it!)
	parentCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}

	require.NoError(parentCmd.Start())

	// Wait for the pid file to exist so we know parent is running
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(pidPath)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// And wait for the actual file to be sure the child is running (it should be
	// since parent doesn't write PID until child starts but the child might not
	// have completed the write to disk yet which causes flakiness below).
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Get the child PID
	bs, err := ioutil.ReadFile(pidPath)
	require.NoError(err)
	pid, err := strconv.Atoi(string(bs))
	require.NoError(err)
	proc, err := os.FindProcess(pid)
	require.NoError(err)

	// Always cleanup child process after
	defer func() {
		if proc != nil {
			proc.Kill()
		}
	}()

	// Now kill the parent's whole process group and wait for it
	pgid, err := syscall.Getpgid(parentCmd.Process.Pid)

	require.NoError(err)
	// Yep the minus PGid is how you kill a whole process group in unix... no idea
	// how this works on windows. We TERM no KILL since we rely on the child
	// catching the signal and deleting it's file to detect correct behavior.
	require.NoError(syscall.Kill(-pgid, syscall.SIGTERM))

	_, err = parentCmd.Process.Wait()
	require.NoError(err)

	// The child should still be running so file should still be there
	_, err = os.Stat(path)
	require.NoError(err, "child should still be running")

	// TEST PART 2 - verify that adopting an existing process works and picks up
	// monitoring even though it's not a child. We can't do this accurately with
	// Restart test since even if we create a new `Daemon` object the test process
	// is still the parent. We need the indirection of the `parent` test helper to
	// actually verify "adoption" on restart works.

	// Start a new parent that will "adopt" the existing child even though it will
	// not be an actual child process.
	fosterCmd, destroy := helperProcess("parent", pidPath, "start-stop", path)
	defer destroy()

	// Don't care about it being same process group this time as we will just kill
	// it normally.
	require.NoError(fosterCmd.Start())
	defer func() {
		// Clean up the daemon and wait for it to prevent it becoming a zombie.
		fosterCmd.Process.Kill()
		fosterCmd.Wait()
	}()

	// The child should still be running so file should still be there
	_, err = os.Stat(path)
	require.NoError(err, "child should still be running")

	{
		// Get the child PID - it shouldn't have changed and should be running
		bs2, err := ioutil.ReadFile(pidPath)
		require.NoError(err)
		pid2, err := strconv.Atoi(string(bs2))
		require.NoError(err)
		// Defer a cleanup (til end of test function)
		proc, err := os.FindProcess(pid)
		require.NoError(err)
		defer func() { proc.Kill() }()

		require.Equal(pid, pid2)
		t.Logf("Child PID was %d and still %d", pid, pid2)
	}

	// Now killing the child directly should still be restarted by the Daemon
	require.NoError(proc.Kill())
	proc = nil

	retry.Run(t, func(r *retry.R) {
		// Get the child PID - it should have changed
		bs, err := ioutil.ReadFile(pidPath)
		r.Check(err)

		newPid, err := strconv.Atoi(string(bs))
		r.Check(err)
		if newPid == pid {
			r.Fatalf("Child PID file not changed, Daemon not restarting it")
		}
		t.Logf("Child PID was %d and is now %d", pid, newPid)
	})

	// I had to run through this test in debugger a lot of times checking ps state
	// by hand at different points to convince myself it was doing the right
	// thing. It doesn't help that with verbose logs on it seems that the stdio
	// from the `parent` process can sometimes miss lines out due to timing. For
	// example the `[INFO] agent/proxy: daemon exited...` log from Daemon that
	// indicates that the child was detected to have failed and is restarting is
	// never output on my Mac at full speed. But if I run in debugger and have it
	// pause at the step after the child is killed above, then it shows. The
	// `[DEBUG] agent/proxy: starting proxy:` for the restart does always come
	// through though which is odd. I assume this is some odd quirk of timing
	// between processes and stdio or something but it makes debugging this stuff
	// even harder!

	// Let defer clean up the child process(es)

	// Get the NEW child PID
	bs, err = ioutil.ReadFile(pidPath)
	require.NoError(err)
	pid, err = strconv.Atoi(string(bs))
	require.NoError(err)
	proc2, err := os.FindProcess(pid)
	require.NoError(err)

	// Always cleanup child process after
	defer func() {
		if proc2 != nil {
			proc2.Kill()
		}
	}()
}

func TestDaemonStop_kill(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")

	cmd, destroy := helperProcess("stop-kill", path)
	defer destroy()

	d := &Daemon{
		Command:      cmd,
		ProxyToken:   "hello",
		Logger:       testLogger,
		gracefulWait: 200 * time.Millisecond,
	}
	require.NoError(d.Start())

	// Wait for the file to exist
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Stop the process
	require.NoError(d.Stop())

	// Stat the file so that we can get the mtime
	fi, err := os.Stat(path)
	require.NoError(err)
	mtime := fi.ModTime()

	// The mtime shouldn't change
	time.Sleep(100 * time.Millisecond)
	fi, err = os.Stat(path)
	require.NoError(err)
	require.Equal(mtime, fi.ModTime())
}

func TestDaemonStop_killAdopted(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")

	// In this test we want to ensure that graceful/ungraceful stop works with
	// processes that were adopted by current process but not started by it. (i.e.
	// we have to poll them not use Wait).
	//
	// We could use `parent` indirection to get a child that is actually not
	// started by this process but that's a lot of hoops to jump through on top of
	// an already complex multi-process test case.
	//
	// For now we rely on an implementation detail of Daemon which is potentially
	// brittle but beats lots of extra complexity here. Currently, if
	// Daemon.process is non-nil, the keepAlive loop will explicitly assume it's
	// not a child and so will use polling to monitor it. If we ever change that
	// it might invalidate this test and we would either need more indirection
	// here, or an alternative explicit signal on Daemon like Daemon.forcePoll to
	// ensure we are exercising that code path.

	// Start the "child" process
	childCmd, destroy := helperProcess("stop-kill", path)
	defer destroy()

	require.NoError(childCmd.Start())
	go func() { childCmd.Wait() }() // Prevent it becoming a zombie when killed
	defer func() { childCmd.Process.Kill() }()

	// Create the Daemon
	cmd, destroy := helperProcess("stop-kill", path)
	defer destroy()

	d := &Daemon{
		Command:      cmd,
		ProxyToken:   "hello",
		Logger:       testLogger,
		gracefulWait: 200 * time.Millisecond,
		// Can't just set process as it will bypass intializing stopCh etc.
	}
	// Adopt the pid from a fake state snapshot (this correctly initializes Daemon
	// for adoption)
	fakeSnap := map[string]interface{}{
		"Pid":         childCmd.Process.Pid,
		"CommandPath": childCmd.Path,
		"CommandArgs": childCmd.Args,
		"CommandDir":  childCmd.Dir,
		"CommandEnv":  childCmd.Env,
		"ProxyToken":  d.ProxyToken,
	}
	require.NoError(d.UnmarshalSnapshot(fakeSnap))
	require.NoError(d.Start())

	// Wait for the file to exist (child was already running so this doesn't
	// guarantee that Daemon is in "polling" state)
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Stop the process
	require.NoError(d.Stop())

	// Stat the file so that we can get the mtime
	fi, err := os.Stat(path)
	require.NoError(err)
	mtime := fi.ModTime()

	// The mtime shouldn't change
	time.Sleep(100 * time.Millisecond)
	fi, err = os.Stat(path)
	require.NoError(err)
	require.Equal(mtime, fi.ModTime())
}

func TestDaemonStart_pidFile(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")
	pidPath := filepath.Join(td, "pid")
	uuid, err := uuid.GenerateUUID()
	require.NoError(err)

	cmd, destroy := helperProcess("start-once", path)
	defer destroy()

	d := &Daemon{
		Command:    cmd,
		ProxyToken: uuid,
		Logger:     testLogger,
		PidPath:    pidPath,
	}
	require.NoError(d.Start())
	defer d.Stop()

	// Wait for the file to exist
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(pidPath)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Check the pid file
	pidRaw, err := ioutil.ReadFile(pidPath)
	require.NoError(err)
	require.NotEmpty(pidRaw)

	// Stop
	require.NoError(d.Stop())

	// Pid file should be gone
	_, err = os.Stat(pidPath)
	require.True(os.IsNotExist(err))
}

// Verify the pid file changes on restart
func TestDaemonRestart_pidFile(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()
	path := filepath.Join(td, "file")
	pidPath := filepath.Join(td, "pid")

	cmd, destroy := helperProcess("restart", path)
	defer destroy()

	d := &Daemon{
		Command: cmd,
		Logger:  testLogger,
		PidPath: pidPath,
	}
	require.NoError(d.Start())
	defer d.Stop()

	// Wait for the file to exist. We save the func so we can reuse the test.
	waitFile := func(path string) {
		retry.Run(t, func(r *retry.R) {
			_, err := os.Stat(path)
			if err == nil {
				return
			}
			r.Fatalf("error waiting for path: %s", err)
		})
	}
	waitFile(path)
	waitFile(pidPath)

	// Check the pid file
	pidRaw, err := ioutil.ReadFile(pidPath)
	require.NoError(err)
	require.NotEmpty(pidRaw)

	// Delete the file
	require.NoError(os.Remove(pidPath))
	require.NoError(os.Remove(path))

	// File should re-appear because the process is restart
	waitFile(path)
	waitFile(pidPath)

	// Check the pid file and it should not equal
	pidRaw2, err := ioutil.ReadFile(pidPath)
	require.NoError(err)
	require.NotEmpty(pidRaw2)
	require.NotEqual(pidRaw, pidRaw2)
}

func TestDaemonEqual(t *testing.T) {
	cases := []struct {
		Name     string
		D1, D2   Proxy
		Expected bool
	}{
		{
			"Different type",
			&Daemon{
				Command: &exec.Cmd{},
			},
			&Noop{},
			false,
		},

		{
			"Nil",
			&Daemon{
				Command: &exec.Cmd{},
			},
			nil,
			false,
		},

		{
			"Equal",
			&Daemon{
				Command: &exec.Cmd{},
			},
			&Daemon{
				Command: &exec.Cmd{},
			},
			true,
		},

		{
			"Different proxy ID",
			&Daemon{
				Command: &exec.Cmd{Path: "/foo"},
				ProxyID: "web",
			},
			&Daemon{
				Command: &exec.Cmd{Path: "/foo"},
				ProxyID: "db",
			},
			false,
		},

		{
			"Different path",
			&Daemon{
				Command: &exec.Cmd{Path: "/foo"},
			},
			&Daemon{
				Command: &exec.Cmd{Path: "/bar"},
			},
			false,
		},

		{
			"Different dir",
			&Daemon{
				Command: &exec.Cmd{Dir: "/foo"},
			},
			&Daemon{
				Command: &exec.Cmd{Dir: "/bar"},
			},
			false,
		},

		{
			"Different args",
			&Daemon{
				Command: &exec.Cmd{Args: []string{"foo"}},
			},
			&Daemon{
				Command: &exec.Cmd{Args: []string{"bar"}},
			},
			false,
		},

		{
			"Different token",
			&Daemon{
				Command:    &exec.Cmd{},
				ProxyToken: "one",
			},
			&Daemon{
				Command:    &exec.Cmd{},
				ProxyToken: "two",
			},
			false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.Name, func(t *testing.T) {
			actual := tc.D1.Equal(tc.D2)
			require.Equal(t, tc.Expected, actual)
		})
	}
}

func TestDaemonMarshalSnapshot(t *testing.T) {
	cases := []struct {
		Name     string
		Proxy    Proxy
		Expected map[string]interface{}
	}{
		{
			"stopped daemon",
			&Daemon{
				Command: &exec.Cmd{Path: "/foo"},
			},
			nil,
		},

		{
			"basic",
			&Daemon{
				Command: &exec.Cmd{Path: "/foo"},
				ProxyID: "web",
				process: &os.Process{Pid: 42},
			},
			map[string]interface{}{
				"Pid":         42,
				"CommandPath": "/foo",
				"CommandArgs": []string(nil),
				"CommandDir":  "",
				"CommandEnv":  []string(nil),
				"ProxyToken":  "",
				"ProxyID":     "web",
			},
		},
	}

	for _, tc := range cases {
		t.Run(tc.Name, func(t *testing.T) {
			actual := tc.Proxy.MarshalSnapshot()
			require.Equal(t, tc.Expected, actual)
		})
	}
}

func TestDaemonUnmarshalSnapshot(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")
	uuid, err := uuid.GenerateUUID()
	require.NoError(err)

	cmd, destroy := helperProcess("start-stop", path)
	defer destroy()

	d := &Daemon{
		Command:    cmd,
		ProxyToken: uuid,
		Logger:     testLogger,
	}
	defer d.Stop()
	require.NoError(d.Start())

	// Wait for the file to exist
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Snapshot
	snap := d.MarshalSnapshot()

	// Stop the original daemon but keep it alive
	require.NoError(d.Close())

	// Restore the second daemon
	d2 := &Daemon{Logger: testLogger}
	require.NoError(d2.UnmarshalSnapshot(snap))

	// Verify the daemon is still running
	_, err = os.Stat(path)
	require.NoError(err)

	// Stop the process
	require.NoError(d2.Stop())

	// File should no longer exist.
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if os.IsNotExist(err) {
			return
		}

		// err might be nil here but that's okay
		r.Fatalf("should not exist: %s", err)
	})
}

func TestDaemonUnmarshalSnapshot_notRunning(t *testing.T) {
	t.Parallel()

	require := require.New(t)
	td, closer := testTempDir(t)
	defer closer()

	path := filepath.Join(td, "file")
	uuid, err := uuid.GenerateUUID()
	require.NoError(err)

	cmd, destroy := helperProcess("start-stop", path)
	defer destroy()

	d := &Daemon{
		Command:    cmd,
		ProxyToken: uuid,
		Logger:     testLogger,
	}
	defer d.Stop()
	require.NoError(d.Start())

	// Wait for the file to exist
	retry.Run(t, func(r *retry.R) {
		_, err := os.Stat(path)
		if err == nil {
			return
		}

		r.Fatalf("error: %s", err)
	})

	// Snapshot
	snap := d.MarshalSnapshot()

	// Stop the original daemon
	require.NoError(d.Stop())

	// Restore the second daemon
	d2 := &Daemon{Logger: testLogger}
	require.Error(d2.UnmarshalSnapshot(snap))
}