From b2ff583392cef8ce268119d5ba13bc8e6ddd835f Mon Sep 17 00:00:00 2001 From: Paul Banks Date: Wed, 6 Jun 2018 13:45:39 +0100 Subject: [PATCH] Test for adopted process Stop race and fix --- agent/proxy/daemon.go | 57 ++++++--- agent/proxy/daemon_test.go | 230 ++++++++++++++++++++++++++++++++++++- agent/proxy/proxy_test.go | 62 +++++++++- 3 files changed, 330 insertions(+), 19 deletions(-) diff --git a/agent/proxy/daemon.go b/agent/proxy/daemon.go index f18cc750e3..d33505a2c6 100644 --- a/agent/proxy/daemon.go +++ b/agent/proxy/daemon.go @@ -8,6 +8,7 @@ import ( "reflect" "strconv" "sync" + "syscall" "time" "github.com/hashicorp/consul/lib/file" @@ -108,7 +109,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { // attempts keeps track of the number of restart attempts we've had and // is used to calculate the wait time using an exponential backoff. var attemptsDeadline time.Time - var attempts uint + var attempts uint32 + + // Assume the process is adopted, we reset this when we start a new process + // ourselves below and use it to decide on a strategy for waiting. + adopted := true for { if process == nil { @@ -121,7 +126,11 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { // Calculate the exponential backoff and wait if we have to if attempts > DaemonRestartBackoffMin { - waitTime := (1 << (attempts - DaemonRestartBackoffMin)) * time.Second + exponent := (attempts - DaemonRestartBackoffMin) + if exponent > 31 { + exponent = 31 + } + waitTime := (1 << exponent) * time.Second if waitTime > DaemonRestartMaxWait { waitTime = DaemonRestartMaxWait } @@ -159,6 +168,7 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { process, err = p.start() if err == nil { p.process = process + adopted = false } p.lock.Unlock() @@ -169,25 +179,34 @@ func (p *Daemon) keepAlive(stopCh <-chan struct{}, exitedCh chan<- struct{}) { } - // Wait for the process to exit. Note that if we restored this proxy - // then Wait will always fail because we likely aren't the parent - // process. Therefore, we do an extra sanity check after to use other - // syscalls to verify the process is truly dead. - ps, err := process.Wait() - if _, err := findProcess(process.Pid); err == nil { - select { - case <-time.After(1 * time.Second): - // We want a busy loop, but not too busy. 1 second between - // detecting a process death seems reasonable. + var ps *os.ProcessState + var err error - case <-stopCh: - // If we receive a stop request we want to exit immediately. - return + if adopted { + // assign to err outside scope + _, err = findProcess(process.Pid) + if err == nil { + // Process appears to be running still, wait a bit before we poll again. + // We want a busy loop, but not too busy. 1 second between detecting a + // process death seems reasonable. + // + // SUBTELTY: we must NOT select on stopCh here since the Stop function + // assumes that as soon as this method returns and closes exitedCh, that + // the process is no longer running. If we are polling then we don't + // know that is true until we've polled again so we have to keep polling + // until the process goes away even if we know the Daemon is stopping. + time.Sleep(1 * time.Second) + + // Restart the loop, process is still set so we effectively jump back to + // the findProcess call above. + continue } - - continue + } else { + // Wait for child to exit + ps, err = process.Wait() } + // Process exited somehow. process = nil if err != nil { p.Logger.Printf("[INFO] agent/proxy: daemon exited with error: %s", err) @@ -219,6 +238,10 @@ func (p *Daemon) start() (*os.Process, error) { cmd.Args = []string{cmd.Path} } + // Start it in a new sessions (and hence process group) so that killing agent + // (even with Ctrl-C) won't kill proxy. + cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + // Start it p.Logger.Printf("[DEBUG] agent/proxy: starting proxy: %q %#v", cmd.Path, cmd.Args[1:]) if err := cmd.Start(); err != nil { diff --git a/agent/proxy/daemon_test.go b/agent/proxy/daemon_test.go index 7202ddd655..0895922783 100644 --- a/agent/proxy/daemon_test.go +++ b/agent/proxy/daemon_test.go @@ -5,6 +5,8 @@ import ( "os" "os/exec" "path/filepath" + "strconv" + "syscall" "testing" "time" @@ -102,6 +104,150 @@ func TestDaemonRestart(t *testing.T) { waitFile() } +func TestDaemonLaunchesNewProcessGroup(t *testing.T) { + t.Parallel() + + require := require.New(t) + td, closer := testTempDir(t) + defer closer() + + path := filepath.Join(td, "file") + pidPath := filepath.Join(td, "child.pid") + + // Start the parent process wrapping a start-stop test. The parent is acting + // as our "agent". We need an extra indirection to be able to kill the "agent" + // and still be running the test process. + parentCmd := helperProcess("parent", pidPath, "start-stop", path) + + // We MUST run this as a separate process group otherwise the Kill below will + // kill this test process (and possibly your shell/editor that launched it!) + parentCmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} + + require.NoError(parentCmd.Start()) + + // Wait for the pid file to exist so we know parent is running + retry.Run(t, func(r *retry.R) { + _, err := os.Stat(pidPath) + if err == nil { + return + } + + r.Fatalf("error: %s", err) + }) + + // And wait for the actual file to be sure the child is running (it should be + // since parent doesn't write PID until child starts but the child might not + // have completed the write to disk yet which causes flakiness below). + retry.Run(t, func(r *retry.R) { + _, err := os.Stat(path) + if err == nil { + return + } + + r.Fatalf("error: %s", err) + }) + + // Get the child PID + bs, err := ioutil.ReadFile(pidPath) + require.NoError(err) + pid, err := strconv.Atoi(string(bs)) + require.NoError(err) + proc, err := os.FindProcess(pid) + require.NoError(err) + + // Always cleanup child process after + defer func() { + if proc != nil { + proc.Kill() + } + }() + + // Now kill the parent's whole process group and wait for it + pgid, err := syscall.Getpgid(parentCmd.Process.Pid) + + require.NoError(err) + // Yep the minus PGid is how you kill a whole process group in unix... no idea + // how this works on windows. We TERM no KILL since we rely on the child + // catching the signal and deleting it's file to detect correct behaviour. + require.NoError(syscall.Kill(-pgid, syscall.SIGTERM)) + + _, err = parentCmd.Process.Wait() + require.NoError(err) + + // The child should still be running so file should still be there + _, err = os.Stat(path) + require.NoError(err, "child should still be running") + + // TEST PART 2 - verify that adopting an existing process works and picks up + // monitoring even though it's not a child. We can't do this accurately with + // Restart test since even if we create a new `Daemon` object the test process + // is still the parent. We need the indirection of the `parent` test helper to + // actually verify "adoption" on restart works. + + // Start a new parent that will "adopt" the existing child even though it will + // not be an actual child process. + fosterCmd := helperProcess("parent", pidPath, "start-stop", path) + // Don't care about it being same process group this time as we will just kill + // it normally. + require.NoError(fosterCmd.Start()) + defer func() { + // Clean up the daemon and wait for it to prevent it becoming a zombie. + fosterCmd.Process.Kill() + fosterCmd.Wait() + }() + + // The child should still be running so file should still be there + _, err = os.Stat(path) + require.NoError(err, "child should still be running") + + { + // Get the child PID - it shouldn't have changed and should be running + bs2, err := ioutil.ReadFile(pidPath) + require.NoError(err) + pid2, err := strconv.Atoi(string(bs2)) + require.NoError(err) + // Defer a cleanup (til end of test function) + proc, err := os.FindProcess(pid) + require.NoError(err) + defer func() { proc.Kill() }() + + require.Equal(pid, pid2) + t.Logf("Child PID was %d and still %d", pid, pid2) + } + + // Now killing the child directly should still be restarted by the Daemon + require.NoError(proc.Kill()) + proc = nil + + retry.Run(t, func(r *retry.R) { + // Get the child PID - it should have changed + bs, err := ioutil.ReadFile(pidPath) + r.Check(err) + + newPid, err := strconv.Atoi(string(bs)) + r.Check(err) + if newPid == pid { + r.Fatalf("Child PID file not changed, Daemon not restarting it") + } + t.Logf("Child PID was %d and is now %d", pid, newPid) + }) + + // I had to run through this test in debugger a lot of times checking ps state + // by hand at different points to convince myself it was doing the right + // thing. It doesn't help that with verbose logs on it seems that the stdio + // from the `parent` process can sometimes miss lines out due to timing. For + // example the `[INFO] agent/proxy: daemon exited...` log from Daemon that + // indicates that the child was detected to have failed and is restarting is + // never output on my Mac at full speed. But if I run in debugger and have it + // pause at the step after the child is killed above, then it shows. The + // `[DEBUG] agent/proxy: starting proxy:` for the restart does always come + // through though which is odd. I assume this is some odd quirk of timing + // between processes and stdio or something but it makes debugging this stuff + // even harder! + + // Let defer clean up the child process(es) +} + func TestDaemonStop_kill(t *testing.T) { t.Parallel() @@ -132,7 +278,85 @@ func TestDaemonStop_kill(t *testing.T) { // Stop the process require.NoError(d.Stop()) - // State the file so that we can get the mtime + // Stat the file so that we can get the mtime + fi, err := os.Stat(path) + require.NoError(err) + mtime := fi.ModTime() + + // The mtime shouldn't change + time.Sleep(100 * time.Millisecond) + fi, err = os.Stat(path) + require.NoError(err) + require.Equal(mtime, fi.ModTime()) +} + +func TestDaemonStop_killAdopted(t *testing.T) { + t.Parallel() + + require := require.New(t) + td, closer := testTempDir(t) + defer closer() + + path := filepath.Join(td, "file") + + // In this test we want to ensure that gracefull/ungraceful stop works with + // processes that were adopted by current process but not started by it. (i.e. + // we have to poll them not use Wait). + // + // We could use `parent` indirection to get a child that is actually not + // started by this process but that's a lot of hoops to jump through on top of + // an already complex multi-process test case. + // + // For now we rely on an implementation detail of Daemon which is potentially + // brittle but beats lots of extra complexity here. Currently, if + // Daemon.process is non-nil, the keepAlive loop will explicitly assume it's + // not a child and so will use polling to monitor it. If we ever change that + // it might invalidate this test and we would either need more indirection + // here, or an alternative explicit signal on Daemon like Daemon.forcePoll to + // ensure we are exercising that code path. + + // Start the "child" process + childCmd := helperProcess("stop-kill", path) + require.NoError(childCmd.Start()) + go func() { childCmd.Wait() }() // Prevent it becoming a zombie when killed + defer func() { childCmd.Process.Kill() }() + + // Create the Daemon + d := &Daemon{ + Command: helperProcess("stop-kill", path), + ProxyToken: "hello", + Logger: testLogger, + gracefulWait: 200 * time.Millisecond, + // Can't just set process as it will bypass intializing stopCh etc. + } + // Adopt the pid from a fake state snapshot (this correctly initialises Daemon + // for adoption) + fakeSnap := map[string]interface{}{ + "Pid": childCmd.Process.Pid, + "CommandPath": childCmd.Path, + "CommandArgs": childCmd.Args, + "CommandDir": childCmd.Dir, + "CommandEnv": childCmd.Env, + "ProxyToken": d.ProxyToken, + } + require.NoError(d.UnmarshalSnapshot(fakeSnap)) + require.NoError(d.Start()) + + // Wait for the file to exist (child was already running so this doesn't + // gaurantee that Daemon is in "polling" state) + retry.Run(t, func(r *retry.R) { + _, err := os.Stat(path) + if err == nil { + return + } + + r.Fatalf("error: %s", err) + }) + + // Stop the process + require.NoError(d.Stop()) + + // Stat the file so that we can get the mtime fi, err := os.Stat(path) require.NoError(err) mtime := fi.ModTime() @@ -404,6 +628,10 @@ func TestDaemonUnmarshalSnapshot(t *testing.T) { d2 := &Daemon{Logger: testLogger} require.NoError(d2.UnmarshalSnapshot(snap)) + // Verify the daemon is still running + _, err = os.Stat(path) + require.NoError(err) + // Stop the process require.NoError(d2.Stop()) diff --git a/agent/proxy/proxy_test.go b/agent/proxy/proxy_test.go index 9b123787ca..a04bb65d59 100644 --- a/agent/proxy/proxy_test.go +++ b/agent/proxy/proxy_test.go @@ -7,6 +7,8 @@ import ( "os" "os/exec" "os/signal" + "strconv" + "syscall" "testing" "time" ) @@ -74,7 +76,7 @@ func TestHelperProcess(t *testing.T) { // and deletes it only when it is stopped. case "start-stop": ch := make(chan os.Signal, 1) - signal.Notify(ch, os.Interrupt) + signal.Notify(ch, os.Interrupt, syscall.SIGTERM) defer signal.Stop(ch) path := args[0] @@ -155,6 +157,64 @@ func TestHelperProcess(t *testing.T) { <-make(chan struct{}) + // Parent runs the given process in a Daemon and then sleeps until the test + // code kills it. It exists to test that the Daemon-managed child process + // survives it's parent exiting which we can't test directly without exiting + // the test process so we need an extra level of indirection. The test code + // using this must pass a file path as the first argument for the child + // processes PID to be written and then must take care to clean up that PID + // later or the child will be left running forever. + // + // If the PID file already exists, it will "adopt" the child rather than + // launch a new one. + case "parent": + // We will write the PID for the child to the file in the first argument + // then pass rest of args through to command. + pidFile := args[0] + + d := &Daemon{ + Command: helperProcess(args[1:]...), + Logger: testLogger, + PidPath: pidFile, + } + + _, err := os.Stat(pidFile) + if err == nil { + // pidFile exists, read it and "adopt" the process + bs, err := ioutil.ReadFile(pidFile) + if err != nil { + log.Printf("Error: %s", err) + os.Exit(1) + } + pid, err := strconv.Atoi(string(bs)) + if err != nil { + log.Printf("Error: %s", err) + os.Exit(1) + } + // Make a fake snapshot to load + snapshot := map[string]interface{}{ + "Pid": pid, + "CommandPath": d.Command.Path, + "CommandArgs": d.Command.Args, + "CommandDir": d.Command.Dir, + "CommandEnv": d.Command.Env, + "ProxyToken": "", + } + d.UnmarshalSnapshot(snapshot) + } + + if err := d.Start(); err != nil { + log.Printf("Error: %s", err) + os.Exit(1) + } + log.Println("Started child") + + // Wait "forever" (calling test chooses when we exit with signal/Wait to + // minimise coordination). + for { + time.Sleep(time.Hour) + } + default: fmt.Fprintf(os.Stderr, "Unknown command: %q\n", cmd) os.Exit(2)