From 9f1f791ac2e1377781c4f8807a23d86d92ad6499 Mon Sep 17 00:00:00 2001 From: DongWei Date: Wed, 14 Feb 2024 22:36:16 +0800 Subject: [PATCH] filesystem: fix mountTimeout not working issue (#2903) Signed-off-by: DongWei --- collector/filesystem_linux.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/collector/filesystem_linux.go b/collector/filesystem_linux.go index 6e7623e2..2251cc4b 100644 --- a/collector/filesystem_linux.go +++ b/collector/filesystem_linux.go @@ -122,16 +122,8 @@ func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemSta buf := new(unix.Statfs_t) err := unix.Statfs(rootfsFilePath(labels.mountPoint), buf) - stuckMountsMtx.Lock() close(success) - // If the mount has been marked as stuck, unmark it and log it's recovery. - if _, ok := stuckMounts[labels.mountPoint]; ok { - level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", labels.mountPoint) - delete(stuckMounts, labels.mountPoint) - } - stuckMountsMtx.Unlock() - if err != nil { level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(labels.mountPoint), "err", err) return filesystemStats{ @@ -161,17 +153,29 @@ func stuckMountWatcher(mountPoint string, success chan struct{}, logger log.Logg select { case <-success: // Success + // If the mount has been marked as stuck, unmark it and log its recovery. + stuckMountsMtx.Lock() + defer stuckMountsMtx.Unlock() + if _, ok := stuckMounts[mountPoint]; ok { + level.Debug(logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", mountPoint) + delete(stuckMounts, mountPoint) + } case <-mountCheckTimer.C: // Timed out, mark mount as stuck stuckMountsMtx.Lock() + defer stuckMountsMtx.Unlock() select { case <-success: // Success came in just after the timeout was reached, don't label the mount as stuck + // If the mount has been marked as stuck, unmark it and log its recovery. 
+ if _, ok := stuckMounts[mountPoint]; ok { + level.Debug(logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", mountPoint) + delete(stuckMounts, mountPoint) + } default: level.Debug(logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", mountPoint) stuckMounts[mountPoint] = struct{}{} } - stuckMountsMtx.Unlock() } }