diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index 898a39c60..baf2f2f0f 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -386,6 +386,21 @@
         {
           alert: 'PrometheusHAGroupCrashlooping',
           expr: |||
+            (
+              prometheus_tsdb_clean_start{%(prometheusSelector)s} == 0
+            and
+              (
+                count by (%(prometheusHAGroupLabels)s) (
+                  changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 1
+                )
+              /
+                count by (%(prometheusHAGroupLabels)s) (
+                  up{%(prometheusSelector)s}
+                )
+              )
+              > 0.5
+            )
+            or
             (
               count by (%(prometheusHAGroupLabels)s) (
                 changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
@@ -403,7 +418,7 @@
         },
         annotations: {
           summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
-          description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
+          description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have had at least 5 total restarts or 2 unclean restarts in the last 30m.' % $._config,
         },
       },
     ],
diff --git a/tsdb/db.go b/tsdb/db.go
index 1114be824..6507c25f8 100644
--- a/tsdb/db.go
+++ b/tsdb/db.go
@@ -58,6 +58,10 @@ const (
 	tmpForCreationBlockDirSuffix = ".tmp-for-creation"
 	// Pre-2.21 tmp dir suffix, used in clean-up functions.
 	tmpLegacy = ".tmp"
+
+	lockfileDisabled       = -1
+	lockfileReplaced       = 0
+	lockfileCreatedCleanly = 1
 )
 
 var (
@@ -153,8 +157,9 @@ type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
 // DB handles reads and writes of time series falling into
 // a hashed partition of a seriedb.
 type DB struct {
-	dir   string
-	lockf fileutil.Releaser
+	dir       string
+	lockf     fileutil.Releaser
+	lockfPath string
 
 	logger         log.Logger
 	metrics        *dbMetrics
@@ -186,19 +191,20 @@
 }
 
 type dbMetrics struct {
-	loadedBlocks         prometheus.GaugeFunc
-	symbolTableSize      prometheus.GaugeFunc
-	reloads              prometheus.Counter
-	reloadsFailed        prometheus.Counter
-	compactionsFailed    prometheus.Counter
-	compactionsTriggered prometheus.Counter
-	compactionsSkipped   prometheus.Counter
-	sizeRetentionCount   prometheus.Counter
-	timeRetentionCount   prometheus.Counter
-	startTime            prometheus.GaugeFunc
-	tombCleanTimer       prometheus.Histogram
-	blocksBytes          prometheus.Gauge
-	maxBytes             prometheus.Gauge
+	loadedBlocks           prometheus.GaugeFunc
+	symbolTableSize        prometheus.GaugeFunc
+	reloads                prometheus.Counter
+	reloadsFailed          prometheus.Counter
+	compactionsFailed      prometheus.Counter
+	compactionsTriggered   prometheus.Counter
+	compactionsSkipped     prometheus.Counter
+	sizeRetentionCount     prometheus.Counter
+	timeRetentionCount     prometheus.Counter
+	startTime              prometheus.GaugeFunc
+	tombCleanTimer         prometheus.Histogram
+	blocksBytes            prometheus.Gauge
+	maxBytes               prometheus.Gauge
+	lockfileCreatedCleanly prometheus.Gauge
 }
 
 func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
@@ -276,6 +282,10 @@
 		Name: "prometheus_tsdb_size_retentions_total",
 		Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.",
 	})
+	m.lockfileCreatedCleanly = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "prometheus_tsdb_clean_start",
+		Help: "-1: lockfile is disabled. 0: a lockfile from a previous execution was replaced. 1: lockfile creation was clean",
+	})
 
 	if r != nil {
 		r.MustRegister(
@@ -292,6 +302,7 @@
 			m.tombCleanTimer,
 			m.blocksBytes,
 			m.maxBytes,
+			m.lockfileCreatedCleanly,
 		)
 	}
 	return m
@@ -653,12 +664,22 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		db.blocksToDelete = DefaultBlocksToDelete(db)
 	}
 
+	lockfileCreationStatus := lockfileDisabled
 	if !opts.NoLockfile {
 		absdir, err := filepath.Abs(dir)
 		if err != nil {
 			return nil, err
 		}
-		lockf, _, err := fileutil.Flock(filepath.Join(absdir, "lock"))
+		db.lockfPath = filepath.Join(absdir, "lock")
+
+		if _, err := os.Stat(db.lockfPath); err == nil {
+			level.Warn(db.logger).Log("msg", "A TSDB lockfile from a previous execution already existed. It was replaced", "file", db.lockfPath)
+			lockfileCreationStatus = lockfileReplaced
+		} else {
+			lockfileCreationStatus = lockfileCreatedCleanly
+		}
+
+		lockf, _, err := fileutil.Flock(db.lockfPath)
 		if err != nil {
 			return nil, errors.Wrap(err, "lock DB directory")
 		}
@@ -707,6 +728,7 @@
 	if maxBytes < 0 {
 		maxBytes = 0
 	}
+	db.metrics.lockfileCreatedCleanly.Set(float64(lockfileCreationStatus))
 	db.metrics.maxBytes.Set(float64(maxBytes))
 
 	if err := db.reload(); err != nil {
@@ -1418,6 +1440,7 @@ func (db *DB) Close() error {
 	errs := tsdb_errors.NewMulti(g.Wait())
 	if db.lockf != nil {
 		errs.Add(db.lockf.Release())
+		errs.Add(os.Remove(db.lockfPath))
 	}
 	if db.head != nil {
 		errs.Add(db.head.Close())
diff --git a/tsdb/db_test.go b/tsdb/db_test.go
index 8548f6848..6d977d77d 100644
--- a/tsdb/db_test.go
+++ b/tsdb/db_test.go
@@ -3125,6 +3125,70 @@ func TestNoPanicOnTSDBOpenError(t *testing.T) {
 	require.NoError(t, lockf.Release())
 }
 
+func TestLockfileMetric(t *testing.T) {
+	cases := []struct {
+		fileAlreadyExists bool
+		lockFileDisabled  bool
+		expectedValue     int
+	}{
+		{
+			fileAlreadyExists: false,
+			lockFileDisabled:  false,
+			expectedValue:     lockfileCreatedCleanly,
+		},
+		{
+			fileAlreadyExists: true,
+			lockFileDisabled:  false,
+			expectedValue:     lockfileReplaced,
+		},
+		{
+			fileAlreadyExists: true,
+			lockFileDisabled:  true,
+			expectedValue:     lockfileDisabled,
+		},
+		{
+			fileAlreadyExists: false,
+			lockFileDisabled:  true,
+			expectedValue:     lockfileDisabled,
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(fmt.Sprintf("%+v", c), func(t *testing.T) {
+			tmpdir, err := ioutil.TempDir("", "test")
+			require.NoError(t, err)
+			t.Cleanup(func() {
+				require.NoError(t, os.RemoveAll(tmpdir))
+			})
+			absdir, err := filepath.Abs(tmpdir)
+			require.NoError(t, err)
+
+			// Test preconditions (file already exists + lockfile option)
+			lockfilePath := filepath.Join(absdir, "lock")
+			if c.fileAlreadyExists {
+				err = ioutil.WriteFile(lockfilePath, []byte{}, 0644)
+				require.NoError(t, err)
+			}
+			opts := DefaultOptions()
+			opts.NoLockfile = c.lockFileDisabled
+
+			// Create the DB, this should create a lockfile and the metrics
+			db, err := Open(tmpdir, nil, nil, opts)
+			require.NoError(t, err)
+			require.Equal(t, float64(c.expectedValue), prom_testutil.ToFloat64(db.metrics.lockfileCreatedCleanly))
+
+			// Close the DB, this should delete the lockfile
+			require.NoError(t, db.Close())
+
+			// Check that the lockfile is always deleted
+			if !c.lockFileDisabled {
+				_, err = os.Stat(lockfilePath)
+				require.Error(t, err, "lockfile was not deleted")
+			}
+		})
+	}
+}
+
 func TestQuerier_ShouldNotPanicIfHeadChunkIsTruncatedWhileReadingQueriedChunks(t *testing.T) {
 	t.Skip("TODO: investigate why process crash in CI")