Quarantine series upon problem writing to the series file

This fixes https://github.com/prometheus/prometheus/issues/1059 , but
not in the obvious way (simply not updating the persist watermark,
because that's actually not that simple - we don't really know what
has gone wrong exactly). As any errors relevant here are most likely
caused by severe and unrecoverable problems with the series file,
Using the now quarantine feature is the right step. We don't really
have to be worried about any inconsistent state of the series because
it will be removed for good ASAP. Another plus is that we don't have
to declare the whole storage dirty anymore.
pull/1453/head
beorn7 9 years ago
parent 0ea5801e47
commit fc7de5374a

@ -369,13 +369,10 @@ func (p *persistence) labelValuesForLabelName(ln model.LabelName) (model.LabelVa
// the (zero-based) index of the first persisted chunk within the series // the (zero-based) index of the first persisted chunk within the series
// file. In case of an error, the returned index is -1 (to avoid the // file. In case of an error, the returned index is -1 (to avoid the
// misconception that the chunk was written at position 0). // misconception that the chunk was written at position 0).
//
// Returning an error signals problems with the series file. In this case, the
// caller should quarantine the series.
func (p *persistence) persistChunks(fp model.Fingerprint, chunks []chunk) (index int, err error) { func (p *persistence) persistChunks(fp model.Fingerprint, chunks []chunk) (index int, err error) {
defer func() {
if err != nil {
p.setDirty(true, fmt.Errorf("error in method persistChunks: %s", err))
}
}()
f, err := p.openChunkFileForWriting(fp) f, err := p.openChunkFileForWriting(fp)
if err != nil { if err != nil {
return -1, err return -1, err
@ -915,6 +912,9 @@ func (p *persistence) loadSeriesMapAndHeads() (sm *seriesMap, chunksToPersist in
// deleted (in which case the returned timestamp will be 0 and must be ignored). // deleted (in which case the returned timestamp will be 0 and must be ignored).
// It is the caller's responsibility to make sure nothing is persisted or loaded // It is the caller's responsibility to make sure nothing is persisted or loaded
// for the same fingerprint concurrently. // for the same fingerprint concurrently.
//
// Returning an error signals problems with the series file. In this case, the
// caller should quarantine the series.
func (p *persistence) dropAndPersistChunks( func (p *persistence) dropAndPersistChunks(
fp model.Fingerprint, beforeTime model.Time, chunks []chunk, fp model.Fingerprint, beforeTime model.Time, chunks []chunk,
) ( ) (
@ -927,12 +927,6 @@ func (p *persistence) dropAndPersistChunks(
// Style note: With the many return values, it was decided to use naked // Style note: With the many return values, it was decided to use naked
// returns in this method. They make the method more readable, but // returns in this method. They make the method more readable, but
// please handle with care! // please handle with care!
defer func() {
if err != nil {
p.setDirty(true, fmt.Errorf("error in method dropAndPersistChunks: %s", err))
}
}()
if len(chunks) > 0 { if len(chunks) > 0 {
// We have chunks to persist. First check if those are already // We have chunks to persist. First check if those are already
// too old. If that's the case, the chunks in the series file // too old. If that's the case, the chunks in the series file

@ -16,6 +16,7 @@ package local
import ( import (
"container/list" "container/list"
"errors"
"fmt" "fmt"
"math" "math"
"sync" "sync"
@ -1108,8 +1109,20 @@ func (s *memorySeriesStorage) maintainMemorySeries(
func (s *memorySeriesStorage) writeMemorySeries( func (s *memorySeriesStorage) writeMemorySeries(
fp model.Fingerprint, series *memorySeries, beforeTime model.Time, fp model.Fingerprint, series *memorySeries, beforeTime model.Time,
) bool { ) bool {
cds := series.chunksToPersist() var (
persistErr error
cds = series.chunksToPersist()
)
defer func() { defer func() {
if persistErr != nil {
s.quarantineSeries(fp, series.metric, persistErr)
s.persistErrors.Inc()
}
// The following is done even in case of an error to ensure
// correct counter bookkeeping and to not pin chunks in memory
// that belong to a series that is scheduled for quarantine
// anyway.
for _, cd := range cds { for _, cd := range cds {
cd.unpin(s.evictRequests) cd.unpin(s.evictRequests)
} }
@ -1130,9 +1143,9 @@ func (s *memorySeriesStorage) writeMemorySeries(
if len(cds) == 0 { if len(cds) == 0 {
return false return false
} }
offset, err := s.persistence.persistChunks(fp, chunks) var offset int
if err != nil { offset, persistErr = s.persistence.persistChunks(fp, chunks)
s.persistErrors.Inc() if persistErr != nil {
return false return false
} }
if series.chunkDescsOffset == -1 { if series.chunkDescsOffset == -1 {
@ -1144,14 +1157,12 @@ func (s *memorySeriesStorage) writeMemorySeries(
return false return false
} }
newFirstTime, offset, numDroppedFromPersistence, allDroppedFromPersistence, err := newFirstTime, offset, numDroppedFromPersistence, allDroppedFromPersistence, persistErr :=
s.persistence.dropAndPersistChunks(fp, beforeTime, chunks) s.persistence.dropAndPersistChunks(fp, beforeTime, chunks)
if err != nil { if persistErr != nil {
s.persistErrors.Inc()
return false return false
} }
if err := series.dropChunks(beforeTime); err != nil { if persistErr = series.dropChunks(beforeTime); persistErr != nil {
s.persistErrors.Inc()
return false return false
} }
if len(series.chunkDescs) == 0 && allDroppedFromPersistence { if len(series.chunkDescs) == 0 && allDroppedFromPersistence {
@ -1168,8 +1179,8 @@ func (s *memorySeriesStorage) writeMemorySeries(
} else { } else {
series.chunkDescsOffset -= numDroppedFromPersistence series.chunkDescsOffset -= numDroppedFromPersistence
if series.chunkDescsOffset < 0 { if series.chunkDescsOffset < 0 {
s.persistence.setDirty(true, fmt.Errorf("dropped more chunks from persistence than from memory for fingerprint %v, series %v", fp, series)) persistErr = errors.New("dropped more chunks from persistence than from memory")
series.chunkDescsOffset = -1 // Makes sure it will be looked at during crash recovery. series.chunkDescsOffset = -1
} }
} }
return false return false

Loading…
Cancel
Save