mirror of https://github.com/prometheus/prometheus
Batch samples before sending them to channels
Channels can cause bottlenecks and tons of context switches when reading hundreds of thousands of samples per second from a single queue. Instead, pre-batch the samples to amortize the cost of the concurrency overhead. Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>pull/9830/head
parent
4cae788689
commit
319249f9db
|
@ -915,7 +915,7 @@ type shards struct {
|
|||
mtx sync.RWMutex // With the WAL, this is never actually contended.
|
||||
|
||||
qm *QueueManager
|
||||
queues []chan interface{}
|
||||
queues []*queue
|
||||
// So we can accurately track how many of each are lost during shard shutdowns.
|
||||
enqueuedSamples atomic.Int64
|
||||
enqueuedExemplars atomic.Int64
|
||||
|
@ -943,9 +943,9 @@ func (s *shards) start(n int) {
|
|||
s.qm.metrics.pendingSamples.Set(0)
|
||||
s.qm.metrics.numShards.Set(float64(n))
|
||||
|
||||
newQueues := make([]chan interface{}, n)
|
||||
newQueues := make([]*queue, n)
|
||||
for i := 0; i < n; i++ {
|
||||
newQueues[i] = make(chan interface{}, s.qm.cfg.Capacity)
|
||||
newQueues[i] = newQueue(s.qm.cfg.MaxSamplesPerSend, s.qm.cfg.Capacity)
|
||||
}
|
||||
|
||||
s.queues = newQueues
|
||||
|
@ -978,7 +978,7 @@ func (s *shards) stop() {
|
|||
s.mtx.Lock()
|
||||
defer s.mtx.Unlock()
|
||||
for _, queue := range s.queues {
|
||||
close(queue)
|
||||
go queue.FlushAndShutdown(s.done)
|
||||
}
|
||||
select {
|
||||
case <-s.done:
|
||||
|
@ -1013,7 +1013,11 @@ func (s *shards) enqueue(ref chunks.HeadSeriesRef, data interface{}) bool {
|
|||
select {
|
||||
case <-s.softShutdown:
|
||||
return false
|
||||
case s.queues[shard] <- data:
|
||||
default:
|
||||
appended := s.queues[shard].Append(data, s.softShutdown)
|
||||
if !appended {
|
||||
return false
|
||||
}
|
||||
switch data.(type) {
|
||||
case writeSample:
|
||||
s.qm.metrics.pendingSamples.Inc()
|
||||
|
@ -1028,7 +1032,93 @@ func (s *shards) enqueue(ref chunks.HeadSeriesRef, data interface{}) bool {
|
|||
}
|
||||
}
|
||||
|
||||
func (s *shards) runShard(ctx context.Context, shardID int, queue chan interface{}) {
|
||||
type queue struct {
|
||||
batch []interface{}
|
||||
batchQueue chan []interface{}
|
||||
|
||||
// Since we know there are a limited number of batches out, using a stack
|
||||
// is easy and safe so a sync.Pool is not necessary.
|
||||
batchPool [][]interface{}
|
||||
// This mutex covers adding and removing batches from the batchPool.
|
||||
poolMux sync.Mutex
|
||||
}
|
||||
|
||||
func newQueue(batchSize, capacity int) *queue {
|
||||
batches := capacity / batchSize
|
||||
return &queue{
|
||||
batch: make([]interface{}, 0, batchSize),
|
||||
batchQueue: make(chan []interface{}, batches),
|
||||
// batchPool should have capacity for everything in the channel + 1 for
|
||||
// the batch being processed.
|
||||
batchPool: make([][]interface{}, 0, batches+1),
|
||||
}
|
||||
}
|
||||
|
||||
func (q *queue) Append(datum interface{}, stop <-chan struct{}) bool {
|
||||
q.batch = append(q.batch, datum)
|
||||
if len(q.batch) == cap(q.batch) {
|
||||
select {
|
||||
case q.batchQueue <- q.batch:
|
||||
q.batch = q.newBatch(cap(q.batch))
|
||||
return true
|
||||
case <-stop:
|
||||
// Remove the sample we just appended. It will get retried.
|
||||
q.batch = q.batch[:len(q.batch)-1]
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func (q *queue) Chan() <-chan []interface{} {
|
||||
return q.batchQueue
|
||||
}
|
||||
|
||||
// Batch returns the current batch and allocates a new batch. Must not be
|
||||
// called concurrently with Append.
|
||||
func (q *queue) Batch() []interface{} {
|
||||
batch := q.batch
|
||||
q.batch = q.newBatch(cap(batch))
|
||||
return batch
|
||||
}
|
||||
|
||||
// ReturnForReuse adds the batch buffer back to the internal pool.
|
||||
func (q *queue) ReturnForReuse(batch []interface{}) {
|
||||
q.poolMux.Lock()
|
||||
defer q.poolMux.Unlock()
|
||||
if len(q.batchPool) < cap(q.batchPool) {
|
||||
q.batchPool = append(q.batchPool, batch[:0])
|
||||
}
|
||||
}
|
||||
|
||||
// FlushAndShutdown stops the queue and flushes any samples. No appends can be
|
||||
// made after this is called.
|
||||
func (q *queue) FlushAndShutdown(done <-chan struct{}) {
|
||||
if len(q.batch) > 0 {
|
||||
select {
|
||||
case q.batchQueue <- q.batch:
|
||||
case <-done:
|
||||
// The shard has been hard shut down, so no more samples can be
|
||||
// sent. Drop everything left in the queue.
|
||||
}
|
||||
}
|
||||
q.batch = nil
|
||||
close(q.batchQueue)
|
||||
}
|
||||
|
||||
func (q *queue) newBatch(capacity int) []interface{} {
|
||||
q.poolMux.Lock()
|
||||
defer q.poolMux.Unlock()
|
||||
batches := len(q.batchPool)
|
||||
if batches > 0 {
|
||||
batch := q.batchPool[batches-1]
|
||||
q.batchPool = q.batchPool[:batches-1]
|
||||
return batch
|
||||
}
|
||||
return make([]interface{}, 0, capacity)
|
||||
}
|
||||
|
||||
func (s *shards) runShard(ctx context.Context, shardID int, queue *queue) {
|
||||
defer func() {
|
||||
if s.running.Dec() == 0 {
|
||||
close(s.done)
|
||||
|
@ -1040,8 +1130,7 @@ func (s *shards) runShard(ctx context.Context, shardID int, queue chan interface
|
|||
// Send batches of at most MaxSamplesPerSend samples to the remote storage.
|
||||
// If we have fewer samples than that, flush them out after a deadline anyways.
|
||||
var (
|
||||
max = s.qm.cfg.MaxSamplesPerSend
|
||||
nPending, nPendingSamples, nPendingExemplars = 0, 0, 0
|
||||
max = s.qm.cfg.MaxSamplesPerSend
|
||||
|
||||
pBuf = proto.NewBuffer(nil)
|
||||
buf []byte
|
||||
|
@ -1050,6 +1139,7 @@ func (s *shards) runShard(ctx context.Context, shardID int, queue chan interface
|
|||
max += int(float64(max) * 0.1)
|
||||
}
|
||||
|
||||
batchQueue := queue.Chan()
|
||||
pendingData := make([]prompb.TimeSeries, max)
|
||||
for i := range pendingData {
|
||||
pendingData[i].Samples = []prompb.Sample{{}}
|
||||
|
@ -1074,8 +1164,8 @@ func (s *shards) runShard(ctx context.Context, shardID int, queue chan interface
|
|||
case <-ctx.Done():
|
||||
// In this case we drop all samples in the buffer and the queue.
|
||||
// Remove them from pending and mark them as failed.
|
||||
droppedSamples := nPendingSamples + int(s.enqueuedSamples.Load())
|
||||
droppedExemplars := nPendingExemplars + int(s.enqueuedExemplars.Load())
|
||||
droppedSamples := int(s.enqueuedSamples.Load())
|
||||
droppedExemplars := int(s.enqueuedExemplars.Load())
|
||||
s.qm.metrics.pendingSamples.Sub(float64(droppedSamples))
|
||||
s.qm.metrics.pendingExemplars.Sub(float64(droppedExemplars))
|
||||
s.qm.metrics.failedSamplesTotal.Add(float64(droppedSamples))
|
||||
|
@ -1084,66 +1174,71 @@ func (s *shards) runShard(ctx context.Context, shardID int, queue chan interface
|
|||
s.exemplarsDroppedOnHardShutdown.Add(uint32(droppedExemplars))
|
||||
return
|
||||
|
||||
case sample, ok := <-queue:
|
||||
case batch, ok := <-batchQueue:
|
||||
if !ok {
|
||||
if nPendingSamples > 0 || nPendingExemplars > 0 {
|
||||
level.Debug(s.qm.logger).Log("msg", "Flushing data to remote storage...", "samples", nPendingSamples, "exemplars", nPendingExemplars)
|
||||
s.sendSamples(ctx, pendingData[:nPending], nPendingSamples, nPendingExemplars, pBuf, &buf)
|
||||
s.qm.metrics.pendingSamples.Sub(float64(nPendingSamples))
|
||||
s.qm.metrics.pendingExemplars.Sub(float64(nPendingExemplars))
|
||||
level.Debug(s.qm.logger).Log("msg", "Done flushing.")
|
||||
}
|
||||
return
|
||||
}
|
||||
nPendingSamples, nPendingExemplars := s.populateTimeSeries(batch, pendingData)
|
||||
queue.ReturnForReuse(batch)
|
||||
n := nPendingSamples + nPendingExemplars
|
||||
s.sendSamples(ctx, pendingData[:n], nPendingSamples, nPendingExemplars, pBuf, &buf)
|
||||
s.qm.metrics.pendingSamples.Sub(float64(nPendingSamples))
|
||||
s.qm.metrics.pendingExemplars.Sub(float64(nPendingExemplars))
|
||||
|
||||
pendingData[nPending].Samples = pendingData[nPending].Samples[:0]
|
||||
if s.qm.sendExemplars {
|
||||
pendingData[nPending].Exemplars = pendingData[nPending].Exemplars[:0]
|
||||
}
|
||||
// Number of pending samples is limited by the fact that sendSamples (via sendSamplesWithBackoff)
|
||||
// retries endlessly, so once we reach max samples, if we can never send to the endpoint we'll
|
||||
// stop reading from the queue. This makes it safe to reference pendingSamples by index.
|
||||
switch d := sample.(type) {
|
||||
case writeSample:
|
||||
pendingData[nPending].Labels = labelsToLabelsProto(d.seriesLabels, pendingData[nPending].Labels)
|
||||
pendingData[nPending].Samples = append(pendingData[nPending].Samples, d.sample)
|
||||
nPendingSamples++
|
||||
nPending++
|
||||
|
||||
case writeExemplar:
|
||||
pendingData[nPending].Labels = labelsToLabelsProto(d.seriesLabels, pendingData[nPending].Labels)
|
||||
pendingData[nPending].Exemplars = append(pendingData[nPending].Exemplars, d.exemplar)
|
||||
nPendingExemplars++
|
||||
nPending++
|
||||
}
|
||||
|
||||
if nPending >= max {
|
||||
s.sendSamples(ctx, pendingData[:nPending], nPendingSamples, nPendingExemplars, pBuf, &buf)
|
||||
s.qm.metrics.pendingSamples.Sub(float64(nPendingSamples))
|
||||
s.qm.metrics.pendingExemplars.Sub(float64(nPendingExemplars))
|
||||
nPendingSamples = 0
|
||||
nPendingExemplars = 0
|
||||
nPending = 0
|
||||
|
||||
stop()
|
||||
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||
}
|
||||
stop()
|
||||
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||
|
||||
case <-timer.C:
|
||||
if nPendingSamples > 0 || nPendingExemplars > 0 {
|
||||
// We need to take the lock when getting a batch to avoid
|
||||
// concurrent Appends. Generally this will only happen on low
|
||||
// traffic instances.
|
||||
s.mtx.Lock()
|
||||
// First, we need to see if we can happen to get a batch from the queue if it filled while acquiring the lock.
|
||||
var batch []interface{}
|
||||
select {
|
||||
case batch = <-batchQueue:
|
||||
default:
|
||||
batch = queue.Batch()
|
||||
}
|
||||
s.mtx.Unlock()
|
||||
if len(batch) > 0 {
|
||||
nPendingSamples, nPendingExemplars := s.populateTimeSeries(batch, pendingData)
|
||||
n := nPendingSamples + nPendingExemplars
|
||||
level.Debug(s.qm.logger).Log("msg", "runShard timer ticked, sending buffered data", "samples", nPendingSamples, "exemplars", nPendingExemplars, "shard", shardNum)
|
||||
s.sendSamples(ctx, pendingData[:nPending], nPendingSamples, nPendingExemplars, pBuf, &buf)
|
||||
s.sendSamples(ctx, pendingData[:n], nPendingSamples, nPendingExemplars, pBuf, &buf)
|
||||
s.qm.metrics.pendingSamples.Sub(float64(nPendingSamples))
|
||||
s.qm.metrics.pendingExemplars.Sub(float64(nPendingExemplars))
|
||||
nPendingSamples = 0
|
||||
nPendingExemplars = 0
|
||||
nPending = 0
|
||||
}
|
||||
queue.ReturnForReuse(batch)
|
||||
timer.Reset(time.Duration(s.qm.cfg.BatchSendDeadline))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *shards) populateTimeSeries(batch []interface{}, pendingData []prompb.TimeSeries) (int, int) {
|
||||
var nPendingSamples, nPendingExemplars int
|
||||
for nPending, sample := range batch {
|
||||
pendingData[nPending].Samples = pendingData[nPending].Samples[:0]
|
||||
if s.qm.sendExemplars {
|
||||
pendingData[nPending].Exemplars = pendingData[nPending].Exemplars[:0]
|
||||
}
|
||||
// Number of pending samples is limited by the fact that sendSamples (via sendSamplesWithBackoff)
|
||||
// retries endlessly, so once we reach max samples, if we can never send to the endpoint we'll
|
||||
// stop reading from the queue. This makes it safe to reference pendingSamples by index.
|
||||
switch d := sample.(type) {
|
||||
case writeSample:
|
||||
pendingData[nPending].Labels = labelsToLabelsProto(d.seriesLabels, pendingData[nPending].Labels)
|
||||
pendingData[nPending].Samples = append(pendingData[nPending].Samples, d.sample)
|
||||
nPendingSamples++
|
||||
case writeExemplar:
|
||||
pendingData[nPending].Labels = labelsToLabelsProto(d.seriesLabels, pendingData[nPending].Labels)
|
||||
pendingData[nPending].Exemplars = append(pendingData[nPending].Exemplars, d.exemplar)
|
||||
nPendingExemplars++
|
||||
}
|
||||
}
|
||||
return nPendingSamples, nPendingExemplars
|
||||
}
|
||||
|
||||
func (s *shards) sendSamples(ctx context.Context, samples []prompb.TimeSeries, sampleCount, exemplarCount int, pBuf *proto.Buffer, buf *[]byte) {
|
||||
begin := time.Now()
|
||||
err := s.sendSamplesWithBackoff(ctx, samples, sampleCount, exemplarCount, pBuf, buf)
|
||||
|
|
Loading…
Reference in New Issue