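// Package sqllog implements kine's log and watch layer on top of a SQL
// dialect: events are appended as rows, listed and watched by revision, and
// old revisions are periodically compacted away.
//
// A rough usage sketch (how the server.Dialect is constructed is assumed to
// be handled elsewhere in kine, and the "/registry/" prefix is only an
// example):
//
//	l := sqllog.New(dialect)
//	if err := l.Start(ctx); err != nil {
//		// handle error
//	}
//	for events := range l.Watch(ctx, "/registry/") {
//		// consume batches of sequential events
//	}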
package sqllog

import (
	"context"
	"database/sql"
	"strings"
	"time"

	"github.com/k3s-io/kine/pkg/broadcaster"
	"github.com/k3s-io/kine/pkg/server"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)
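
// Tuning parameters for background compaction and change polling. The batch
// sizes keep individual compaction transactions and poll queries small, and
// compactMinRetain mirrors the behavior of never compacting away the most
// recent revisions (see safeCompactRev).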
const (
	compactInterval  = 5 * time.Minute
	compactTimeout   = 5 * time.Second
	compactMinRetain = 1000
	compactBatchSize = 1000
	pollBatchSize    = 500
)
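
// SQLLog implements a revision log on top of a server.Dialect. The notify
// channel is used to wake the poll loop as soon as new revisions are written.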
type SQLLog struct {
	d           server.Dialect
	broadcaster broadcaster.Broadcaster
	ctx         context.Context
	notify      chan int64
}
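
// New returns a SQLLog backed by the given dialect. Start should be called
// before use, as it records the context used by the background routines.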
func New(d server.Dialect) *SQLLog {
	l := &SQLLog{
		d:      d,
		notify: make(chan int64, 1024),
	}
	return l
}
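
// Start stores the context used by background routines and ensures that the
// compact_rev_key bookkeeping row exists.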
func (s *SQLLog) Start(ctx context.Context) error {
	s.ctx = ctx
	return s.compactStart(s.ctx)
}
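
// compactStart creates the compact_rev_key row if it does not yet exist, and
// removes duplicates of it left behind by an old bug, keeping only the most
// recently updated row.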
func (s *SQLLog) compactStart(ctx context.Context) error {
	logrus.Tracef("COMPACTSTART")

	rows, err := s.d.After(ctx, "compact_rev_key", 0, 0)
	if err != nil {
		return err
	}

	_, _, events, err := RowsToEvents(rows)
	if err != nil {
		return err
	}

	logrus.Tracef("COMPACTSTART len(events)=%v", len(events))

	if len(events) == 0 {
		_, err := s.Append(ctx, &server.Event{
			Create: true,
			KV: &server.KeyValue{
				Key:   "compact_rev_key",
				Value: []byte(""),
			},
		})
		return err
	} else if len(events) == 1 {
		return nil
	}

	t, err := s.d.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelSerializable})
	if err != nil {
		return err
	}
	defer t.MustRollback()

	// this is to work around a bug in which we ended up with two compact_rev_key rows
	maxRev := int64(0)
	maxID := int64(0)
	for _, event := range events {
		if event.PrevKV != nil && event.PrevKV.ModRevision > maxRev {
			maxRev = event.PrevKV.ModRevision
			maxID = event.KV.ModRevision
		}
		logrus.Tracef("COMPACTSTART maxRev=%v maxID=%v", maxRev, maxID)
	}

	for _, event := range events {
		logrus.Tracef("COMPACTSTART event.KV.ModRevision=%v maxID=%v", event.KV.ModRevision, maxID)
		if event.KV.ModRevision == maxID {
			continue
		}
		if err := t.DeleteRevision(ctx, event.KV.ModRevision); err != nil {
			return err
		}
	}

	return t.Commit()
}

// compactor periodically compacts historical versions of keys.
// It will compact keys with versions older than the given interval, but never within the last 1000 revisions.
// In other words, after compaction, it will only contain key revisions set during the last interval.
// Any API call for the older versions of keys will return an error.
// Interval is the time interval between each compaction. The first compaction happens after "interval".
// This logic is directly cribbed from k8s.io/apiserver/pkg/storage/etcd3/compact.go
func (s *SQLLog) compactor(interval time.Duration) {
	t := time.NewTicker(interval)
	compactRev, _ := s.d.GetCompactRevision(s.ctx)
	targetCompactRev, _ := s.d.CurrentRevision(s.ctx)
	logrus.Tracef("COMPACT starting compactRev=%d targetCompactRev=%d", compactRev, targetCompactRev)

outer:
	for {
		select {
		case <-s.ctx.Done():
			return
		case <-t.C:
		}

		// Break up the compaction into smaller batches to avoid locking the database with excessively
		// long transactions. When things are working normally deletes should proceed quite quickly, but if
		// run against a database where compaction has stalled (see rancher/k3s#1311) it may take a long time
		// (several hundred ms) just for the database to execute the subquery to select the revisions to delete.

		var (
			iterCompactRev int64
			compactedRev   int64
			currentRev     int64
			err            error
		)

		iterCompactRev = compactRev
		compactedRev = compactRev

		for iterCompactRev < targetCompactRev {
			// Move the iteration target compactBatchSize revisions forward, or
			// just as far as we need to hit the compaction target if that would
			// overshoot it.
			iterCompactRev += compactBatchSize
			if iterCompactRev > targetCompactRev {
				iterCompactRev = targetCompactRev
			}

			compactedRev, currentRev, err = s.compact(compactedRev, iterCompactRev)
			if err != nil {
				// ErrCompacted indicates that no further work is necessary - either compactRev changed since the
				// last iteration because another client has compacted, or the requested revision has already been compacted.
				if err == server.ErrCompacted {
					break
				} else {
					logrus.Errorf("Compact failed: %v", err)
					continue outer
				}
			}
		}

		// Record the final results for the outer loop
		compactRev = compactedRev
		targetCompactRev = currentRev
	}
}

// compact removes deleted or replaced rows from the database. compactRev is the revision that was last compacted to.
// If this changes between compactions, we know that someone else has compacted and we don't need to do it.
// targetCompactRev is the revision that we should try to compact to. Upon success, the function returns the revision
// compacted to, and the revision that we should try to compact to next time (the current revision).
// This logic is directly cribbed from k8s.io/apiserver/pkg/storage/etcd3/compact.go
func (s *SQLLog) compact(compactRev int64, targetCompactRev int64) (int64, int64, error) {
	ctx, cancel := context.WithTimeout(s.ctx, compactTimeout)
	defer cancel()

	t, err := s.d.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelSerializable})
	if err != nil {
		return compactRev, targetCompactRev, errors.Wrap(err, "failed to begin transaction")
	}
	defer t.MustRollback()

	currentRev, err := t.CurrentRevision(s.ctx)
	if err != nil {
		return compactRev, targetCompactRev, errors.Wrap(err, "failed to get current revision")
	}

	dbCompactRev, err := t.GetCompactRevision(s.ctx)
	if err != nil {
		return compactRev, targetCompactRev, errors.Wrap(err, "failed to get compact revision")
	}

	if compactRev != dbCompactRev {
		logrus.Tracef("COMPACT compact revision changed since last iteration: %d => %d", compactRev, dbCompactRev)
		return dbCompactRev, currentRev, server.ErrCompacted
	}

	// Ensure that we never compact the most recent 1000 revisions
	targetCompactRev = safeCompactRev(targetCompactRev, currentRev)

	// Don't bother compacting to a revision that has already been compacted
	if targetCompactRev <= compactRev {
		logrus.Tracef("COMPACT revision %d has already been compacted", targetCompactRev)
		return dbCompactRev, currentRev, server.ErrCompacted
	}

	logrus.Tracef("COMPACT compactRev=%d targetCompactRev=%d currentRev=%d", compactRev, targetCompactRev, currentRev)

	start := time.Now()
	deletedRows, err := t.Compact(s.ctx, targetCompactRev)
	if err != nil {
		return compactRev, targetCompactRev, errors.Wrapf(err, "failed to compact to revision %d", targetCompactRev)
	}

	if err := t.SetCompactRevision(s.ctx, targetCompactRev); err != nil {
		return compactRev, targetCompactRev, errors.Wrap(err, "failed to record compact revision")
	}

	t.MustCommit()
	logrus.Debugf("COMPACT deleted %d rows from %d revisions in %s - compacted to %d/%d", deletedRows, (targetCompactRev - compactRev), time.Since(start), targetCompactRev, currentRev)

	return targetCompactRev, currentRev, nil
}
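
// CurrentRevision returns the most recent revision in the database.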
func (s *SQLLog) CurrentRevision(ctx context.Context) (int64, error) {
	return s.d.CurrentRevision(ctx)
}
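
// After returns the events for keys matching prefix with revisions greater
// than the given revision, along with the latest revision seen. If the
// requested revision has already been compacted, server.ErrCompacted is
// returned together with the events that were found.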
func (s *SQLLog) After(ctx context.Context, prefix string, revision, limit int64) (int64, []*server.Event, error) {
	if strings.HasSuffix(prefix, "/") {
		prefix += "%"
	}

	rows, err := s.d.After(ctx, prefix, revision, limit)
	if err != nil {
		return 0, nil, err
	}

	rev, compact, result, err := RowsToEvents(rows)
	if revision > 0 && revision < compact {
		return rev, result, server.ErrCompacted
	}

	return rev, result, err
}
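
// List returns the events for keys matching prefix, optionally starting at
// startKey and/or reading at a specific revision, along with the revision the
// list was read at. server.ErrCompacted is returned if the requested revision
// has already been compacted.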
func (s *SQLLog) List(ctx context.Context, prefix, startKey string, limit, revision int64, includeDeleted bool) (int64, []*server.Event, error) {
	var (
		rows *sql.Rows
		err  error
	)

	// It's assumed that when a start key is passed, that key exists.
	if strings.HasSuffix(prefix, "/") {
		// At the start of a list the startKey will not exist, so set it to ""
		if prefix == startKey {
			startKey = ""
		}
		prefix += "%"
	} else {
		// Also, if this isn't a list, there is no reason to pass a startKey
		startKey = ""
	}

	if revision == 0 {
		rows, err = s.d.ListCurrent(ctx, prefix, limit, includeDeleted)
	} else {
		rows, err = s.d.List(ctx, prefix, startKey, limit, revision, includeDeleted)
	}
	if err != nil {
		return 0, nil, err
	}

	rev, compact, result, err := RowsToEvents(rows)
	if err != nil {
		return 0, nil, err
	}

	if revision > 0 && len(result) == 0 {
		// a zero-length result won't have the compact revision, so get it manually
		compact, err = s.d.GetCompactRevision(ctx)
		if err != nil {
			return 0, nil, err
		}
	}

	if revision > 0 && revision < compact {
		return rev, result, server.ErrCompacted
	}

	select {
	case s.notify <- rev:
	default:
	}

	return rev, result, err
}
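
// RowsToEvents scans all rows into events, returning the latest revision
// seen, the compact revision, and the decoded events.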
func RowsToEvents(rows *sql.Rows) (int64, int64, []*server.Event, error) {
	var (
		result  []*server.Event
		rev     int64
		compact int64
	)
	defer rows.Close()

	for rows.Next() {
		event := &server.Event{}
		if err := scan(rows, &rev, &compact, event); err != nil {
			return 0, 0, nil, err
		}
		result = append(result, event)
	}

	return rev, compact, result, nil
}
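
// Watch returns a channel of event batches for keys matching prefix. The
// underlying poll loop and compactor are started on the first subscription
// via the broadcaster.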
func (s *SQLLog) Watch(ctx context.Context, prefix string) <-chan []*server.Event {
	res := make(chan []*server.Event, 100)
	values, err := s.broadcaster.Subscribe(ctx, s.startWatch)
	if err != nil {
		return nil
	}

	checkPrefix := strings.HasSuffix(prefix, "/")

	go func() {
		defer close(res)
		for i := range values {
			events, ok := filter(i, checkPrefix, prefix)
			if ok {
				res <- events
			}
		}
	}()

	return res
}
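
// filter returns the events whose key matches the watched prefix (or equals
// the watched key exactly), and whether any events matched.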
func filter(events interface{}, checkPrefix bool, prefix string) ([]*server.Event, bool) {
	eventList := events.([]*server.Event)
	filteredEventList := make([]*server.Event, 0, len(eventList))

	for _, event := range eventList {
		if (checkPrefix && strings.HasPrefix(event.KV.Key, prefix)) || event.KV.Key == prefix {
			filteredEventList = append(filteredEventList, event)
		}
	}

	return filteredEventList, len(filteredEventList) > 0
}
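
// startWatch starts the background compactor and the poll loop, returning
// the channel that poll publishes event batches to.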
func (s *SQLLog) startWatch() (chan interface{}, error) {
	pollStart, err := s.d.GetCompactRevision(s.ctx)
	if err != nil {
		return nil, err
	}

	c := make(chan interface{})
	// Start compaction and polling at the same time, so that watches start
	// at the oldest retained revision and compaction doesn't create gaps.
	go s.compactor(compactInterval)
	go s.poll(c, pollStart)
	return c, nil
}
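
// poll repeatedly reads rows newer than the last revision seen and publishes
// them to result in strictly increasing revision order, filling or skipping
// gaps in the revision sequence as needed.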
func (s *SQLLog) poll(result chan interface{}, pollStart int64) {
	var (
		last        = pollStart
		skip        int64
		skipTime    time.Time
		waitForMore = true
	)

	wait := time.NewTicker(time.Second)
	defer wait.Stop()
	defer close(result)

	for {
		if waitForMore {
			select {
			case <-s.ctx.Done():
				return
			case check := <-s.notify:
				if check <= last {
					continue
				}
			case <-wait.C:
			}
		}
		waitForMore = true

		rows, err := s.d.After(s.ctx, "%", last, pollBatchSize)
		if err != nil {
			logrus.Errorf("failed to list latest changes: %v", err)
			continue
		}

		_, _, events, err := RowsToEvents(rows)
		if err != nil {
			logrus.Errorf("failed to convert rows to events: %v", err)
			continue
		}

		if len(events) == 0 {
			continue
		}

		waitForMore = len(events) < 100

		rev := last
		var (
			sequential []*server.Event
			saveLast   bool
		)

		for _, event := range events {
			next := rev + 1
			// Ensure that we are notifying events in a sequential fashion. For example if we find row 4 before 3
			// we don't want to notify row 4 because 3 is essentially dropped forever.
			if event.KV.ModRevision != next {
				logrus.Tracef("MODREVISION GAP: expected %v, got %v", next, event.KV.ModRevision)
				if canSkipRevision(next, skip, skipTime) {
					// This situation should never happen, but we keep it as a fallback so that
					// an unexplained gap doesn't pause all watches forever.
					logrus.Errorf("GAP %s, revision=%d, delete=%v, next=%d", event.KV.Key, event.KV.ModRevision, event.Delete, next)
				} else if skip != next {
					// This is the first time we have encountered this missing revision, so record the start time
					// and trigger a quick retry for simple out-of-order events
					skip = next
					skipTime = time.Now()
					select {
					case s.notify <- next:
					default:
					}
					break
				} else {
					if err := s.d.Fill(s.ctx, next); err == nil {
						logrus.Tracef("FILL, revision=%d, err=%v", next, err)
						select {
						case s.notify <- next:
						default:
						}
					} else {
						logrus.Tracef("FILL FAILED, revision=%d, err=%v", next, err)
					}
					break
				}
			}

			// We have now done something, so we should save the last revision. We don't save it here because
			// the next loop could fail, leading to saving the reported revision without reporting it. In practice this
			// loop right now has no error exit, so the next loop shouldn't fail, but if we for some reason add a method
			// that returns an error, that would be a tricky bug to find. So instead we only save the last revision at
			// the same time we write to the channel.
			saveLast = true
			rev = event.KV.ModRevision
			if s.d.IsFill(event.KV.Key) {
				logrus.Tracef("NOT TRIGGER FILL %s, revision=%d, delete=%v", event.KV.Key, event.KV.ModRevision, event.Delete)
			} else {
				sequential = append(sequential, event)
				logrus.Tracef("TRIGGERED %s, revision=%d, delete=%v", event.KV.Key, event.KV.ModRevision, event.Delete)
			}
		}

		if saveLast {
			last = rev
			if len(sequential) > 0 {
				result <- sequential
			}
		}
	}
}
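
// canSkipRevision reports whether the missing revision is the one currently
// being waited on and has been outstanding for more than a second.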
func canSkipRevision(rev, skip int64, skipTime time.Time) bool {
	return rev == skip && time.Since(skipTime) > time.Second
}
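
// Count returns the current revision and the number of keys matching prefix.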
func (s *SQLLog) Count(ctx context.Context, prefix string) (int64, int64, error) {
	if strings.HasSuffix(prefix, "/") {
		prefix += "%"
	}
	return s.d.Count(ctx, prefix)
}
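
// Append inserts a new event row and notifies the poll loop of the new
// revision. Nil KV or PrevKV fields are replaced with empty values before
// insertion.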
func (s *SQLLog) Append(ctx context.Context, event *server.Event) (int64, error) {
	e := *event
	if e.KV == nil {
		e.KV = &server.KeyValue{}
	}
	if e.PrevKV == nil {
		e.PrevKV = &server.KeyValue{}
	}

	rev, err := s.d.Insert(ctx, e.KV.Key,
		e.Create,
		e.Delete,
		e.KV.CreateRevision,
		e.PrevKV.ModRevision,
		e.KV.Lease,
		e.KV.Value,
		e.PrevKV.Value,
	)
	if err != nil {
		return 0, err
	}
	select {
	case s.notify <- rev:
	default:
	}
	return rev, nil
}
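
// scan reads a single row into an event, also extracting the latest-revision
// and compact-revision columns returned by the query.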
func scan(rows *sql.Rows, rev *int64, compact *int64, event *server.Event) error {
	event.KV = &server.KeyValue{}
	event.PrevKV = &server.KeyValue{}

	c := &sql.NullInt64{}

	err := rows.Scan(
		rev,
		c,
		&event.KV.ModRevision,
		&event.KV.Key,
		&event.Create,
		&event.Delete,
		&event.KV.CreateRevision,
		&event.PrevKV.ModRevision,
		&event.KV.Lease,
		&event.KV.Value,
		&event.PrevKV.Value,
	)
	if err != nil {
		return err
	}

	if event.Create {
		event.KV.CreateRevision = event.KV.ModRevision
		event.PrevKV = nil
	}

	*compact = c.Int64
	return nil
}

// safeCompactRev ensures that we never compact the most recent 1000 revisions.
func safeCompactRev(targetCompactRev int64, currentRev int64) int64 {
	safeRev := currentRev - compactMinRetain
	if targetCompactRev < safeRev {
		safeRev = targetCompactRev
	}
	if safeRev < 0 {
		safeRev = 0
	}
	return safeRev
}
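
// DbSize returns the size of the database as reported by the dialect.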
func (s *SQLLog) DbSize(ctx context.Context) (int64, error) {
	return s.d.GetSize(ctx)
}