/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package etcd3

import (
	"context"
	"errors"
	"fmt"
	"os"
	"reflect"
	"strconv"
	"strings"
	"sync"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/apiserver/pkg/storage"
	"k8s.io/apiserver/pkg/storage/etcd3/metrics"
	"k8s.io/apiserver/pkg/storage/value"

	"go.etcd.io/etcd/clientv3"

	"k8s.io/klog/v2"
)

const (
	// We buffer both channels to reduce the number of context switches.
	incomingBufSize = 100
	outgoingBufSize = 100
)

// fatalOnDecodeError is used during testing to panic the server if the watcher encounters a decoding error.
var fatalOnDecodeError = false

// errTestingDecode is the only error that testingDeferOnDecodeError catches during a panic.
var errTestingDecode = errors.New("sentinel error only used during testing to indicate watch decoding error")

// testingDeferOnDecodeError is used during testing to recover from a panic caused by errTestingDecode;
// all other panic values continue to panic.
func testingDeferOnDecodeError() {
	if r := recover(); r != nil && r != errTestingDecode {
		panic(r)
	}
}
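
// A test that expects a decode failure might (hypothetically) exercise this
// hook as follows; any panic value other than errTestingDecode is re-raised:
//
//	defer testingDeferOnDecodeError()
//	panic(errTestingDecode) // swallowed by the deferred recover above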

func init() {
	// check to see if we are running in a test environment
	TestOnlySetFatalOnDecodeError(true)
	fatalOnDecodeError, _ = strconv.ParseBool(os.Getenv("KUBE_PANIC_WATCH_DECODE_ERROR"))
}

// TestOnlySetFatalOnDecodeError should only be used for cases where decode errors are expected and need to be tested, e.g. conversion webhooks.
func TestOnlySetFatalOnDecodeError(b bool) {
	fatalOnDecodeError = b
}

type watcher struct {
	client      *clientv3.Client
	codec       runtime.Codec
	newFunc     func() runtime.Object
	objectType  string
	versioner   storage.Versioner
	transformer value.Transformer
}

// watchChan implements watch.Interface.
type watchChan struct {
	watcher           *watcher
	key               string
	initialRev        int64
	recursive         bool
	progressNotify    bool
	internalPred      storage.SelectionPredicate
	ctx               context.Context
	cancel            context.CancelFunc
	incomingEventChan chan *event
	resultChan        chan watch.Event
	errChan           chan error
}

func newWatcher(client *clientv3.Client, codec runtime.Codec, newFunc func() runtime.Object, versioner storage.Versioner, transformer value.Transformer) *watcher {
	res := &watcher{
		client:      client,
		codec:       codec,
		newFunc:     newFunc,
		versioner:   versioner,
		transformer: transformer,
	}
	if newFunc == nil {
		res.objectType = "<unknown>"
	} else {
		res.objectType = reflect.TypeOf(newFunc()).String()
	}
	return res
}

// Watch watches on a key and returns a watch.Interface that transfers relevant notifications.
// If rev is zero, it will return the existing object(s) and then start watching from
// the maximum revision+1 of the returned objects.
// If rev is non-zero, it watches events that happened after the given revision.
// If recursive is false, it watches on the given key.
// If recursive is true, it watches any children and directories under the key, excluding the root key itself.
// pred must be non-nil. Only changes that match pred are returned.
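//
// A hypothetical caller (client, codec, and predicate setup assumed) might
// consume the returned interface like this:
//
//	wi, err := w.Watch(ctx, "/registry/pods/", 0, true, false, storage.Everything)
//	if err != nil {
//		return err
//	}
//	defer wi.Stop()
//	for ev := range wi.ResultChan() {
//		_ = ev // react to ev.Type and ev.Object
//	}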
func (w *watcher) Watch(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) (watch.Interface, error) {
	if recursive && !strings.HasSuffix(key, "/") {
		key += "/"
	}
	wc := w.createWatchChan(ctx, key, rev, recursive, progressNotify, pred)
	go wc.run()
	return wc, nil
}

func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) *watchChan {
	wc := &watchChan{
		watcher:           w,
		key:               key,
		initialRev:        rev,
		recursive:         recursive,
		progressNotify:    progressNotify,
		internalPred:      pred,
		incomingEventChan: make(chan *event, incomingBufSize),
		resultChan:        make(chan watch.Event, outgoingBufSize),
		errChan:           make(chan error, 1),
	}
	if pred.Empty() {
		// The filter doesn't filter out any object.
		wc.internalPred = storage.Everything
	}

	// The etcd server waits until it cannot find a leader for 3 election
	// timeouts to cancel existing streams. 3 is currently a hard-coded
	// constant. The election timeout defaults to 1000ms. If the cluster is
	// healthy, the leadership transfer should be smooth when the leader is
	// stopped (the leader transfers its leadership before stopping). If the
	// leader is hard-killed, the other servers take an election timeout to
	// realize the leader is lost and to start a campaign.
	wc.ctx, wc.cancel = context.WithCancel(clientv3.WithRequireLeader(ctx))
	return wc
}
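
// A sketch of the data flow coordinated by run and its helper goroutines:
//
//	etcd watch stream -> startWatching -> incomingEventChan
//	                  -> processEvent  -> resultChan -> user
//
// run tears everything down via wc.cancel on the first error, watch closure,
// or user cancellation.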
func (wc *watchChan) run() {
	watchClosedCh := make(chan struct{})
	go wc.startWatching(watchClosedCh)

	var resultChanWG sync.WaitGroup
	resultChanWG.Add(1)
	go wc.processEvent(&resultChanWG)

	select {
	case err := <-wc.errChan:
		if err == context.Canceled {
			break
		}
		errResult := transformErrorToEvent(err)
		if errResult != nil {
			// The error result is guaranteed to be received by the user before ResultChan is closed.
			select {
			case wc.resultChan <- *errResult:
			case <-wc.ctx.Done(): // user has given up all results
			}
		}
	case <-watchClosedCh:
	case <-wc.ctx.Done(): // user cancel
	}

	// We use wc.ctx to reap all goroutines. Whatever the condition, we should stop them all.
	// It's fine to cancel twice.
	wc.cancel()

	// We need to wait until resultChan is no longer used before closing it.
	resultChanWG.Wait()
	close(wc.resultChan)
}

func (wc *watchChan) Stop() {
	wc.cancel()
}

func (wc *watchChan) ResultChan() <-chan watch.Event {
	return wc.resultChan
}

// sync tries to retrieve existing data and send it to be processed.
// The revision to watch will be set to the revision in the response.
// All events sent will have isCreated=true.
func (wc *watchChan) sync() error {
	opts := []clientv3.OpOption{}
	if wc.recursive {
		opts = append(opts, clientv3.WithPrefix())
	}
	getResp, err := wc.watcher.client.Get(wc.ctx, wc.key, opts...)
	if err != nil {
		return err
	}
	wc.initialRev = getResp.Header.Revision
	for _, kv := range getResp.Kvs {
		wc.sendEvent(parseKV(kv))
	}
	return nil
}

// logWatchChannelErr logs the watch channel error. An mvcc revision compaction
// error is expected during normal operation and is logged as a warning;
// anything else is logged as an error.
func logWatchChannelErr(err error) {
	if !strings.Contains(err.Error(), "mvcc: required revision has been compacted") {
		klog.Errorf("watch chan error: %v", err)
	} else {
		klog.Warningf("watch chan error: %v", err)
	}
}

// startWatching does:
// - get current objects if initialRev=0; set initialRev to current rev
// - watch on the given key and send events to be processed.
func (wc *watchChan) startWatching(watchClosedCh chan struct{}) {
	if wc.initialRev == 0 {
		if err := wc.sync(); err != nil {
			klog.Errorf("failed to sync with latest state: %v", err)
			wc.sendError(err)
			return
		}
	}
	opts := []clientv3.OpOption{clientv3.WithRev(wc.initialRev + 1), clientv3.WithPrevKV()}
	if wc.recursive {
		opts = append(opts, clientv3.WithPrefix())
	}
	if wc.progressNotify {
		opts = append(opts, clientv3.WithProgressNotify())
	}
	wch := wc.watcher.client.Watch(wc.ctx, wc.key, opts...)
	for wres := range wch {
		if wres.Err() != nil {
			err := wres.Err()
			// If there is an error on the server (e.g. compaction), the channel returns it before being closed.
			logWatchChannelErr(err)
			wc.sendError(err)
			return
		}
		if wres.IsProgressNotify() {
			wc.sendEvent(progressNotifyEvent(wres.Header.GetRevision()))
			metrics.RecordEtcdBookmark(wc.watcher.objectType)
			continue
		}
		for _, e := range wres.Events {
			parsedEvent, err := parseEvent(e)
			if err != nil {
				logWatchChannelErr(err)
				wc.sendError(err)
				return
			}
			wc.sendEvent(parsedEvent)
		}
	}
	// When we reach this point, it's only possible that the client side has ended the watch,
	// e.g. by cancelling the context or closing the client.
	// If this watch chan is broken and the context isn't cancelled, other goroutines will still hang.
	// We should notify the main thread that this goroutine has exited.
	close(watchClosedCh)
}

// processEvent processes events from the etcd watcher and sends results to resultChan.
func (wc *watchChan) processEvent(wg *sync.WaitGroup) {
	defer wg.Done()

	for {
		select {
		case e := <-wc.incomingEventChan:
			res := wc.transform(e)
			if res == nil {
				continue
			}
			if len(wc.resultChan) == outgoingBufSize {
				klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow dispatching events to watchers", "outgoingEvents", outgoingBufSize)
			}
			// If the user can't receive results fast enough, we also block incoming events from the watcher,
			// because buffering events locally would only increase memory usage.
			// The worst case would be closing the fast watcher.
			select {
			case wc.resultChan <- *res:
			case <-wc.ctx.Done():
				return
			}
		case <-wc.ctx.Done():
			return
		}
	}
}

func (wc *watchChan) filter(obj runtime.Object) bool {
	if wc.internalPred.Empty() {
		return true
	}
	matched, err := wc.internalPred.Matches(obj)
	return err == nil && matched
}

func (wc *watchChan) acceptAll() bool {
	return wc.internalPred.Empty()
}

// transform transforms an event into a result for the user if it is not filtered out.
func (wc *watchChan) transform(e *event) (res *watch.Event) {
	curObj, oldObj, err := wc.prepareObjs(e)
	if err != nil {
		klog.Errorf("failed to prepare current and previous objects: %v", err)
		wc.sendError(err)
		return nil
	}

	switch {
	case e.isProgressNotify:
		if wc.watcher.newFunc == nil {
			return nil
		}
		object := wc.watcher.newFunc()
		if err := wc.watcher.versioner.UpdateObject(object, uint64(e.rev)); err != nil {
			klog.Errorf("failed to propagate object version: %v", err)
			return nil
		}
		res = &watch.Event{
			Type:   watch.Bookmark,
			Object: object,
		}
	case e.isDeleted:
		if !wc.filter(oldObj) {
			return nil
		}
		res = &watch.Event{
			Type:   watch.Deleted,
			Object: oldObj,
		}
	case e.isCreated:
		if !wc.filter(curObj) {
			return nil
		}
		res = &watch.Event{
			Type:   watch.Added,
			Object: curObj,
		}
	default:
		if wc.acceptAll() {
			res = &watch.Event{
				Type:   watch.Modified,
				Object: curObj,
			}
			return res
		}
		curObjPasses := wc.filter(curObj)
		oldObjPasses := wc.filter(oldObj)
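
		// Summary of how the filter transition maps to the event type
		// produced by the switch below:
		//
		//	old passes, cur passes -> Modified
		//	old fails,  cur passes -> Added
		//	old passes, cur fails  -> Deleted
		//	old fails,  cur fails  -> no event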
		switch {
		case curObjPasses && oldObjPasses:
			res = &watch.Event{
				Type:   watch.Modified,
				Object: curObj,
			}
		case curObjPasses && !oldObjPasses:
			res = &watch.Event{
				Type:   watch.Added,
				Object: curObj,
			}
		case !curObjPasses && oldObjPasses:
			res = &watch.Event{
				Type:   watch.Deleted,
				Object: oldObj,
			}
		}
	}
	return res
}

func transformErrorToEvent(err error) *watch.Event {
	err = interpretWatchError(err)
	if _, ok := err.(apierrors.APIStatus); !ok {
		err = apierrors.NewInternalError(err)
	}
	status := err.(apierrors.APIStatus).Status()
	return &watch.Event{
		Type:   watch.Error,
		Object: &status,
	}
}
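
// For example, a compacted-revision error from etcd would surface to the
// consumer as a single error event of roughly this shape (the exact status
// depends on interpretWatchError):
//
//	watch.Event{Type: watch.Error, Object: &metav1.Status{...}}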

func (wc *watchChan) sendError(err error) {
	select {
	case wc.errChan <- err:
	case <-wc.ctx.Done():
	}
}

func (wc *watchChan) sendEvent(e *event) {
	if len(wc.incomingEventChan) == incomingBufSize {
		klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow decoding, user not receiving fast, or other processing logic", "incomingEvents", incomingBufSize)
	}
	select {
	case wc.incomingEventChan <- e:
	case <-wc.ctx.Done():
	}
}

func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtime.Object, err error) {
	if e.isProgressNotify {
		// progressNotify events contain neither the current nor the previous object version.
		return nil, nil, nil
	}

	if !e.isDeleted {
		data, _, err := wc.watcher.transformer.TransformFromStorage(e.value, authenticatedDataString(e.key))
		if err != nil {
			return nil, nil, err
		}
		curObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev)
		if err != nil {
			return nil, nil, err
		}
	}
	// We need to decode prevValue only if this is a deletion event or
	// the underlying filter doesn't accept all objects (otherwise we
	// know that the filter for the previous object would return true and
	// we would need the object only to compute whether it was filtered
	// out before).
	if len(e.prevValue) > 0 && (e.isDeleted || !wc.acceptAll()) {
		data, _, err := wc.watcher.transformer.TransformFromStorage(e.prevValue, authenticatedDataString(e.key))
		if err != nil {
			return nil, nil, err
		}
		// Note that this sends the *old* object with the etcd revision for the time at
		// which it gets deleted.
		oldObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev)
		if err != nil {
			return nil, nil, err
		}
	}
	return curObj, oldObj, nil
}

func decodeObj(codec runtime.Codec, versioner storage.Versioner, data []byte, rev int64) (_ runtime.Object, err error) {
	obj, err := runtime.Decode(codec, data)
	if err != nil {
		if fatalOnDecodeError {
			// catch watch decode error iff we caused it on
			// purpose during a unit test
			defer testingDeferOnDecodeError()
			// we are running in a test environment and thus an
			// error here is due to a coder mistake if the defer
			// does not catch it
			panic(err)
		}
		return nil, err
	}
	// ensure resource version is set on the object we load from etcd
	if err := versioner.UpdateObject(obj, uint64(rev)); err != nil {
		return nil, fmt.Errorf("failure to version api object (%d) %#v: %v", rev, obj, err)
	}
	return obj, nil
}