|
|
|
@ -19,10 +19,31 @@ import (
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
|
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
|
"github.com/prometheus/common/log"
|
|
|
|
|
"github.com/samuel/go-zookeeper/zk"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
var (
|
|
|
|
|
failureCounter = prometheus.NewCounter(prometheus.CounterOpts{
|
|
|
|
|
Namespace: "prometheus",
|
|
|
|
|
Subsystem: "treecache",
|
|
|
|
|
Name: "zookeeper_failures_total",
|
|
|
|
|
Help: "The total number of ZooKeeper failures.",
|
|
|
|
|
})
|
|
|
|
|
numWatchers = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
|
|
|
Namespace: "prometheus",
|
|
|
|
|
Subsystem: "treecache",
|
|
|
|
|
Name: "watcher_goroutines",
|
|
|
|
|
Help: "The current number of watcher goroutines.",
|
|
|
|
|
})
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
|
prometheus.MustRegister(failureCounter)
|
|
|
|
|
prometheus.MustRegister(numWatchers)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
type ZookeeperLogger struct {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -81,6 +102,7 @@ func (tc *ZookeeperTreeCache) loop(failureMode bool) {
|
|
|
|
|
retryChan := make(chan struct{})
|
|
|
|
|
|
|
|
|
|
failure := func() {
|
|
|
|
|
failureCounter.Inc()
|
|
|
|
|
failureMode = true
|
|
|
|
|
time.AfterFunc(time.Second*10, func() {
|
|
|
|
|
retryChan <- struct{}{}
|
|
|
|
@ -129,13 +151,19 @@ func (tc *ZookeeperTreeCache) loop(failureMode bool) {
|
|
|
|
|
}
|
|
|
|
|
case <-retryChan:
|
|
|
|
|
log.Infof("Attempting to resync state with Zookeeper")
|
|
|
|
|
previousState := &zookeeperTreeCacheNode{
|
|
|
|
|
children: tc.head.children,
|
|
|
|
|
}
|
|
|
|
|
// Reset root child nodes before traversing the Zookeeper path.
|
|
|
|
|
tc.head.children = make(map[string]*zookeeperTreeCacheNode)
|
|
|
|
|
err := tc.recursiveNodeUpdate(tc.prefix, tc.head)
|
|
|
|
|
if err != nil {
|
|
|
|
|
|
|
|
|
|
if err := tc.recursiveNodeUpdate(tc.prefix, tc.head); err != nil {
|
|
|
|
|
log.Errorf("Error during Zookeeper resync: %s", err)
|
|
|
|
|
// Revert to our previous state.
|
|
|
|
|
tc.head.children = previousState.children
|
|
|
|
|
failure()
|
|
|
|
|
} else {
|
|
|
|
|
tc.resyncState(tc.prefix, tc.head, previousState)
|
|
|
|
|
log.Infof("Zookeeper resync successful")
|
|
|
|
|
failureMode = false
|
|
|
|
|
}
|
|
|
|
@ -199,6 +227,7 @@ func (tc *ZookeeperTreeCache) recursiveNodeUpdate(path string, node *zookeeperTr
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
|
numWatchers.Inc()
|
|
|
|
|
// Pass up zookeeper events, until the node is deleted.
|
|
|
|
|
select {
|
|
|
|
|
case event := <-dataWatcher:
|
|
|
|
@ -207,10 +236,21 @@ func (tc *ZookeeperTreeCache) recursiveNodeUpdate(path string, node *zookeeperTr
|
|
|
|
|
node.events <- event
|
|
|
|
|
case <-node.done:
|
|
|
|
|
}
|
|
|
|
|
numWatchers.Dec()
|
|
|
|
|
}()
|
|
|
|
|
return nil
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (tc *ZookeeperTreeCache) resyncState(path string, currentState, previousState *zookeeperTreeCacheNode) {
|
|
|
|
|
for child, previousNode := range previousState.children {
|
|
|
|
|
if currentNode, present := currentState.children[child]; present {
|
|
|
|
|
tc.resyncState(path+"/"+child, currentNode, previousNode)
|
|
|
|
|
} else {
|
|
|
|
|
tc.recursiveDelete(path+"/"+child, previousNode)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func (tc *ZookeeperTreeCache) recursiveDelete(path string, node *zookeeperTreeCacheNode) {
|
|
|
|
|
if !node.stopped {
|
|
|
|
|
node.done <- struct{}{}
|
|
|
|
|