mirror of https://github.com/hashicorp/consul
238 lines
5.7 KiB
Go
238 lines
5.7 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package hcp
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
hcpclient "github.com/hashicorp/consul/agent/hcp/client"
|
|
"github.com/hashicorp/consul/agent/hcp/config"
|
|
"github.com/hashicorp/consul/agent/hcp/scada"
|
|
"github.com/hashicorp/consul/lib"
|
|
"github.com/hashicorp/go-hclog"
|
|
)
|
|
|
|
var (
|
|
defaultManagerMinInterval = 45 * time.Minute
|
|
defaultManagerMaxInterval = 75 * time.Minute
|
|
)
|
|
|
|
type ManagerConfig struct {
|
|
Client hcpclient.Client
|
|
CloudConfig config.CloudConfig
|
|
SCADAProvider scada.Provider
|
|
TelemetryProvider *hcpProviderImpl
|
|
|
|
StatusFn StatusCallback
|
|
MinInterval time.Duration
|
|
MaxInterval time.Duration
|
|
|
|
Logger hclog.Logger
|
|
}
|
|
|
|
func (cfg *ManagerConfig) enabled() bool {
|
|
return cfg.Client != nil && cfg.StatusFn != nil
|
|
}
|
|
|
|
func (cfg *ManagerConfig) nextHeartbeat() time.Duration {
|
|
min := cfg.MinInterval
|
|
if min == 0 {
|
|
min = defaultManagerMinInterval
|
|
}
|
|
|
|
max := cfg.MaxInterval
|
|
if max == 0 {
|
|
max = defaultManagerMaxInterval
|
|
}
|
|
if max < min {
|
|
max = min
|
|
}
|
|
return min + lib.RandomStagger(max-min)
|
|
}
|
|
|
|
type StatusCallback func(context.Context) (hcpclient.ServerStatus, error)
|
|
|
|
type Manager struct {
|
|
logger hclog.Logger
|
|
|
|
cfg ManagerConfig
|
|
cfgMu sync.RWMutex
|
|
|
|
updateCh chan struct{}
|
|
|
|
// testUpdateSent is set by unit tests to signal when the manager's status update has triggered
|
|
testUpdateSent chan struct{}
|
|
}
|
|
|
|
// NewManager returns a Manager initialized with the given configuration.
|
|
func NewManager(cfg ManagerConfig) *Manager {
|
|
return &Manager{
|
|
logger: cfg.Logger,
|
|
cfg: cfg,
|
|
|
|
updateCh: make(chan struct{}, 1),
|
|
}
|
|
}
|
|
|
|
// Run executes the Manager it's designed to be run in its own goroutine for
|
|
// the life of a server agent. It should be run even if HCP is not configured
|
|
// yet for servers since a config update might configure it later and
|
|
// UpdateConfig called. It will effectively do nothing if there are no HCP
|
|
// credentials set other than wait for some to be added.
|
|
func (m *Manager) Run(ctx context.Context) error {
|
|
var err error
|
|
m.logger.Debug("HCP manager starting")
|
|
|
|
// Update and start the SCADA provider
|
|
err = m.startSCADAProvider()
|
|
if err != nil {
|
|
m.logger.Error("failed to start scada provider", "error", err)
|
|
return err
|
|
}
|
|
|
|
// Update and start the telemetry provider to enable the HCP metrics sink
|
|
if err := m.startTelemetryProvider(ctx); err != nil {
|
|
m.logger.Error("failed to update telemetry config provider", "error", err)
|
|
return err
|
|
}
|
|
|
|
// immediately send initial update
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-m.updateCh: // empty the update chan if there is a queued update to prevent repeated update in main loop
|
|
err = m.sendUpdate()
|
|
default:
|
|
err = m.sendUpdate()
|
|
}
|
|
|
|
// main loop
|
|
for {
|
|
m.cfgMu.RLock()
|
|
cfg := m.cfg
|
|
m.cfgMu.RUnlock()
|
|
nextUpdate := cfg.nextHeartbeat()
|
|
if err != nil {
|
|
m.logger.Error("failed to send server status to HCP", "err", err, "next_heartbeat", nextUpdate.String())
|
|
}
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
|
|
case <-m.updateCh:
|
|
err = m.sendUpdate()
|
|
|
|
case <-time.After(nextUpdate):
|
|
err = m.sendUpdate()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Manager) startSCADAProvider() error {
|
|
provider := m.cfg.SCADAProvider
|
|
if provider == nil {
|
|
return nil
|
|
}
|
|
|
|
// Update the SCADA provider configuration with HCP configurations
|
|
m.logger.Debug("updating scada provider with HCP configuration")
|
|
err := provider.UpdateHCPConfig(m.cfg.CloudConfig)
|
|
if err != nil {
|
|
m.logger.Error("failed to update scada provider with HCP configuration", "err", err)
|
|
return err
|
|
}
|
|
|
|
// Update the SCADA provider metadata
|
|
provider.UpdateMeta(map[string]string{
|
|
"consul_server_id": string(m.cfg.CloudConfig.NodeID),
|
|
})
|
|
|
|
// Start the SCADA provider
|
|
err = provider.Start()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (m *Manager) startTelemetryProvider(ctx context.Context) error {
|
|
if m.cfg.TelemetryProvider == nil {
|
|
return nil
|
|
}
|
|
|
|
m.cfg.TelemetryProvider.Run(ctx, &HCPProviderCfg{
|
|
HCPClient: m.cfg.Client,
|
|
HCPConfig: &m.cfg.CloudConfig,
|
|
})
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *Manager) UpdateConfig(cfg ManagerConfig) {
|
|
m.cfgMu.Lock()
|
|
defer m.cfgMu.Unlock()
|
|
old := m.cfg
|
|
m.cfg = cfg
|
|
if old.enabled() || cfg.enabled() {
|
|
// Only log about this if cloud is actually configured or it would be
|
|
// confusing. We check both old and new in case we are disabling cloud or
|
|
// enabling it or just updating it.
|
|
m.logger.Info("updated HCP configuration")
|
|
}
|
|
|
|
// Send a new status update since we might have just gotten connection details
|
|
// for the first time.
|
|
m.SendUpdate()
|
|
}
|
|
|
|
func (m *Manager) SendUpdate() {
|
|
m.logger.Debug("HCP triggering status update")
|
|
select {
|
|
case m.updateCh <- struct{}{}:
|
|
// trigger update
|
|
default:
|
|
// if chan is full then there is already an update triggered that will soon
|
|
// be acted on so don't bother blocking.
|
|
}
|
|
}
|
|
|
|
// TODO: we should have retried on failures here with backoff but take into
|
|
// account that if a new update is triggered while we are still retrying we
|
|
// should not start another retry loop. Something like have a "dirty" flag which
|
|
// we mark on first PushUpdate and then a retry timer as well as the interval
|
|
// and a "isRetrying" state or something so that we attempt to send update, but
|
|
// then fetch fresh info on each attempt to send so if we are already in a retry
|
|
// backoff a new push is a no-op.
|
|
func (m *Manager) sendUpdate() error {
|
|
m.cfgMu.RLock()
|
|
cfg := m.cfg
|
|
m.cfgMu.RUnlock()
|
|
|
|
if !cfg.enabled() {
|
|
return nil
|
|
}
|
|
|
|
if m.testUpdateSent != nil {
|
|
defer func() {
|
|
select {
|
|
case m.testUpdateSent <- struct{}{}:
|
|
default:
|
|
}
|
|
}()
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
|
|
s, err := cfg.StatusFn(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return m.cfg.Client.PushServerStatus(ctx, &s)
|
|
}
|