2023-03-28 18:39:22 +00:00
|
|
|
// Copyright (c) HashiCorp, Inc.
|
2023-08-11 13:12:13 +00:00
|
|
|
// SPDX-License-Identifier: BUSL-1.1
|
2023-03-28 18:39:22 +00:00
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
// Package bootstrap handles bootstrapping an agent's config from HCP.
|
2022-09-26 18:58:15 +00:00
|
|
|
package bootstrap
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"context"
|
2023-04-27 20:27:39 +00:00
|
|
|
"crypto/tls"
|
|
|
|
"crypto/x509"
|
2022-09-26 18:58:15 +00:00
|
|
|
"encoding/json"
|
2023-04-27 20:27:39 +00:00
|
|
|
"encoding/pem"
|
2022-09-26 18:58:15 +00:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"os"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
"github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/go-uuid"
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
"github.com/hashicorp/consul/agent/connect"
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
"github.com/hashicorp/consul/agent/hcp/bootstrap/constants"
|
HCP Telemetry Feature (#17460)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* [HCP Observability] Init OTELSink in Telemetry (#17162)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Initialize OTELSink with sync.Map for all the instrument stores.
* Added telemetry agent to client and init sink in deps
* Fixed client
* Initalize sink in deps
* init sink in telemetry library
* Init deps before telemetry
* Use concrete telemetry.OtelSink type
* add /v1/metrics
* Avoid returning err for telemetry init
* move sink init within the IsCloudEnabled()
* Use HCPSinkOpts in deps instead
* update golden test for configuration file
* Switch to using extra sinks in the telemetry library
* keep name MetricsConfig
* fix log in verifyCCMRegistration
* Set logger in context
* pass around MetricSink in deps
* Fix imports
* Rebased onto otel sink pr
* Fix URL in test
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* pass extraSinks as function param instead
* Add default interval as package export
* remove verifyCCM func
* Add clusterID
* Fix import and add t.Parallel() for missing tests
* Kick Vercel CI
* Remove scheme from endpoint path, and fix error logging
* return metrics.MetricSink for sink method
* Update SDK
* [HCP Observability] Metrics filtering and Labels in Go Metrics sink (#17184)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Initialize OTELSink with sync.Map for all the instrument stores.
* Added telemetry agent to client and init sink in deps
* Fixed client
* Initalize sink in deps
* init sink in telemetry library
* Init deps before telemetry
* Use concrete telemetry.OtelSink type
* add /v1/metrics
* Avoid returning err for telemetry init
* move sink init within the IsCloudEnabled()
* Use HCPSinkOpts in deps instead
* update golden test for configuration file
* Switch to using extra sinks in the telemetry library
* keep name MetricsConfig
* fix log in verifyCCMRegistration
* Set logger in context
* pass around MetricSink in deps
* Fix imports
* Rebased onto otel sink pr
* Fix URL in test
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* pass extraSinks as function param instead
* Add default interval as package export
* remove verifyCCM func
* Add clusterID
* Fix import and add t.Parallel() for missing tests
* Kick Vercel CI
* Remove scheme from endpoint path, and fix error logging
* return metrics.MetricSink for sink method
* Update SDK
* Added telemetry agent to client and init sink in deps
* Add node_id and __replica__ default labels
* add function for default labels and set x-hcp-resource-id
* Fix labels tests
* Commit suggestion for getDefaultLabels
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
* Fixed server.id, and t.Parallel()
* Make defaultLabels a method on the TelemetryConfig object
* Rename FilterList to lowercase filterList
* Cleanup filter implemetation by combining regex into a single one, and making the type lowercase
* Fix append
* use regex directly for filters
* Fix x-resource-id test to use mocked value
* Fix log.Error formats
* Forgot the len(opts.Label) optimization)
* Use cfg.NodeID instead
---------
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
* remove replic tag (#17484)
* [HCP Observability] Add custom metrics for OTEL sink, improve logging, upgrade modules and cleanup metrics client (#17455)
* Add custom metrics for Exporter and transform operations
* Improve deps logging
Run go mod tidy
* Upgrade SDK and OTEL
* Remove the partial success implemetation and check for HTTP status code in metrics client
* Add x-channel
* cleanup logs in deps.go based on PR feedback
* Change to debug log and lowercase
* address test operation feedback
* use GetHumanVersion on version
* Fix error wrapping
* Fix metric names
* [HCP Observability] Turn off retries for now until dynamically configurable (#17496)
* Remove retries for now until dynamic configuration is possible
* Clarify comment
* Update changelog
* improve changelog
---------
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
2023-05-29 20:11:08 +00:00
|
|
|
hcpclient "github.com/hashicorp/consul/agent/hcp/client"
|
2022-09-26 18:58:15 +00:00
|
|
|
"github.com/hashicorp/consul/lib"
|
|
|
|
"github.com/hashicorp/consul/lib/retry"
|
|
|
|
)
|
|
|
|
|
|
|
|
// File names used by the HCP bootstrap process. Each constant is a bare file
// name (no directory component); callers are expected to join it with the
// directory they persist bootstrap state in.
const (
	// CAFileName is the file name for the server TLS CA certificates (PEM).
	CAFileName = "server-tls-cas.pem"

	// CertFileName is the file name for the server TLS certificate (PEM).
	CertFileName = "server-tls-cert.pem"

	// ConfigFileName is the file name for the server configuration (JSON).
	ConfigFileName = "server-config.json"

	// KeyFileName is the file name for the server TLS private key (PEM).
	KeyFileName = "server-tls-key.pem"

	// TokenFileName is the file name for the HCP management token.
	TokenFileName = "hcp-management-token"

	// SuccessFileName is the file name of the successful-bootstrap marker.
	SuccessFileName = "successful-bootstrap"
)
|
|
|
|
|
|
|
|
// UI is a shim that lets the agent command hand in its mitchellh/cli.UI so
// that bootstrapping can print useful messages to the user. Without it, an
// agent that has to retry bootstrapping several times would appear to stall
// with no output, because intermediate warnings and errors would only be
// surfaced once everything had finished.
type UI interface {
	// Output prints a plain message.
	Output(string)

	// Warn prints a warning message.
	Warn(string)

	// Info prints an informational message.
	Info(string)

	// Error prints an error message.
	Error(string)
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
// RawBootstrapConfig bundles the two results of bootstrapping: the Consul
// configuration as a raw JSON string and the management token. Both values
// come either from previously persisted files or from the bootstrap endpoint.
type RawBootstrapConfig struct {
	// ConfigJSON is the Consul configuration, kept as an unparsed JSON string.
	ConfigJSON string

	// ManagementToken is the management token that accompanies the config.
	ManagementToken string
}
|
2022-09-26 18:58:15 +00:00
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
// FetchBootstrapConfig will fetch bootstrap configuration from remote servers and persist it to disk.
|
2023-04-27 20:27:39 +00:00
|
|
|
// It will retry until successful or a terminal error condition is found (e.g. permission denied).
|
2024-01-24 15:51:43 +00:00
|
|
|
func FetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string, ui UI) (*RawBootstrapConfig, error) {
|
2022-09-26 18:58:15 +00:00
|
|
|
w := retry.Waiter{
|
|
|
|
MinWait: 1 * time.Second,
|
|
|
|
MaxWait: 5 * time.Minute,
|
|
|
|
Jitter: retry.NewJitter(50),
|
|
|
|
}
|
|
|
|
|
|
|
|
for {
|
|
|
|
// Note we don't want to shadow `ctx` here since we need that for the Wait
|
|
|
|
// below.
|
|
|
|
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
|
|
defer cancel()
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
cfg, err := fetchBootstrapConfig(reqCtx, client, dataDir)
|
2022-09-26 18:58:15 +00:00
|
|
|
if err != nil {
|
2024-01-24 15:51:43 +00:00
|
|
|
if errors.Is(err, hcpclient.ErrUnauthorized) || errors.Is(err, hcpclient.ErrForbidden) {
|
|
|
|
// Don't retry on terminal errors
|
|
|
|
return nil, err
|
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
ui.Error(fmt.Sprintf("Error: failed to fetch bootstrap config from HCP, will retry in %s: %s",
|
2022-09-26 18:58:15 +00:00
|
|
|
w.NextWait().Round(time.Second), err))
|
|
|
|
if err := w.Wait(ctx); err != nil {
|
2023-04-27 20:27:39 +00:00
|
|
|
return nil, err
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
// Finished waiting, restart loop
|
|
|
|
continue
|
|
|
|
}
|
2024-01-24 15:51:43 +00:00
|
|
|
return cfg, nil
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
2024-01-24 15:51:43 +00:00
|
|
|
}
|
2022-09-26 18:58:15 +00:00
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
// fetchBootstrapConfig will fetch the bootstrap configuration from remote servers and persist it to disk.
|
|
|
|
func fetchBootstrapConfig(ctx context.Context, client hcpclient.Client, dataDir string) (*RawBootstrapConfig, error) {
|
|
|
|
reqCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
resp, err := client.FetchBootstrap(reqCtx)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to fetch bootstrap config from HCP: %w", err)
|
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
bsCfg := resp
|
|
|
|
devMode := dataDir == ""
|
2023-04-27 20:27:39 +00:00
|
|
|
cfgJSON, err := persistAndProcessConfig(dataDir, devMode, bsCfg)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to persist config for existing cluster: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return &RawBootstrapConfig{
|
|
|
|
ConfigJSON: cfgJSON,
|
|
|
|
ManagementToken: bsCfg.ManagementToken,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// persistAndProcessConfig is called when we receive data from CCM.
|
|
|
|
// We validate and persist everything that was received, then also update
|
|
|
|
// the JSON config as needed.
|
HCP Telemetry Feature (#17460)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* [HCP Observability] Init OTELSink in Telemetry (#17162)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Initialize OTELSink with sync.Map for all the instrument stores.
* Added telemetry agent to client and init sink in deps
* Fixed client
* Initalize sink in deps
* init sink in telemetry library
* Init deps before telemetry
* Use concrete telemetry.OtelSink type
* add /v1/metrics
* Avoid returning err for telemetry init
* move sink init within the IsCloudEnabled()
* Use HCPSinkOpts in deps instead
* update golden test for configuration file
* Switch to using extra sinks in the telemetry library
* keep name MetricsConfig
* fix log in verifyCCMRegistration
* Set logger in context
* pass around MetricSink in deps
* Fix imports
* Rebased onto otel sink pr
* Fix URL in test
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* pass extraSinks as function param instead
* Add default interval as package export
* remove verifyCCM func
* Add clusterID
* Fix import and add t.Parallel() for missing tests
* Kick Vercel CI
* Remove scheme from endpoint path, and fix error logging
* return metrics.MetricSink for sink method
* Update SDK
* [HCP Observability] Metrics filtering and Labels in Go Metrics sink (#17184)
* Move hcp client to subpackage hcpclient (#16800)
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* [HCP Observability] New MetricsClient (#17100)
* Client configured with TLS using HCP config and retry/throttle
* Add tests and godoc for metrics client
* close body after request
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* remove clone
* Extract CloudConfig and mock for future PR
* Switch to hclog.FromContext
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* [HCP Observability] OTELExporter (#17128)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Initialize OTELSink with sync.Map for all the instrument stores.
* Added telemetry agent to client and init sink in deps
* Fixed client
* Initalize sink in deps
* init sink in telemetry library
* Init deps before telemetry
* Use concrete telemetry.OtelSink type
* add /v1/metrics
* Avoid returning err for telemetry init
* move sink init within the IsCloudEnabled()
* Use HCPSinkOpts in deps instead
* update golden test for configuration file
* Switch to using extra sinks in the telemetry library
* keep name MetricsConfig
* fix log in verifyCCMRegistration
* Set logger in context
* pass around MetricSink in deps
* Fix imports
* Rebased onto otel sink pr
* Fix URL in test
* [HCP Observability] OTELSink (#17159)
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Create new OTELExporter which uses the MetricsClient
Add transform because the conversion is in an /internal package
* Fix lint error
* early return when there are no metrics
* Add NewOTELExporter() function
* Downgrade to metrics SDK version: v1.15.0-rc.1
* Fix imports
* fix small nits with comments and url.URL
* Fix tests by asserting actual error for context cancellation, fix parallel, and make mock more versatile
* Cleanup error handling and clarify empty metrics case
* Fix input/expected naming in otel_transform_test.go
* add comment for metric tracking
* Add a general isEmpty method
* Add clear error types
* update to latest version 1.15.0 of OTEL
* Client configured with TLS using HCP config and retry/throttle
* run go mod tidy
* Remove one abstraction to use the config from deps
* Address PR feedback
* Initialize OTELSink with sync.Map for all the instrument stores.
* Moved PeriodicReader init to NewOtelReader function. This allows us to use a ManualReader for tests.
* Switch to mutex instead of sync.Map to avoid type assertion
* Add gauge store
* Clarify comments
* return concrete sink type
* Fix lint errors
* Move gauge store to be within sink
* Use context.TODO,rebase and clenaup opts handling
* Rebase onto otl exporter to downgrade metrics API to v1.15.0-rc.1
* Fix imports
* Update to latest stable version by rebasing on cc-4933, fix import, remove mutex init, fix opts error messages and use logger from ctx
* Add lots of documentation to the OTELSink
* Fix gauge store comment and check ok
* Add select and ctx.Done() check to gauge callback
* use require.Equal for attributes
* Fixed import naming
* Remove float64 calls and add a NewGaugeStore method
* Change name Store to Set in gaugeStore, add concurrency tests in both OTELSink and gauge store
* Generate 100 gauge operations
* Seperate the labels into goroutines in sink test
* Generate kv store for the test case keys to avoid using uuid
* Added a race test with 300 samples for OTELSink
* Do not pass in waitgroup and use error channel instead.
* Using SHA 7dea2225a218872e86d2f580e82c089b321617b0 to avoid build failures in otel
* Fix nits
* pass extraSinks as function param instead
* Add default interval as package export
* remove verifyCCM func
* Add clusterID
* Fix import and add t.Parallel() for missing tests
* Kick Vercel CI
* Remove scheme from endpoint path, and fix error logging
* return metrics.MetricSink for sink method
* Update SDK
* Added telemetry agent to client and init sink in deps
* Add node_id and __replica__ default labels
* add function for default labels and set x-hcp-resource-id
* Fix labels tests
* Commit suggestion for getDefaultLabels
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
* Fixed server.id, and t.Parallel()
* Make defaultLabels a method on the TelemetryConfig object
* Rename FilterList to lowercase filterList
* Cleanup filter implemetation by combining regex into a single one, and making the type lowercase
* Fix append
* use regex directly for filters
* Fix x-resource-id test to use mocked value
* Fix log.Error formats
* Forgot the len(opts.Label) optimization)
* Use cfg.NodeID instead
---------
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
* remove replic tag (#17484)
* [HCP Observability] Add custom metrics for OTEL sink, improve logging, upgrade modules and cleanup metrics client (#17455)
* Add custom metrics for Exporter and transform operations
* Improve deps logging
Run go mod tidy
* Upgrade SDK and OTEL
* Remove the partial success implemetation and check for HTTP status code in metrics client
* Add x-channel
* cleanup logs in deps.go based on PR feedback
* Change to debug log and lowercase
* address test operation feedback
* use GetHumanVersion on version
* Fix error wrapping
* Fix metric names
* [HCP Observability] Turn off retries for now until dynamically configurable (#17496)
* Remove retries for now until dynamic configuration is possible
* Clarify comment
* Update changelog
* improve changelog
---------
Co-authored-by: Joshua Timmons <joshua.timmons1@gmail.com>
2023-05-29 20:11:08 +00:00
|
|
|
func persistAndProcessConfig(dataDir string, devMode bool, bsCfg *hcpclient.BootstrapConfig) (string, error) {
|
2023-04-27 20:27:39 +00:00
|
|
|
if devMode {
|
2022-09-26 18:58:15 +00:00
|
|
|
// Agent in dev mode, we still need somewhere to persist the certs
|
|
|
|
// temporarily though to be able to start up at all since we don't support
|
|
|
|
// inline certs right now. Use temp dir
|
|
|
|
tmp, err := os.MkdirTemp(os.TempDir(), "consul-dev-")
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("failed to create temp dir for certificates: %w", err)
|
|
|
|
}
|
|
|
|
dataDir = tmp
|
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
// Create subdir if it's not already there.
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
dir := filepath.Join(dataDir, constants.SubDir)
|
2023-04-27 20:27:39 +00:00
|
|
|
if err := lib.EnsurePath(dir, true); err != nil {
|
|
|
|
return "", fmt.Errorf("failed to ensure directory %q: %w", dir, err)
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
|
|
|
|
// Parse just to a map for now as we only have to inject to a specific place
|
|
|
|
// and parsing whole Config struct is complicated...
|
|
|
|
var cfg map[string]any
|
|
|
|
|
|
|
|
if err := json.Unmarshal([]byte(bsCfg.ConsulConfig), &cfg); err != nil {
|
|
|
|
return "", fmt.Errorf("failed to unmarshal bootstrap config: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Avoid ever setting an initial_management token from HCP now that we can
|
|
|
|
// separately bootstrap an HCP management token with a distinct accessor ID.
|
|
|
|
//
|
|
|
|
// CCM will continue to return an initial_management token because previous versions of Consul
|
|
|
|
// cannot bootstrap an HCP management token distinct from the initial management token.
|
|
|
|
// This block can be deleted once CCM supports tailoring bootstrap config responses
|
|
|
|
// based on the version of Consul that requested it.
|
|
|
|
acls, aclsOK := cfg["acl"].(map[string]any)
|
|
|
|
if aclsOK {
|
|
|
|
tokens, tokensOK := acls["tokens"].(map[string]interface{})
|
|
|
|
if tokensOK {
|
|
|
|
delete(tokens, "initial_management")
|
|
|
|
}
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
var cfgJSON string
|
|
|
|
if bsCfg.TLSCert != "" {
|
2024-01-24 15:51:43 +00:00
|
|
|
if err := ValidateTLSCerts(bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
|
2023-04-27 20:27:39 +00:00
|
|
|
return "", fmt.Errorf("invalid certificates: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Persist the TLS cert files from the response since we need to refer to them
|
|
|
|
// as disk files either way.
|
|
|
|
if err := persistTLSCerts(dir, bsCfg.TLSCert, bsCfg.TLSCertKey, bsCfg.TLSCAs); err != nil {
|
|
|
|
return "", fmt.Errorf("failed to persist TLS certificates to dir %q: %w", dataDir, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Store paths to the persisted TLS cert files.
|
2024-01-24 15:51:43 +00:00
|
|
|
cfg["ca_file"] = filepath.Join(dir, CAFileName)
|
|
|
|
cfg["cert_file"] = filepath.Join(dir, CertFileName)
|
|
|
|
cfg["key_file"] = filepath.Join(dir, KeyFileName)
|
2023-04-27 20:27:39 +00:00
|
|
|
|
|
|
|
// Convert the bootstrap config map back into a string
|
|
|
|
cfgJSONBytes, err := json.Marshal(cfg)
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
cfgJSON = string(cfgJSONBytes)
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
if !devMode {
|
|
|
|
// Persist the final config we need to add so that it is available locally after a restart.
|
|
|
|
// Assuming the configured data dir wasn't a tmp dir to start with.
|
|
|
|
if err := persistBootstrapConfig(dir, cfgJSON); err != nil {
|
|
|
|
return "", fmt.Errorf("failed to persist bootstrap config: %w", err)
|
|
|
|
}
|
|
|
|
|
2023-07-21 17:33:22 +00:00
|
|
|
// HCP only returns the management token if it requires Consul to
|
|
|
|
// initialize it
|
|
|
|
if bsCfg.ManagementToken != "" {
|
|
|
|
if err := validateManagementToken(bsCfg.ManagementToken); err != nil {
|
|
|
|
return "", fmt.Errorf("invalid management token: %w", err)
|
|
|
|
}
|
|
|
|
if err := persistManagementToken(dir, bsCfg.ManagementToken); err != nil {
|
|
|
|
return "", fmt.Errorf("failed to persist HCP management token: %w", err)
|
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
}
|
|
|
|
|
2023-07-21 17:33:22 +00:00
|
|
|
if err := persistSuccessMarker(dir); err != nil {
|
2023-04-27 20:27:39 +00:00
|
|
|
return "", fmt.Errorf("failed to persist success marker: %w", err)
|
|
|
|
}
|
|
|
|
}
|
2022-09-26 18:58:15 +00:00
|
|
|
return cfgJSON, nil
|
|
|
|
}
|
|
|
|
|
2023-07-21 17:33:22 +00:00
|
|
|
func persistSuccessMarker(dir string) error {
|
2024-01-24 15:51:43 +00:00
|
|
|
name := filepath.Join(dir, SuccessFileName)
|
2023-04-27 20:27:39 +00:00
|
|
|
return os.WriteFile(name, []byte(""), 0600)
|
2022-09-26 18:58:15 +00:00
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
}
|
2022-09-26 18:58:15 +00:00
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
func persistTLSCerts(dir string, serverCert, serverKey string, caCerts []string) error {
|
|
|
|
if serverCert == "" || serverKey == "" {
|
|
|
|
return fmt.Errorf("unexpected bootstrap response from HCP: missing TLS information")
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Write out CA cert(s). We write them all to one file because Go's x509
|
|
|
|
// machinery will read as many certs as it finds from each PEM file provided
|
|
|
|
// and add them separaetly to the CertPool for validation
|
2024-01-24 15:51:43 +00:00
|
|
|
f, err := os.OpenFile(filepath.Join(dir, CAFileName), os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0600)
|
2022-09-26 18:58:15 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
bf := bufio.NewWriter(f)
|
2023-04-27 20:27:39 +00:00
|
|
|
for _, caPEM := range caCerts {
|
2022-09-26 18:58:15 +00:00
|
|
|
bf.WriteString(caPEM + "\n")
|
|
|
|
}
|
|
|
|
if err := bf.Flush(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if err := f.Close(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
if err := os.WriteFile(filepath.Join(dir, CertFileName), []byte(serverCert), 0600); err != nil {
|
2022-09-26 18:58:15 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
if err := os.WriteFile(filepath.Join(dir, KeyFileName), []byte(serverKey), 0600); err != nil {
|
2022-09-26 18:58:15 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2023-07-21 17:33:22 +00:00
|
|
|
// Basic validation to ensure a UUID was loaded and assumes the token is non-empty
|
2023-04-27 20:27:39 +00:00
|
|
|
func validateManagementToken(token string) error {
|
2023-07-21 17:33:22 +00:00
|
|
|
// note: we assume that the token is not an empty string
|
2023-04-27 20:27:39 +00:00
|
|
|
if _, err := uuid.ParseUUID(token); err != nil {
|
|
|
|
return errors.New("management token is not a valid UUID")
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
2023-04-27 20:27:39 +00:00
|
|
|
return nil
|
|
|
|
}
|
2022-09-26 18:58:15 +00:00
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
func persistManagementToken(dir, token string) error {
|
2024-01-24 15:51:43 +00:00
|
|
|
name := filepath.Join(dir, TokenFileName)
|
2023-04-27 20:27:39 +00:00
|
|
|
return os.WriteFile(name, []byte(token), 0600)
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
func persistBootstrapConfig(dir, cfgJSON string) error {
|
2022-09-26 18:58:15 +00:00
|
|
|
// Persist the important bits we got from bootstrapping. The TLS certs are
|
|
|
|
// already persisted, just need to persist the config we are going to add.
|
2024-01-24 15:51:43 +00:00
|
|
|
name := filepath.Join(dir, ConfigFileName)
|
2022-11-10 16:26:01 +00:00
|
|
|
return os.WriteFile(name, []byte(cfgJSON), 0600)
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
func LoadPersistedBootstrapConfig(dataDir string, ui UI) (*RawBootstrapConfig, bool) {
|
2023-04-27 20:27:39 +00:00
|
|
|
if dataDir == "" {
|
|
|
|
// There's no files to load when in dev mode.
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
dir := filepath.Join(dataDir, constants.SubDir)
|
2023-04-27 20:27:39 +00:00
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
_, err := os.Stat(filepath.Join(dir, SuccessFileName))
|
2023-04-27 20:27:39 +00:00
|
|
|
if os.IsNotExist(err) {
|
|
|
|
// Haven't bootstrapped from HCP.
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
ui.Warn("failed to check for config on disk, re-fetching from HCP: " + err.Error())
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := checkCerts(dir); err != nil {
|
|
|
|
ui.Warn("failed to validate certs on disk, re-fetching from HCP: " + err.Error())
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
|
|
|
|
configJSON, err := loadBootstrapConfigJSON(dataDir)
|
|
|
|
if err != nil {
|
|
|
|
ui.Warn("failed to load bootstrap config from disk, re-fetching from HCP: " + err.Error())
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
|
|
|
|
mgmtToken, err := loadManagementToken(dir)
|
|
|
|
if err != nil {
|
|
|
|
ui.Warn("failed to load HCP management token from disk, re-fetching from HCP: " + err.Error())
|
|
|
|
return nil, false
|
|
|
|
}
|
|
|
|
|
|
|
|
return &RawBootstrapConfig{
|
|
|
|
ConfigJSON: configJSON,
|
|
|
|
ManagementToken: mgmtToken,
|
|
|
|
}, true
|
|
|
|
}
|
|
|
|
|
|
|
|
func loadBootstrapConfigJSON(dataDir string) (string, error) {
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
filename := filepath.Join(dataDir, constants.SubDir, ConfigFileName)
|
2023-04-27 20:27:39 +00:00
|
|
|
|
|
|
|
_, err := os.Stat(filename)
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return "", nil
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("failed to check for bootstrap config: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
jsonBs, err := os.ReadFile(filename)
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf(fmt.Sprintf("failed to read local bootstrap config file: %s", err))
|
|
|
|
}
|
|
|
|
return strings.TrimSpace(string(jsonBs)), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func loadManagementToken(dir string) (string, error) {
|
2024-01-24 15:51:43 +00:00
|
|
|
name := filepath.Join(dir, TokenFileName)
|
2023-04-27 20:27:39 +00:00
|
|
|
bytes, err := os.ReadFile(name)
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return "", errors.New("configuration files on disk are incomplete, missing: " + name)
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("failed to read: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
token := string(bytes)
|
|
|
|
if err := validateManagementToken(token); err != nil {
|
|
|
|
return "", fmt.Errorf("invalid management token: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return token, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func checkCerts(dir string) error {
|
2022-09-26 18:58:15 +00:00
|
|
|
files := []string{
|
2024-01-24 15:51:43 +00:00
|
|
|
filepath.Join(dir, CAFileName),
|
|
|
|
filepath.Join(dir, CertFileName),
|
|
|
|
filepath.Join(dir, KeyFileName),
|
2023-04-27 20:27:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
missing := make([]string, 0)
|
|
|
|
for _, file := range files {
|
|
|
|
_, err := os.Stat(file)
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
missing = append(missing, file)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
// If all the TLS files are missing, assume this is intentional.
|
|
|
|
// Existing clusters do not receive any TLS certs.
|
|
|
|
if len(missing) == len(files) {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// If only some of the files are missing, something went wrong.
|
|
|
|
if len(missing) > 0 {
|
|
|
|
return fmt.Errorf("configuration files on disk are incomplete, missing: %v", missing)
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
cert, key, caCerts, err := LoadCerts(dir)
|
2023-04-27 20:27:39 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to load certs from disk: %w", err)
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
if err = ValidateTLSCerts(cert, key, caCerts); err != nil {
|
2023-04-27 20:27:39 +00:00
|
|
|
return fmt.Errorf("invalid certs on disk: %w", err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
func LoadCerts(dir string) (cert, key string, caCerts []string, err error) {
|
|
|
|
certPEMBlock, err := os.ReadFile(filepath.Join(dir, CertFileName))
|
2023-04-27 20:27:39 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", "", nil, err
|
|
|
|
}
|
2024-01-24 15:51:43 +00:00
|
|
|
keyPEMBlock, err := os.ReadFile(filepath.Join(dir, KeyFileName))
|
2023-04-27 20:27:39 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", "", nil, err
|
|
|
|
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
caPEMs, err := os.ReadFile(filepath.Join(dir, CAFileName))
|
2023-04-27 20:27:39 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", "", nil, err
|
|
|
|
}
|
|
|
|
caCerts, err = splitCACerts(caPEMs)
|
2022-09-26 18:58:15 +00:00
|
|
|
if err != nil {
|
2023-04-27 20:27:39 +00:00
|
|
|
return "", "", nil, fmt.Errorf("failed to parse CA certs: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return string(certPEMBlock), string(keyPEMBlock), caCerts, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// splitCACerts breaks a buffer of concatenated PEM blocks into one string per
// block. It exists because the CA certs are stored together in a single file
// but are validated one at a time.
//
// Every decoded block must be of type CERTIFICATE, otherwise an error is
// returned. Decoding stops at the first position where no further PEM block
// can be found; if no block was found at all, an error is returned.
func splitCACerts(caPEMs []byte) ([]string, error) {
	var certs []string

	rest := caPEMs
	for block, tail := pem.Decode(rest); block != nil; block, tail = pem.Decode(rest) {
		if block.Type != "CERTIFICATE" {
			return nil, fmt.Errorf("PEM-block should be CERTIFICATE type")
		}

		// Keep the original PEM text rather than block.Bytes (which is the
		// decoded payload): everything consumed up to the start of the tail
		// belongs to this entry.
		consumed := len(rest) - len(tail)
		certs = append(certs, string(rest[:consumed]))
		rest = tail
	}

	if len(certs) == 0 {
		return nil, errors.New("invalid CA certificate")
	}
	return certs, nil
}
|
|
|
|
|
2024-01-24 15:51:43 +00:00
|
|
|
// ValidateTLSCerts checks that the CA cert, server cert, and key on disk are structurally valid.
|
2023-04-27 20:27:39 +00:00
|
|
|
//
|
|
|
|
// OPTIMIZE: This could be improved by returning an error if certs are expired or close to expiration.
|
|
|
|
// However, that requires issuing new certs on bootstrap requests, since returning an error
|
|
|
|
// would trigger a re-fetch from HCP.
|
2024-01-24 15:51:43 +00:00
|
|
|
func ValidateTLSCerts(cert, key string, caCerts []string) error {
|
2023-04-27 20:27:39 +00:00
|
|
|
leaf, err := tls.X509KeyPair([]byte(cert), []byte(key))
|
|
|
|
if err != nil {
|
|
|
|
return errors.New("invalid server certificate or key")
|
|
|
|
}
|
|
|
|
_, err = x509.ParseCertificate(leaf.Certificate[0])
|
|
|
|
if err != nil {
|
|
|
|
return errors.New("invalid server certificate")
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
|
|
|
|
2023-04-27 20:27:39 +00:00
|
|
|
for _, caCert := range caCerts {
|
|
|
|
_, err = connect.ParseCert(caCert)
|
|
|
|
if err != nil {
|
|
|
|
return errors.New("invalid CA certificate")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
2022-09-26 18:58:15 +00:00
|
|
|
}
|
2024-01-24 15:51:43 +00:00
|
|
|
|
|
|
|
// LoadManagementToken returns the management token, either by loading it from the persisted
|
|
|
|
// token config file or by fetching it from HCP if the token file does not exist.
|
|
|
|
func LoadManagementToken(ctx context.Context, logger hclog.Logger, client hcpclient.Client, dataDir string) (string, error) {
|
Move HCP Manager lifecycle management out of Link controller (#20401)
* Add function to get update channel for watching HCP Link
* Add MonitorHCPLink function
This function can be called in a goroutine to manage the lifecycle
of the HCP manager.
* Update HCP Manager config in link monitor before starting
This updates HCPMonitorLink so it updates the HCP manager
with an HCP client and management token when a Link is upserted.
* Let MonitorHCPManager handle lifecycle instead of link controller
* Remove cleanup from Link controller and move it to MonitorHCPLink
Previously, the Link Controller was responsible for cleaning up the
HCP-related files on the file system. This change makes it so
MonitorHCPLink handles this cleanup. As a result, we are able to remove
the PlacementEachServer placement strategy for the Link controller
because it no longer needs to do this per-node cleanup.
* Remove HCP Manager dependency from Link Controller
The Link controller does not need to have HCP Manager
as a dependency anymore, so this removes that dependency
in order to simplify the design.
* Add Linked prefix to Linked status variables
This is in preparation for adding a new status type to the
Link resource.
* Add new "validated" status type to link resource
The link resource controller will now set a "validated" status
in addition to the "linked" status. This is needed so that other
components (eg the HCP manager) know when the Link is ready to link
with HCP.
* Fix tests
* Handle new 'EndOfSnapshot' WatchList event
* Fix watch test
* Remove unnecessary config from TestAgent_scadaProvider
Since the Scada provider is now started on agent startup
regardless of whether a cloud config is provided, this removes
the cloud config override from the relevant test.
This change is not exactly related to the changes from this PR,
but rather is something small and sort of related that was noticed
while working on this PR.
* Simplify link watch test and remove sleep from link watch
This updates the link watch test so that it uses more mocks
and does not require setting up the infrastructure for the HCP Link
controller.
This also removes the time.Sleep delay in the link watcher loop in favor
of an error counter. When we receive 10 consecutive errors, we shut down
the link watcher loop.
* Add better logging for link validation. Remove EndOfSnapshot test.
* Refactor link monitor test into a table test
* Add some clarifying comments to link monitor
* Simplify link watch test
* Test a bunch more errors cases in link monitor test
* Use exponential backoff instead of errorCounter in LinkWatch
* Move link watch and link monitor into a single goroutine called from server.go
* Refactor HCP link watcher to use single go-routine.
Previously, if the WatchClient errored, we would've never recovered
because we never retry to create the stream. With this change,
we have a single goroutine that runs for the life of the server agent
and if the WatchClient stream ever errors, we retry the creation
of the stream with an exponential backoff.
2024-02-12 15:48:23 +00:00
|
|
|
hcpCfgDir := filepath.Join(dataDir, constants.SubDir)
|
2024-01-24 15:51:43 +00:00
|
|
|
token, err := loadManagementToken(hcpCfgDir)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
logger.Debug("failed to load management token from local disk, fetching configuration from HCP", "error", err)
|
|
|
|
var err error
|
|
|
|
cfg, err := fetchBootstrapConfig(ctx, client, dataDir)
|
|
|
|
if err != nil {
|
|
|
|
return "", err
|
|
|
|
}
|
|
|
|
logger.Debug("configuration fetched from HCP and saved on local disk")
|
|
|
|
token = cfg.ManagementToken
|
|
|
|
} else {
|
|
|
|
logger.Trace("loaded HCP configuration from local disk")
|
|
|
|
}
|
|
|
|
|
|
|
|
return token, nil
|
|
|
|
}
|