You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
consul/agent/operator_endpoint.go

454 lines
14 KiB

package agent
import (
"fmt"
"net/http"
"strconv"
"time"
"github.com/armon/go-metrics"
external "github.com/hashicorp/consul/agent/grpc-external"
Protobuf Refactoring for Multi-Module Cleanliness (#16302) Protobuf Refactoring for Multi-Module Cleanliness This commit includes the following: Moves all packages that were within proto/ to proto/private Rewrites imports to account for the packages being moved Adds in buf.work.yaml to enable buf workspaces Names the proto-public buf module so that we can override the Go package imports within proto/buf.yaml Bumps the buf version dependency to 1.14.0 (I was trying out the version to see if it would get around an issue - it didn't but it also doesn't break things and it seemed best to keep up with the toolchain changes) Why: In the future we will need to consume other protobuf dependencies such as the Google HTTP annotations for openapi generation or grpc-gateway usage. There were some recent changes to have our own ratelimiting annotations. The two combined were not working when I was trying to use them together (attempting to rebase another branch) Buf workspaces should be the solution to the problem Buf workspaces means that each module will have generated Go code that embeds proto file names relative to the proto dir and not the top level repo root. This resulted in proto file name conflicts in the Go global protobuf type registry. The solution to that was to add in a private/ directory into the path within the proto/ directory. That then required rewriting all the imports. Is this safe? AFAICT yes The gRPC wire protocol doesn't seem to care about the proto file names (although the Go grpc code does tack on the proto file name as Metadata in the ServiceDesc) Other than imports, there were no changes to any generated code as a result of this.
2 years ago
"github.com/hashicorp/consul/proto/private/pboperator"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/api"
)
// OperatorRaftConfiguration is used to inspect the current Raft configuration.
// This supports the stale query mode in case the cluster doesn't have a leader.
func (s *HTTPHandlers) OperatorRaftConfiguration(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args structs.DCSpecificRequest
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
var reply structs.RaftConfigurationResponse
if err := s.agent.RPC(req.Context(), "Operator.RaftGetConfiguration", &args, &reply); err != nil {
return nil, err
}
return reply, nil
}
Leadership transfer cmd (#14132) * add leadership transfer command * add RPC call test (flaky) * add missing import * add changelog * add command registration * Apply suggestions from code review Co-authored-by: Matt Keeler <mkeeler@users.noreply.github.com> * add the possibility of providing an id to raft leadership transfer. Add few tests. * delete old file from cherry pick * rename changelog filename to PR # * rename changelog and fix import * fix failing test * check for OperatorWrite Co-authored-by: Matt Keeler <mkeeler@users.noreply.github.com> * rename from leader-transfer to transfer-leader * remove version check and add test for operator read * move struct to operator.go * first pass * add code for leader transfer in the grpc backend and tests * wire the http endpoint to the new grpc endpoint * remove the RPC endpoint * remove non needed struct * fix naming * add mog glue to API * fix comment * remove dead code * fix linter error * change package name for proto file * remove error wrapping * fix failing test * add command registration * add grpc service mock tests * fix receiver to be pointer * use defined values Co-authored-by: Matt Keeler <mkeeler@users.noreply.github.com> * reuse MockAclAuthorizer * add documentation * remove usage of external.TokenFromContext * fix failing tests * fix proto generation * Apply suggestions from code review Co-authored-by: Jared Kirschner <85913323+jkirschner-hashicorp@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Jared Kirschner <85913323+jkirschner-hashicorp@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Jared Kirschner <85913323+jkirschner-hashicorp@users.noreply.github.com> * Apply suggestions from code review * add more context in doc for the reason * Apply suggestions from docs code review Co-authored-by: Jeff Boruszak <104028618+boruszak@users.noreply.github.com> * regenerate proto * fix linter errors Co-authored-by: github-team-consul-core <github-team-consul-core@hashicorp.com> Co-authored-by: Matt Keeler <mkeeler@users.noreply.github.com> Co-authored-by: Jared Kirschner <85913323+jkirschner-hashicorp@users.noreply.github.com> Co-authored-by: Jeff Boruszak <104028618+boruszak@users.noreply.github.com>
2 years ago
// OperatorRaftTransferLeader is used to transfer raft cluster leadership to another node
func (s *HTTPHandlers) OperatorRaftTransferLeader(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var entMeta acl.EnterpriseMeta
if err := s.parseEntMetaPartition(req, &entMeta); err != nil {
return nil, err
}
params := req.URL.Query()
_, hasID := params["id"]
ID := ""
if hasID {
ID = params.Get("id")
}
args := pboperator.TransferLeaderRequest{
ID: ID,
}
var token string
s.parseToken(req, &token)
ctx, err := external.ContextWithQueryOptions(req.Context(), structs.QueryOptions{Token: token})
if err != nil {
return nil, err
}
result, err := s.agent.rpcClientOperator.TransferLeader(ctx, &args)
if err != nil {
return nil, err
}
if result.Success != true {
return nil, HTTPError{StatusCode: http.StatusNotFound, Reason: fmt.Sprintf("Failed to transfer Leader: %s", err.Error())}
}
reply := new(api.TransferLeaderResponse)
pboperator.TransferLeaderResponseToAPI(result, reply)
return reply, nil
}
// OperatorRaftPeer supports actions on Raft peers. Currently we only support
// removing peers by address.
func (s *HTTPHandlers) OperatorRaftPeer(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args structs.RaftRemovePeerRequest
s.parseDC(req, &args.Datacenter)
s.parseToken(req, &args.Token)
params := req.URL.Query()
_, hasID := params["id"]
if hasID {
args.ID = raft.ServerID(params.Get("id"))
}
_, hasAddress := params["address"]
if hasAddress {
args.Address = raft.ServerAddress(params.Get("address"))
}
if !hasID && !hasAddress {
return nil, HTTPError{
StatusCode: http.StatusBadRequest,
Reason: "Must specify either ?id with the server's ID or ?address with IP:port of peer to remove",
}
}
if hasID && hasAddress {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: "Must specify only one of ?id or ?address"}
}
var reply struct{}
method := "Operator.RaftRemovePeerByID"
if hasAddress {
method = "Operator.RaftRemovePeerByAddress"
}
if err := s.agent.RPC(req.Context(), method, &args, &reply); err != nil {
return nil, err
}
return nil, nil
}
type keyringArgs struct {
Key string
Token string
RelayFactor uint8
LocalOnly bool // ?local-only; only used for GET requests
}
// OperatorKeyringEndpoint handles keyring operations (install, list, use, remove)
func (s *HTTPHandlers) OperatorKeyringEndpoint(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args keyringArgs
if req.Method == "POST" || req.Method == "PUT" || req.Method == "DELETE" {
if err := decodeBody(req.Body, &args); err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Request decode failed: %v", err)}
}
}
s.parseToken(req, &args.Token)
// Parse relay factor
if relayFactor := req.URL.Query().Get("relay-factor"); relayFactor != "" {
n, err := strconv.Atoi(relayFactor)
if err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Error parsing relay factor: %v", err)}
}
args.RelayFactor, err = ParseRelayFactor(n)
if err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Invalid relay-factor: %v", err)}
}
}
// Parse local-only. local-only can only be used in GET requests.
if localOnly := req.URL.Query().Get("local-only"); localOnly != "" {
var err error
args.LocalOnly, err = strconv.ParseBool(localOnly)
if err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Error parsing local-only: %v", err)}
}
err = ValidateLocalOnly(args.LocalOnly, req.Method == "GET")
if err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Invalid use of local-only: %v", err)}
}
}
// Switch on the method
switch req.Method {
case "GET":
return s.KeyringList(resp, req, &args)
case "POST":
return s.KeyringInstall(resp, req, &args)
case "PUT":
return s.KeyringUse(resp, req, &args)
case "DELETE":
return s.KeyringRemove(resp, req, &args)
default:
return nil, MethodNotAllowedError{req.Method, []string{"GET", "POST", "PUT", "DELETE"}}
}
}
// KeyringInstall is used to install a new gossip encryption key into the cluster
func (s *HTTPHandlers) KeyringInstall(resp http.ResponseWriter, req *http.Request, args *keyringArgs) (interface{}, error) {
responses, err := s.agent.InstallKey(args.Key, args.Token, args.RelayFactor)
if err != nil {
return nil, err
}
return nil, keyringErrorsOrNil(responses.Responses)
}
// KeyringList is used to list the keys installed in the cluster
func (s *HTTPHandlers) KeyringList(resp http.ResponseWriter, req *http.Request, args *keyringArgs) (interface{}, error) {
responses, err := s.agent.ListKeys(args.Token, args.LocalOnly, args.RelayFactor)
if err != nil {
return nil, err
}
return responses.Responses, keyringErrorsOrNil(responses.Responses)
}
// KeyringRemove is used to list the keys installed in the cluster
func (s *HTTPHandlers) KeyringRemove(resp http.ResponseWriter, req *http.Request, args *keyringArgs) (interface{}, error) {
responses, err := s.agent.RemoveKey(args.Key, args.Token, args.RelayFactor)
if err != nil {
return nil, err
}
return nil, keyringErrorsOrNil(responses.Responses)
}
// KeyringUse is used to change the primary gossip encryption key
func (s *HTTPHandlers) KeyringUse(resp http.ResponseWriter, req *http.Request, args *keyringArgs) (interface{}, error) {
responses, err := s.agent.UseKey(args.Key, args.Token, args.RelayFactor)
if err != nil {
return nil, err
}
return nil, keyringErrorsOrNil(responses.Responses)
}
func keyringErrorsOrNil(responses []*structs.KeyringResponse) error {
var errs error
for _, response := range responses {
if response.Error != "" {
pool := response.Datacenter + " (LAN)"
if response.WAN {
pool = "WAN"
}
if response.Segment != "" {
pool += " [segment: " + response.Segment + "]"
} else if !acl.IsDefaultPartition(response.Partition) {
pool += " [partition: " + response.Partition + "]"
}
errs = multierror.Append(errs, fmt.Errorf("%s error: %s", pool, response.Error))
for key, message := range response.Messages {
errs = multierror.Append(errs, fmt.Errorf("%s: %s", key, message))
}
}
}
return errs
}
// OperatorAutopilotConfiguration is used to inspect the current Autopilot configuration.
// This supports the stale query mode in case the cluster doesn't have a leader.
func (s *HTTPHandlers) OperatorAutopilotConfiguration(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Switch on the method
switch req.Method {
case "GET":
var args structs.DCSpecificRequest
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
var reply structs.AutopilotConfig
if err := s.agent.RPC(req.Context(), "Operator.AutopilotGetConfiguration", &args, &reply); err != nil {
return nil, err
}
out := api.AutopilotConfiguration{
CleanupDeadServers: reply.CleanupDeadServers,
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
MaxTrailingLogs: reply.MaxTrailingLogs,
MinQuorum: reply.MinQuorum,
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
RedundancyZoneTag: reply.RedundancyZoneTag,
DisableUpgradeMigration: reply.DisableUpgradeMigration,
UpgradeVersionTag: reply.UpgradeVersionTag,
CreateIndex: reply.CreateIndex,
ModifyIndex: reply.ModifyIndex,
}
return out, nil
case "PUT":
var args structs.AutopilotSetConfigRequest
s.parseDC(req, &args.Datacenter)
s.parseToken(req, &args.Token)
conf := api.NewAutopilotConfiguration()
if err := decodeBody(req.Body, &conf); err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Error parsing autopilot config: %v", err)}
}
args.Config = structs.AutopilotConfig{
CleanupDeadServers: conf.CleanupDeadServers,
LastContactThreshold: conf.LastContactThreshold.Duration(),
MaxTrailingLogs: conf.MaxTrailingLogs,
MinQuorum: conf.MinQuorum,
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
RedundancyZoneTag: conf.RedundancyZoneTag,
DisableUpgradeMigration: conf.DisableUpgradeMigration,
UpgradeVersionTag: conf.UpgradeVersionTag,
}
// Check for cas value
params := req.URL.Query()
if _, ok := params["cas"]; ok {
casVal, err := strconv.ParseUint(params.Get("cas"), 10, 64)
if err != nil {
return nil, HTTPError{StatusCode: http.StatusBadRequest, Reason: fmt.Sprintf("Error parsing cas value: %v", err)}
}
args.Config.ModifyIndex = casVal
args.CAS = true
}
var reply bool
if err := s.agent.RPC(req.Context(), "Operator.AutopilotSetConfiguration", &args, &reply); err != nil {
return nil, err
}
// Only use the out value if this was a CAS
if !args.CAS {
return true, nil
}
return reply, nil
default:
return nil, MethodNotAllowedError{req.Method, []string{"GET", "PUT"}}
}
}
// OperatorServerHealth is used to get the health of the servers in the local DC
func (s *HTTPHandlers) OperatorServerHealth(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args structs.DCSpecificRequest
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
var reply structs.AutopilotHealthReply
if err := s.agent.RPC(req.Context(), "Operator.ServerHealth", &args, &reply); err != nil {
return nil, err
}
// Reply with status 429 if something is unhealthy
if !reply.Healthy {
resp.WriteHeader(http.StatusTooManyRequests)
}
out := &api.OperatorHealthReply{
Healthy: reply.Healthy,
FailureTolerance: reply.FailureTolerance,
}
for _, server := range reply.Servers {
out.Servers = append(out.Servers, api.ServerHealth{
ID: server.ID,
Name: server.Name,
Address: server.Address,
Version: server.Version,
Leader: server.Leader,
SerfStatus: server.SerfStatus.String(),
LastContact: api.NewReadableDuration(server.LastContact),
LastTerm: server.LastTerm,
LastIndex: server.LastIndex,
Healthy: server.Healthy,
Voter: server.Voter,
StableSince: server.StableSince.Round(time.Second).UTC(),
})
}
return out, nil
}
func (s *HTTPHandlers) OperatorAutopilotState(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var args structs.DCSpecificRequest
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
var reply autopilot.State
if err := s.agent.RPC(req.Context(), "Operator.AutopilotState", &args, &reply); err != nil {
return nil, err
}
out := autopilotToAPIState(&reply)
return out, nil
}
func (s *HTTPHandlers) OperatorUsage(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
metrics.IncrCounterWithLabels([]string{"client", "api", "operator_usage"}, 1,
s.nodeMetricsLabels())
var args structs.OperatorUsageRequest
if err := s.parseEntMetaNoWildcard(req, &args.EnterpriseMeta); err != nil {
return nil, err
}
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
if _, ok := req.URL.Query()["global"]; ok {
args.Global = true
}
// Make the RPC request
var out structs.Usage
defer setMeta(resp, &out.QueryMeta)
RETRY_ONCE:
err := s.agent.RPC(req.Context(), "Operator.Usage", &args, &out)
if err != nil {
metrics.IncrCounterWithLabels([]string{"client", "rpc", "error", "operator_usage"}, 1,
s.nodeMetricsLabels())
return nil, err
}
if args.QueryOptions.AllowStale && args.MaxStaleDuration > 0 && args.MaxStaleDuration < out.LastContact {
args.AllowStale = false
args.MaxStaleDuration = 0
goto RETRY_ONCE
}
out.ConsistencyLevel = args.QueryOptions.ConsistencyLevel()
metrics.IncrCounterWithLabels([]string{"client", "api", "success", "operator_usage"}, 1,
s.nodeMetricsLabels())
return out, nil
}
func stringIDs(ids []raft.ServerID) []string {
out := make([]string, len(ids))
for i, id := range ids {
out[i] = string(id)
}
return out
}
func autopilotToAPIState(state *autopilot.State) *api.AutopilotState {
out := &api.AutopilotState{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
Leader: string(state.Leader),
Voters: stringIDs(state.Voters),
Servers: make(map[string]api.AutopilotServer),
}
for id, srv := range state.Servers {
out.Servers[string(id)] = autopilotToAPIServer(srv)
}
autopilotToAPIStateEnterprise(state, out)
return out
}
func autopilotToAPIServer(srv *autopilot.ServerState) api.AutopilotServer {
apiSrv := api.AutopilotServer{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
NodeStatus: string(srv.Server.NodeStatus),
Version: srv.Server.Version,
LastContact: api.NewReadableDuration(srv.Stats.LastContact),
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
Status: api.AutopilotServerStatus(srv.State),
Meta: srv.Server.Meta,
NodeType: api.AutopilotServerType(srv.Server.NodeType),
}
autopilotToAPIServerEnterprise(srv, &apiSrv)
return apiSrv
}