Move RaftStats to Status endpoint

pull/2788/head
Kyle Havlovitz 2017-03-07 13:58:06 -08:00
parent fb259e3d04
commit c3d638e2c5
No known key found for this signature in database
GPG Key ID: 8A5E6B173056AD6C
7 changed files with 131 additions and 99 deletions

View File

@ -242,5 +242,10 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
return nil, err return nil, err
} }
// Reply with status 429 if something is unhealthy
if !reply.Healthy {
resp.WriteHeader(http.StatusTooManyRequests)
}
return reply, nil return reply, nil
} }

View File

@ -8,8 +8,8 @@ import (
"strings" "strings"
"testing" "testing"
"github.com/hashicorp/consul-enterprise/testutil"
"github.com/hashicorp/consul/consul/structs" "github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/serf" "github.com/hashicorp/serf/serf"
) )

View File

@ -92,27 +92,30 @@ func (s *Server) pruneDeadServers() error {
return err return err
} }
// Look for dead servers to clean up // Find any failed servers
var failed []string
if autopilotConf.CleanupDeadServers {
for _, member := range s.serfLAN.Members() {
valid, _ := agent.IsConsulServer(member)
if valid && member.Status == serf.StatusFailed {
failed = append(failed, member.Name)
}
}
}
peers, err := s.numPeers() peers, err := s.numPeers()
if err != nil { if err != nil {
return err return err
} }
removed := 0 // Only do removals if a minority of servers will be affected
if autopilotConf.CleanupDeadServers { if len(failed) <= peers/2 {
for _, member := range s.serfLAN.Members() { for _, server := range failed {
// Exit early if we already removed the max amount of servers s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", server)
if removed == peers/2 { go s.serfLAN.RemoveFailedNode(server)
break
}
valid, _ := agent.IsConsulServer(member)
if valid && member.Status == serf.StatusFailed {
removed++
s.logger.Printf("[INFO] consul: Attempting removal of failed server: %v", member.Name)
go s.serfLAN.RemoveFailedNode(member.Name)
}
} }
} else {
s.logger.Printf("[ERR] consul: Failed to remove dead servers: too many dead servers: %d/%d", len(failed), peers)
} }
return nil return nil
@ -125,7 +128,11 @@ func (s *Server) promoteNonVoters(autopilotConf *structs.AutopilotConfig) error
return fmt.Errorf("error getting server raft protocol versions: %s", err) return fmt.Errorf("error getting server raft protocol versions: %s", err)
} }
if minRaftProtocol >= 3 { // If we don't meet the minimum version for non-voter features, bail early
if minRaftProtocol < 3 {
return nil
}
future := s.raft.GetConfiguration() future := s.raft.GetConfiguration()
if err := future.Error(); err != nil { if err := future.Error(); err != nil {
return fmt.Errorf("failed to get raft configuration: %v", err) return fmt.Errorf("failed to get raft configuration: %v", err)
@ -178,9 +185,9 @@ func (s *Server) promoteNonVoters(autopilotConf *structs.AutopilotConfig) error
// If we added a new server, trigger a check to remove dead servers // If we added a new server, trigger a check to remove dead servers
if newServers { if newServers {
go func() { select {
s.autopilotRemoveDeadCh <- struct{}{} case s.autopilotRemoveDeadCh <- struct{}{}:
}() default:
} }
} }
@ -190,47 +197,35 @@ func (s *Server) promoteNonVoters(autopilotConf *structs.AutopilotConfig) error
// queryServerHealth fetches the raft stats for the given server and uses them // queryServerHealth fetches the raft stats for the given server and uses them
// to update its ServerHealth // to update its ServerHealth
func (s *Server) queryServerHealth(member serf.Member, server *agent.Server, autopilotConf *structs.AutopilotConfig) *structs.ServerHealth { func (s *Server) queryServerHealth(member serf.Member, server *agent.Server, autopilotConf *structs.AutopilotConfig) *structs.ServerHealth {
stats, err := s.getServerStats(server)
if err != nil {
s.logger.Printf("[DEBUG] consul: error getting server's raft stats: %s", err)
}
health := &structs.ServerHealth{ health := &structs.ServerHealth{
ID: server.ID, ID: server.ID,
Name: server.Name, Name: server.Name,
SerfStatusRaw: member.Status, SerfStatusRaw: member.Status,
SerfStatus: member.Status.String(), SerfStatus: member.Status.String(),
LastContactRaw: -1, LastContactRaw: -1,
LastContact: "never", LastContact: stats.LastContact,
LastTerm: stats.LastTerm,
LastIndex: stats.LastIndex,
} }
stats, err := s.getServerStats(server) if health.LastContact != "never" {
if err != nil { health.LastContactRaw, err = time.ParseDuration(health.LastContact)
s.logger.Printf("[DEBUG] consul: error getting server's raft stats: %s", err)
}
if v, ok := stats["last_contact"]; ok && v != "never" {
health.LastContactRaw, err = time.ParseDuration(v)
if err != nil { if err != nil {
s.logger.Printf("[DEBUG] consul: error parsing server's last_contact value: %s", err) s.logger.Printf("[DEBUG] consul: error parsing server's last_contact value: %s", err)
} }
health.LastContact = health.LastContactRaw.String()
} }
// Set LastContact to 0 if we're the leader
// Set LastContact to 0 for the leader
if s.config.NodeName == member.Name { if s.config.NodeName == member.Name {
health.LastContactRaw = 0 health.LastContactRaw = 0
health.LastContact = "leader" health.LastContact = "leader"
} }
if v, ok := stats["last_log_index"]; ok {
health.LastIndex, err = strconv.ParseUint(v, 10, 64)
if err != nil {
s.logger.Printf("[DEBUG] consul: error parsing server's last_log_index value: %s", err)
}
}
if v, ok := stats["last_log_term"]; ok {
health.LastTerm, err = strconv.ParseUint(v, 10, 64)
if err != nil {
s.logger.Printf("[DEBUG] consul: error parsing server's last_log_term value: %s", err)
}
}
health.Healthy = s.isServerHealthy(health, autopilotConf) health.Healthy = s.isServerHealthy(health, autopilotConf)
// If this is a new server or the health changed, reset StableSince // If this is a new server or the health changed, reset StableSince
@ -254,10 +249,10 @@ func (s *Server) getServerHealth(addr string) *structs.ServerHealth {
return h return h
} }
func (s *Server) getServerStats(server *agent.Server) (map[string]string, error) { func (s *Server) getServerStats(server *agent.Server) (structs.ServerStats, error) {
var args struct{} var args struct{}
var reply map[string]string var reply structs.ServerStats
err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version, "Operator.RaftStats", &args, &reply) err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version, "Status.RaftStats", &args, &reply)
return reply, err return reply, err
} }

View File

@ -624,9 +624,10 @@ func (s *Server) joinConsulServer(m serf.Member, parts *agent.Server) error {
} }
// Trigger a check to remove dead servers // Trigger a check to remove dead servers
go func() { select {
s.autopilotRemoveDeadCh <- struct{}{} case s.autopilotRemoveDeadCh <- struct{}{}:
}() default:
}
return nil return nil
} }

View File

@ -185,12 +185,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe
return nil return nil
} }
// RaftStats copies the local server's raft stats map into reply. It is used
// by Autopilot to query the raft stats of the local server.
func (op *Operator) RaftStats(args struct{}, reply *map[string]string) error {
	localStats := op.srv.raft.Stats()
	*reply = localStats
	return nil
}
// ServerHealth is used to get the current health of the servers. // ServerHealth is used to get the current health of the servers.
func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs.OperatorHealthReply) error { func (op *Operator) ServerHealth(args *structs.DCSpecificRequest, reply *structs.OperatorHealthReply) error {
// This must be sent to the leader, so we fix the args since we are // This must be sent to the leader, so we fix the args since we are

View File

@ -1,5 +1,12 @@
package consul package consul
import (
"fmt"
"strconv"
"github.com/hashicorp/consul/consul/structs"
)
// Status endpoint is used to check on server status // Status endpoint is used to check on server status
type Status struct { type Status struct {
server *Server server *Server
@ -33,3 +40,21 @@ func (s *Status) Peers(args struct{}, reply *[]string) error {
} }
return nil return nil
} }
// RaftStats fills reply with selected values from the local server's raft
// stats. It is used by Autopilot to query the raft stats of the local server.
//
// The "last_contact" entry is copied through as a string, while the
// "last_log_index" and "last_log_term" entries are parsed as base-10 unsigned
// 64-bit integers. A parse failure is returned as an error naming the
// offending entry.
func (s *Status) RaftStats(args struct{}, reply *structs.ServerStats) error {
	raftStats := s.server.raft.Stats()

	// Passed through verbatim; no parsing is done on this value here.
	reply.LastContact = raftStats["last_contact"]

	var err error
	if reply.LastIndex, err = strconv.ParseUint(raftStats["last_log_index"], 10, 64); err != nil {
		return fmt.Errorf("error parsing server's last_log_index value: %s", err)
	}
	if reply.LastTerm, err = strconv.ParseUint(raftStats["last_log_term"], 10, 64); err != nil {
		return fmt.Errorf("error parsing server's last_log_term value: %s", err)
	}
	return nil
}

View File

@ -132,6 +132,18 @@ type ServerHealth struct {
StableSince time.Time StableSince time.Time
} }
// ServerStats holds miscellaneous Raft metrics for a server.
type ServerStats struct {
	// LastContact is how long it has been since this node last had contact
	// with the leader. It is kept as a string rather than a duration.
	LastContact string

	// LastTerm is the highest leader term that this server has recorded in
	// its Raft log.
	LastTerm uint64

	// LastIndex is the last log index that this server has recorded in its
	// Raft log.
	LastIndex uint64
}
// OperatorHealthReply is a representation of the overall health of the cluster // OperatorHealthReply is a representation of the overall health of the cluster
type OperatorHealthReply struct { type OperatorHealthReply struct {
// Healthy is true if all the servers in the cluster are healthy. // Healthy is true if all the servers in the cluster are healthy.