mirror of https://github.com/portainer/portainer
feat(observability): alerting experimental feature (#801)
Co-authored-by: JamesPlayer <james.player@portainer.io>pull/12608/merge
parent
b7e906701a
commit
96f2d69ae5
|
@ -776,6 +776,7 @@
|
|||
"ImageCount": 9,
|
||||
"IsPodman": false,
|
||||
"NodeCount": 0,
|
||||
"PerformanceMetrics": null,
|
||||
"RunningContainerCount": 5,
|
||||
"ServiceCount": 0,
|
||||
"StackCount": 2,
|
||||
|
|
|
@ -215,26 +215,34 @@ type (
|
|||
|
||||
// DockerSnapshot represents a snapshot of a specific Docker environment(endpoint) at a specific time
|
||||
DockerSnapshot struct {
|
||||
Time int64 `json:"Time"`
|
||||
DockerVersion string `json:"DockerVersion"`
|
||||
Swarm bool `json:"Swarm"`
|
||||
TotalCPU int `json:"TotalCPU"`
|
||||
TotalMemory int64 `json:"TotalMemory"`
|
||||
ContainerCount int `json:"ContainerCount"`
|
||||
RunningContainerCount int `json:"RunningContainerCount"`
|
||||
StoppedContainerCount int `json:"StoppedContainerCount"`
|
||||
HealthyContainerCount int `json:"HealthyContainerCount"`
|
||||
UnhealthyContainerCount int `json:"UnhealthyContainerCount"`
|
||||
VolumeCount int `json:"VolumeCount"`
|
||||
ImageCount int `json:"ImageCount"`
|
||||
ServiceCount int `json:"ServiceCount"`
|
||||
StackCount int `json:"StackCount"`
|
||||
SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"`
|
||||
NodeCount int `json:"NodeCount"`
|
||||
GpuUseAll bool `json:"GpuUseAll"`
|
||||
GpuUseList []string `json:"GpuUseList"`
|
||||
IsPodman bool `json:"IsPodman"`
|
||||
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
|
||||
Time int64 `json:"Time"`
|
||||
DockerVersion string `json:"DockerVersion"`
|
||||
Swarm bool `json:"Swarm"`
|
||||
TotalCPU int `json:"TotalCPU"`
|
||||
TotalMemory int64 `json:"TotalMemory"`
|
||||
ContainerCount int `json:"ContainerCount"`
|
||||
RunningContainerCount int `json:"RunningContainerCount"`
|
||||
StoppedContainerCount int `json:"StoppedContainerCount"`
|
||||
HealthyContainerCount int `json:"HealthyContainerCount"`
|
||||
UnhealthyContainerCount int `json:"UnhealthyContainerCount"`
|
||||
VolumeCount int `json:"VolumeCount"`
|
||||
ImageCount int `json:"ImageCount"`
|
||||
ServiceCount int `json:"ServiceCount"`
|
||||
StackCount int `json:"StackCount"`
|
||||
SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"`
|
||||
NodeCount int `json:"NodeCount"`
|
||||
GpuUseAll bool `json:"GpuUseAll"`
|
||||
GpuUseList []string `json:"GpuUseList"`
|
||||
IsPodman bool `json:"IsPodman"`
|
||||
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
|
||||
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"`
|
||||
}
|
||||
|
||||
// PerformanceMetrics holds aggregate resource-usage figures for a Docker, Swarm, Podman, or Kubernetes environment.
// Every field is tagged omitempty, so a field whose value is zero is left out of the JSON encoding.
|
||||
PerformanceMetrics struct {
|
||||
// CPUUsage is a CPU usage percentage. The Kubernetes node snapshot adds, for each node, the rounded
// ratio of used nano-cores to the node's CPU capacity times 100.
// NOTE(review): that makes the value a sum over nodes rather than an average — confirm consumers expect this.
CPUUsage float64 `json:"CPUUsage,omitempty"`
|
||||
// MemoryUsage is a memory usage percentage. The Kubernetes node snapshot adds, for each node, the rounded
// ratio of working-set bytes to the node's memory capacity times 100 (summed over nodes in the same way).
MemoryUsage float64 `json:"MemoryUsage,omitempty"`
|
||||
// NetworkUsage is network traffic in MB. The Kubernetes node snapshot adds, for each node, the rounded
// received plus transmitted byte counters divided by 1024 twice.
NetworkUsage float64 `json:"NetworkUsage,omitempty"`
|
||||
}
|
||||
|
||||
// DockerContainerSnapshot is an extension of Docker's Container struct
|
||||
|
@ -663,12 +671,13 @@ type (
|
|||
|
||||
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time
|
||||
KubernetesSnapshot struct {
|
||||
Time int64 `json:"Time"`
|
||||
KubernetesVersion string `json:"KubernetesVersion"`
|
||||
NodeCount int `json:"NodeCount"`
|
||||
TotalCPU int64 `json:"TotalCPU"`
|
||||
TotalMemory int64 `json:"TotalMemory"`
|
||||
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
|
||||
Time int64 `json:"Time"`
|
||||
KubernetesVersion string `json:"KubernetesVersion"`
|
||||
NodeCount int `json:"NodeCount"`
|
||||
TotalCPU int64 `json:"TotalCPU"`
|
||||
TotalMemory int64 `json:"TotalMemory"`
|
||||
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
|
||||
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"`
|
||||
}
|
||||
|
||||
// KubernetesConfiguration represents the configuration of a Kubernetes environment(endpoint)
|
||||
|
|
1
go.mod
1
go.mod
|
@ -61,6 +61,7 @@ require (
|
|||
k8s.io/cli-runtime v0.33.2
|
||||
k8s.io/client-go v0.33.2
|
||||
k8s.io/kubectl v0.33.2
|
||||
k8s.io/kubelet v0.33.2
|
||||
k8s.io/metrics v0.33.2
|
||||
software.sslmate.com/src/go-pkcs12 v0.0.0-20210415151418-c5206de65a78
|
||||
)
|
||||
|
|
2
go.sum
2
go.sum
|
@ -974,6 +974,8 @@ k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUy
|
|||
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8=
|
||||
k8s.io/kubectl v0.33.2 h1:7XKZ6DYCklu5MZQzJe+CkCjoGZwD1wWl7t/FxzhMz7Y=
|
||||
k8s.io/kubectl v0.33.2/go.mod h1:8rC67FB8tVTYraovAGNi/idWIK90z2CHFNMmGJZJ3KI=
|
||||
k8s.io/kubelet v0.33.2 h1:wxEau5/563oJb3j3KfrCKlNWWx35YlSgDLOYUBCQ0pg=
|
||||
k8s.io/kubelet v0.33.2/go.mod h1:way8VCDTUMiX1HTOvJv7M3xS/xNysJI6qh7TOqMe5KM=
|
||||
k8s.io/metrics v0.33.2 h1:gNCBmtnUMDMCRg9Ly5ehxP3OdKISMsOnh1vzk01iCgE=
|
||||
k8s.io/metrics v0.33.2/go.mod h1:yxoAosKGRsZisv3BGekC5W6T1J8XSV+PoUEevACRv7c=
|
||||
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=
|
||||
|
|
|
@ -100,7 +100,10 @@ func dockerSnapshotNodes(snapshot *portainer.DockerSnapshot, cli *client.Client)
|
|||
|
||||
snapshot.TotalCPU = int(nanoCpus / 1e9)
|
||||
snapshot.TotalMemory = totalMem
|
||||
snapshot.NodeCount = len(nodes)
|
||||
snapshot.NodeCount = 1
|
||||
if snapshot.Swarm {
|
||||
snapshot.NodeCount = len(nodes)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -5,7 +5,9 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"os"
|
||||
"reflect"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
@ -19,11 +21,11 @@ import (
|
|||
corev1 "k8s.io/api/core/v1"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/client-go/kubernetes"
|
||||
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
|
||||
)
|
||||
|
||||
func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) {
|
||||
kubernetesSnapshot := &portainer.KubernetesSnapshot{}
|
||||
|
||||
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("unable to snapshot cluster version")
|
||||
|
@ -54,10 +56,28 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli *kubern
|
|||
return err
|
||||
}
|
||||
|
||||
if len(nodeList.Items) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var totalCPUs, totalMemory int64
|
||||
performanceMetrics := &portainer.PerformanceMetrics{
|
||||
CPUUsage: 0,
|
||||
MemoryUsage: 0,
|
||||
NetworkUsage: 0,
|
||||
}
|
||||
|
||||
for _, node := range nodeList.Items {
|
||||
totalCPUs += node.Status.Capacity.Cpu().Value()
|
||||
totalMemory += node.Status.Capacity.Memory().Value()
|
||||
|
||||
performanceMetrics, err = kubernetesSnapshotNodePerformanceMetrics(cli, node, performanceMetrics)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get node performance metrics: %w", err)
|
||||
}
|
||||
if performanceMetrics != nil {
|
||||
snapshot.PerformanceMetrics = performanceMetrics
|
||||
}
|
||||
}
|
||||
|
||||
snapshot.TotalCPU = totalCPUs
|
||||
|
@ -123,6 +143,40 @@ func kubernetesSnapshotPodErrorLogs(snapshot *portainer.KubernetesSnapshot, cli
|
|||
return nil
|
||||
}
|
||||
|
||||
func kubernetesSnapshotNodePerformanceMetrics(cli *kubernetes.Clientset, node corev1.Node, performanceMetrics *portainer.PerformanceMetrics) (*portainer.PerformanceMetrics, error) {
|
||||
result := cli.RESTClient().Get().AbsPath(fmt.Sprintf("/api/v1/nodes/%s/proxy/stats/summary", node.Name)).Do(context.TODO())
|
||||
if result.Error() != nil {
|
||||
return nil, fmt.Errorf("failed to get node performance metrics: %w", result.Error())
|
||||
}
|
||||
|
||||
raw, err := result.Raw()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get node performance metrics: %w", err)
|
||||
}
|
||||
|
||||
stats := statsapi.Summary{}
|
||||
err = json.Unmarshal(raw, &stats)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to unmarshal node performance metrics: %w", err)
|
||||
}
|
||||
|
||||
nodeStats := stats.Node
|
||||
if reflect.DeepEqual(nodeStats, statsapi.NodeStats{}) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if nodeStats.CPU != nil && nodeStats.CPU.UsageNanoCores != nil {
|
||||
performanceMetrics.CPUUsage += math.Round(float64(*nodeStats.CPU.UsageNanoCores) / float64(node.Status.Capacity.Cpu().Value()*1000000000) * 100)
|
||||
}
|
||||
if nodeStats.Memory != nil && nodeStats.Memory.WorkingSetBytes != nil {
|
||||
performanceMetrics.MemoryUsage += math.Round(float64(*nodeStats.Memory.WorkingSetBytes) / float64(node.Status.Capacity.Memory().Value()) * 100)
|
||||
}
|
||||
if nodeStats.Network != nil && nodeStats.Network.RxBytes != nil && nodeStats.Network.TxBytes != nil {
|
||||
performanceMetrics.NetworkUsage += math.Round((float64(*nodeStats.Network.RxBytes) + float64(*nodeStats.Network.TxBytes)) / 1024 / 1024) // MB
|
||||
}
|
||||
return performanceMetrics, nil
|
||||
}
|
||||
|
||||
// filterLogsByPattern filters the logs by the given patterns and returns a list of logs that match the patterns
|
||||
// the logs are returned as a list of maps with the keys "timestamp" and "message"
|
||||
func filterLogsByPattern(logBytes []byte, patterns []string) []map[string]string {
|
||||
|
|
Loading…
Reference in New Issue