feat(observability): alerting experimental feature (#801)

Co-authored-by: JamesPlayer <james.player@portainer.io>
pull/12608/merge
Steven Kang 2025-07-11 16:55:23 +12:00 committed by GitHub
parent b7e906701a
commit 96f2d69ae5
6 changed files with 98 additions and 28 deletions

View File

@ -776,6 +776,7 @@
"ImageCount": 9,
"IsPodman": false,
"NodeCount": 0,
"PerformanceMetrics": null,
"RunningContainerCount": 5,
"ServiceCount": 0,
"StackCount": 2,

View File

@ -215,26 +215,34 @@ type (
// DockerSnapshot represents a snapshot of a specific Docker environment (endpoint) at a specific time.
// Every field is serialized to JSON under the key given in its struct tag.
DockerSnapshot struct {
Time int64 `json:"Time"`
DockerVersion string `json:"DockerVersion"`
Swarm bool `json:"Swarm"`
TotalCPU int `json:"TotalCPU"`
TotalMemory int64 `json:"TotalMemory"`
ContainerCount int `json:"ContainerCount"`
RunningContainerCount int `json:"RunningContainerCount"`
StoppedContainerCount int `json:"StoppedContainerCount"`
HealthyContainerCount int `json:"HealthyContainerCount"`
UnhealthyContainerCount int `json:"UnhealthyContainerCount"`
VolumeCount int `json:"VolumeCount"`
ImageCount int `json:"ImageCount"`
ServiceCount int `json:"ServiceCount"`
StackCount int `json:"StackCount"`
// Serialized under the key "DockerSnapshotRaw", not the Go field name.
SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"`
NodeCount int `json:"NodeCount"`
GpuUseAll bool `json:"GpuUseAll"`
GpuUseList []string `json:"GpuUseList"`
IsPodman bool `json:"IsPodman"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
Time int64 `json:"Time"`
DockerVersion string `json:"DockerVersion"`
Swarm bool `json:"Swarm"`
TotalCPU int `json:"TotalCPU"`
TotalMemory int64 `json:"TotalMemory"`
ContainerCount int `json:"ContainerCount"`
RunningContainerCount int `json:"RunningContainerCount"`
StoppedContainerCount int `json:"StoppedContainerCount"`
HealthyContainerCount int `json:"HealthyContainerCount"`
UnhealthyContainerCount int `json:"UnhealthyContainerCount"`
VolumeCount int `json:"VolumeCount"`
ImageCount int `json:"ImageCount"`
ServiceCount int `json:"ServiceCount"`
StackCount int `json:"StackCount"`
// Serialized under the key "DockerSnapshotRaw", not the Go field name.
SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"`
NodeCount int `json:"NodeCount"`
GpuUseAll bool `json:"GpuUseAll"`
GpuUseList []string `json:"GpuUseList"`
IsPodman bool `json:"IsPodman"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
// PerformanceMetrics is a pointer without omitempty, so it is serialized
// as null when no metrics are attached to the snapshot.
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"`
}
// PerformanceMetrics represents the performance metrics of Docker, Swarm, Podman, and Kubernetes environments.
// All fields carry the omitempty JSON option, so a field whose value is 0 is
// left out of the serialized output.
PerformanceMetrics struct {
// CPUUsage is the CPU usage figure.
// NOTE(review): the Kubernetes snapshot code adds, per node, a rounded
// percentage of the node's CPU capacity — confirm other producers use the same unit.
CPUUsage float64 `json:"CPUUsage,omitempty"`
// MemoryUsage is the memory usage figure.
// NOTE(review): the Kubernetes snapshot code adds, per node, a rounded
// percentage of the node's memory capacity — confirm other producers use the same unit.
MemoryUsage float64 `json:"MemoryUsage,omitempty"`
// NetworkUsage is the network usage figure.
// NOTE(review): the Kubernetes snapshot code adds, per node, received plus
// transmitted bytes divided by 1024*1024 — confirm other producers use the same unit.
NetworkUsage float64 `json:"NetworkUsage,omitempty"`
}
// DockerContainerSnapshot is an extension of Docker's Container struct
@ -663,12 +671,13 @@ type (
// KubernetesSnapshot represents a snapshot of a specific Kubernetes environment (endpoint) at a specific time.
// Every field is serialized to JSON under the key given in its struct tag.
KubernetesSnapshot struct {
Time int64 `json:"Time"`
KubernetesVersion string `json:"KubernetesVersion"`
NodeCount int `json:"NodeCount"`
TotalCPU int64 `json:"TotalCPU"`
TotalMemory int64 `json:"TotalMemory"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
Time int64 `json:"Time"`
KubernetesVersion string `json:"KubernetesVersion"`
NodeCount int `json:"NodeCount"`
TotalCPU int64 `json:"TotalCPU"`
TotalMemory int64 `json:"TotalMemory"`
DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"`
// PerformanceMetrics is a pointer without omitempty, so it is serialized
// as null when no metrics are attached to the snapshot.
PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"`
}
// KubernetesConfiguration represents the configuration of a Kubernetes environment(endpoint)

1
go.mod
View File

@ -61,6 +61,7 @@ require (
k8s.io/cli-runtime v0.33.2
k8s.io/client-go v0.33.2
k8s.io/kubectl v0.33.2
k8s.io/kubelet v0.33.2
k8s.io/metrics v0.33.2
software.sslmate.com/src/go-pkcs12 v0.0.0-20210415151418-c5206de65a78
)

2
go.sum
View File

@ -974,6 +974,8 @@ k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUy
k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8=
k8s.io/kubectl v0.33.2 h1:7XKZ6DYCklu5MZQzJe+CkCjoGZwD1wWl7t/FxzhMz7Y=
k8s.io/kubectl v0.33.2/go.mod h1:8rC67FB8tVTYraovAGNi/idWIK90z2CHFNMmGJZJ3KI=
k8s.io/kubelet v0.33.2 h1:wxEau5/563oJb3j3KfrCKlNWWx35YlSgDLOYUBCQ0pg=
k8s.io/kubelet v0.33.2/go.mod h1:way8VCDTUMiX1HTOvJv7M3xS/xNysJI6qh7TOqMe5KM=
k8s.io/metrics v0.33.2 h1:gNCBmtnUMDMCRg9Ly5ehxP3OdKISMsOnh1vzk01iCgE=
k8s.io/metrics v0.33.2/go.mod h1:yxoAosKGRsZisv3BGekC5W6T1J8XSV+PoUEevACRv7c=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=

View File

@ -100,7 +100,10 @@ func dockerSnapshotNodes(snapshot *portainer.DockerSnapshot, cli *client.Client)
snapshot.TotalCPU = int(nanoCpus / 1e9)
snapshot.TotalMemory = totalMem
snapshot.NodeCount = len(nodes)
snapshot.NodeCount = 1
if snapshot.Swarm {
snapshot.NodeCount = len(nodes)
}
return nil
}

View File

@ -5,7 +5,9 @@ import (
"errors"
"fmt"
"io"
"math"
"os"
"reflect"
"strings"
"time"
@ -19,11 +21,11 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
)
func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) {
kubernetesSnapshot := &portainer.KubernetesSnapshot{}
err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
if err != nil {
log.Warn().Err(err).Msg("unable to snapshot cluster version")
@ -54,10 +56,28 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli *kubern
return err
}
if len(nodeList.Items) == 0 {
return nil
}
var totalCPUs, totalMemory int64
performanceMetrics := &portainer.PerformanceMetrics{
CPUUsage: 0,
MemoryUsage: 0,
NetworkUsage: 0,
}
for _, node := range nodeList.Items {
totalCPUs += node.Status.Capacity.Cpu().Value()
totalMemory += node.Status.Capacity.Memory().Value()
performanceMetrics, err = kubernetesSnapshotNodePerformanceMetrics(cli, node, performanceMetrics)
if err != nil {
return fmt.Errorf("failed to get node performance metrics: %w", err)
}
if performanceMetrics != nil {
snapshot.PerformanceMetrics = performanceMetrics
}
}
snapshot.TotalCPU = totalCPUs
@ -123,6 +143,40 @@ func kubernetesSnapshotPodErrorLogs(snapshot *portainer.KubernetesSnapshot, cli
return nil
}
// kubernetesSnapshotNodePerformanceMetrics fetches the kubelet stats summary of
// node through the API server proxy endpoint and adds that node's usage figures
// to performanceMetrics.
//
// CPU and memory usage are added as rounded percentages of the node's reported
// capacity. Network usage is added as received plus transmitted bytes divided
// by 1024*1024, rounded.
//
// The (possibly updated) accumulator is returned so the caller can keep feeding
// it into subsequent calls:
//   - a nil performanceMetrics is treated as an empty accumulator;
//   - a node whose summary carries no node stats leaves the accumulator
//     untouched instead of replacing it with nil, which would make the next
//     call dereference a nil pointer and discard earlier nodes' figures;
//   - a node reporting a non-positive CPU or memory capacity is skipped for that
//     metric, because dividing by zero yields +Inf/NaN, which cannot be
//     marshalled to JSON.
//
// A nil result is returned together with a non-nil error when the request
// fails or the response cannot be decoded.
func kubernetesSnapshotNodePerformanceMetrics(cli *kubernetes.Clientset, node corev1.Node, performanceMetrics *portainer.PerformanceMetrics) (*portainer.PerformanceMetrics, error) {
	if performanceMetrics == nil {
		performanceMetrics = &portainer.PerformanceMetrics{}
	}

	result := cli.RESTClient().Get().AbsPath(fmt.Sprintf("/api/v1/nodes/%s/proxy/stats/summary", node.Name)).Do(context.TODO())
	if result.Error() != nil {
		return nil, fmt.Errorf("failed to get node performance metrics: %w", result.Error())
	}

	raw, err := result.Raw()
	if err != nil {
		return nil, fmt.Errorf("failed to get node performance metrics: %w", err)
	}

	stats := statsapi.Summary{}
	if err := json.Unmarshal(raw, &stats); err != nil {
		return nil, fmt.Errorf("failed to unmarshal node performance metrics: %w", err)
	}

	nodeStats := stats.Node
	if reflect.DeepEqual(nodeStats, statsapi.NodeStats{}) {
		// Nothing reported for this node: keep what has been accumulated so far.
		return performanceMetrics, nil
	}

	if nodeStats.CPU != nil && nodeStats.CPU.UsageNanoCores != nil {
		// Capacity is in whole cores; usage is in nanocores.
		if cpuCapacity := node.Status.Capacity.Cpu().Value(); cpuCapacity > 0 {
			performanceMetrics.CPUUsage += math.Round(float64(*nodeStats.CPU.UsageNanoCores) / float64(cpuCapacity*1000000000) * 100)
		}
	}

	if nodeStats.Memory != nil && nodeStats.Memory.WorkingSetBytes != nil {
		if memoryCapacity := node.Status.Capacity.Memory().Value(); memoryCapacity > 0 {
			performanceMetrics.MemoryUsage += math.Round(float64(*nodeStats.Memory.WorkingSetBytes) / float64(memoryCapacity) * 100)
		}
	}

	if nodeStats.Network != nil && nodeStats.Network.RxBytes != nil && nodeStats.Network.TxBytes != nil {
		performanceMetrics.NetworkUsage += math.Round((float64(*nodeStats.Network.RxBytes) + float64(*nodeStats.Network.TxBytes)) / 1024 / 1024) // MB
	}

	return performanceMetrics, nil
}
// filterLogsByPattern filters the logs by the given patterns and returns a list of logs that match the patterns
// the logs are returned as a list of maps with the keys "timestamp" and "message"
func filterLogsByPattern(logBytes []byte, patterns []string) []map[string]string {