From 96f2d69ae548d61e37cbe09b1b44a64878f16e13 Mon Sep 17 00:00:00 2001 From: Steven Kang Date: Fri, 11 Jul 2025 16:55:23 +1200 Subject: [PATCH] feat(observability): alerting experimental feature (#801) Co-authored-by: JamesPlayer --- .../test_data/output_24_to_latest.json | 1 + api/portainer.go | 61 +++++++++++-------- go.mod | 1 + go.sum | 2 + pkg/snapshot/docker.go | 5 +- pkg/snapshot/kubernetes.go | 56 ++++++++++++++++- 6 files changed, 98 insertions(+), 28 deletions(-) diff --git a/api/datastore/test_data/output_24_to_latest.json b/api/datastore/test_data/output_24_to_latest.json index 41a1b49df..dc50e6788 100644 --- a/api/datastore/test_data/output_24_to_latest.json +++ b/api/datastore/test_data/output_24_to_latest.json @@ -776,6 +776,7 @@ "ImageCount": 9, "IsPodman": false, "NodeCount": 0, + "PerformanceMetrics": null, "RunningContainerCount": 5, "ServiceCount": 0, "StackCount": 2, diff --git a/api/portainer.go b/api/portainer.go index da15479ee..0dc235550 100644 --- a/api/portainer.go +++ b/api/portainer.go @@ -215,26 +215,34 @@ type ( // DockerSnapshot represents a snapshot of a specific Docker environment(endpoint) at a specific time DockerSnapshot struct { - Time int64 `json:"Time"` - DockerVersion string `json:"DockerVersion"` - Swarm bool `json:"Swarm"` - TotalCPU int `json:"TotalCPU"` - TotalMemory int64 `json:"TotalMemory"` - ContainerCount int `json:"ContainerCount"` - RunningContainerCount int `json:"RunningContainerCount"` - StoppedContainerCount int `json:"StoppedContainerCount"` - HealthyContainerCount int `json:"HealthyContainerCount"` - UnhealthyContainerCount int `json:"UnhealthyContainerCount"` - VolumeCount int `json:"VolumeCount"` - ImageCount int `json:"ImageCount"` - ServiceCount int `json:"ServiceCount"` - StackCount int `json:"StackCount"` - SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"` - NodeCount int `json:"NodeCount"` - GpuUseAll bool `json:"GpuUseAll"` - GpuUseList []string `json:"GpuUseList"` - IsPodman bool 
`json:"IsPodman"` - DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"` + Time int64 `json:"Time"` + DockerVersion string `json:"DockerVersion"` + Swarm bool `json:"Swarm"` + TotalCPU int `json:"TotalCPU"` + TotalMemory int64 `json:"TotalMemory"` + ContainerCount int `json:"ContainerCount"` + RunningContainerCount int `json:"RunningContainerCount"` + StoppedContainerCount int `json:"StoppedContainerCount"` + HealthyContainerCount int `json:"HealthyContainerCount"` + UnhealthyContainerCount int `json:"UnhealthyContainerCount"` + VolumeCount int `json:"VolumeCount"` + ImageCount int `json:"ImageCount"` + ServiceCount int `json:"ServiceCount"` + StackCount int `json:"StackCount"` + SnapshotRaw DockerSnapshotRaw `json:"DockerSnapshotRaw"` + NodeCount int `json:"NodeCount"` + GpuUseAll bool `json:"GpuUseAll"` + GpuUseList []string `json:"GpuUseList"` + IsPodman bool `json:"IsPodman"` + DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"` + PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"` + } + + // PerformanceMetrics represents the performance metrics of a Docker, Swarm, Podman, or Kubernetes environment + PerformanceMetrics struct { + CPUUsage float64 `json:"CPUUsage,omitempty"` + MemoryUsage float64 `json:"MemoryUsage,omitempty"` + NetworkUsage float64 `json:"NetworkUsage,omitempty"` } // DockerContainerSnapshot is an extent of Docker's Container struct @@ -663,12 +671,13 @@ type ( // KubernetesSnapshot represents a snapshot of a specific Kubernetes environment(endpoint) at a specific time KubernetesSnapshot struct { - Time int64 `json:"Time"` - KubernetesVersion string `json:"KubernetesVersion"` - NodeCount int `json:"NodeCount"` - TotalCPU int64 `json:"TotalCPU"` - TotalMemory int64 
`json:"TotalMemory"` + DiagnosticsData *DiagnosticsData `json:"DiagnosticsData"` + PerformanceMetrics *PerformanceMetrics `json:"PerformanceMetrics"` } // KubernetesConfiguration represents the configuration of a Kubernetes environment(endpoint) diff --git a/go.mod b/go.mod index f20d26f57..d589f19d9 100644 --- a/go.mod +++ b/go.mod @@ -61,6 +61,7 @@ require ( k8s.io/cli-runtime v0.33.2 k8s.io/client-go v0.33.2 k8s.io/kubectl v0.33.2 + k8s.io/kubelet v0.33.2 k8s.io/metrics v0.33.2 software.sslmate.com/src/go-pkcs12 v0.0.0-20210415151418-c5206de65a78 ) diff --git a/go.sum b/go.sum index d0f60eefc..ad8e48908 100644 --- a/go.sum +++ b/go.sum @@ -974,6 +974,8 @@ k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUy k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= k8s.io/kubectl v0.33.2 h1:7XKZ6DYCklu5MZQzJe+CkCjoGZwD1wWl7t/FxzhMz7Y= k8s.io/kubectl v0.33.2/go.mod h1:8rC67FB8tVTYraovAGNi/idWIK90z2CHFNMmGJZJ3KI= +k8s.io/kubelet v0.33.2 h1:wxEau5/563oJb3j3KfrCKlNWWx35YlSgDLOYUBCQ0pg= +k8s.io/kubelet v0.33.2/go.mod h1:way8VCDTUMiX1HTOvJv7M3xS/xNysJI6qh7TOqMe5KM= k8s.io/metrics v0.33.2 h1:gNCBmtnUMDMCRg9Ly5ehxP3OdKISMsOnh1vzk01iCgE= k8s.io/metrics v0.33.2/go.mod h1:yxoAosKGRsZisv3BGekC5W6T1J8XSV+PoUEevACRv7c= k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro= diff --git a/pkg/snapshot/docker.go b/pkg/snapshot/docker.go index deaabdb08..12f3a3089 100644 --- a/pkg/snapshot/docker.go +++ b/pkg/snapshot/docker.go @@ -100,7 +100,10 @@ func dockerSnapshotNodes(snapshot *portainer.DockerSnapshot, cli *client.Client) snapshot.TotalCPU = int(nanoCpus / 1e9) snapshot.TotalMemory = totalMem - snapshot.NodeCount = len(nodes) + snapshot.NodeCount = 1 + if snapshot.Swarm { + snapshot.NodeCount = len(nodes) + } return nil } diff --git a/pkg/snapshot/kubernetes.go b/pkg/snapshot/kubernetes.go index d8e550e84..77fda14bd 100644 --- a/pkg/snapshot/kubernetes.go 
+++ b/pkg/snapshot/kubernetes.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"math"
 	"os"
+	"reflect"
 	"strings"
 	"time"
 
@@ -19,11 +21,11 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
+	statsapi "k8s.io/kubelet/pkg/apis/stats/v1alpha1"
 )
 
 func CreateKubernetesSnapshot(cli *kubernetes.Clientset) (*portainer.KubernetesSnapshot, error) {
 	kubernetesSnapshot := &portainer.KubernetesSnapshot{}
-
 	err := kubernetesSnapshotVersion(kubernetesSnapshot, cli)
 	if err != nil {
 		log.Warn().Err(err).Msg("unable to snapshot cluster version")
@@ -54,10 +56,28 @@ func kubernetesSnapshotNodes(snapshot *portainer.KubernetesSnapshot, cli *kubern
 		return err
 	}
 
+	if len(nodeList.Items) == 0 {
+		return nil
+	}
+
 	var totalCPUs, totalMemory int64
+	performanceMetrics := &portainer.PerformanceMetrics{
+		CPUUsage:     0,
+		MemoryUsage:  0,
+		NetworkUsage: 0,
+	}
+
 	for _, node := range nodeList.Items {
 		totalCPUs += node.Status.Capacity.Cpu().Value()
 		totalMemory += node.Status.Capacity.Memory().Value()
+
+		performanceMetrics, err = kubernetesSnapshotNodePerformanceMetrics(cli, node, performanceMetrics)
+		if err != nil {
+			return fmt.Errorf("failed to get node performance metrics: %w", err)
+		}
+		if performanceMetrics != nil {
+			snapshot.PerformanceMetrics = performanceMetrics
+		}
 	}
 
 	snapshot.TotalCPU = totalCPUs
@@ -123,6 +143,62 @@ func kubernetesSnapshotPodErrorLogs(snapshot *portainer.KubernetesSnapshot, cli
 	return nil
 }
 
+// kubernetesSnapshotNodePerformanceMetrics reads the kubelet stats summary of node through the
+// API server proxy path /api/v1/nodes/<name>/proxy/stats/summary and adds the node's figures to
+// performanceMetrics: CPU usage and memory working set as rounded percentages of the node's
+// capacity, and received plus transmitted network bytes as rounded MB (bytes / 1024 / 1024).
+//
+// It returns the updated accumulator, (nil, nil) when the summary holds no node stats at all, or
+// a wrapped error when the request or the decoding fails. A nil performanceMetrics is treated as
+// an empty accumulator, so a caller that kept an earlier nil result does not hit a nil dereference.
+//
+// NOTE(review): percentages are added once per call, so a caller looping over several nodes gets
+// a sum rather than a cluster-wide ratio - confirm this is what consumers expect.
+func kubernetesSnapshotNodePerformanceMetrics(cli *kubernetes.Clientset, node corev1.Node, performanceMetrics *portainer.PerformanceMetrics) (*portainer.PerformanceMetrics, error) {
+	result := cli.RESTClient().Get().AbsPath(fmt.Sprintf("/api/v1/nodes/%s/proxy/stats/summary", node.Name)).Do(context.TODO())
+	if result.Error() != nil {
+		return nil, fmt.Errorf("failed to get node performance metrics: %w", result.Error())
+	}
+
+	raw, err := result.Raw()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get node performance metrics: %w", err)
+	}
+
+	stats := statsapi.Summary{}
+	err = json.Unmarshal(raw, &stats)
+	if err != nil {
+		return nil, fmt.Errorf("failed to unmarshal node performance metrics: %w", err)
+	}
+
+	nodeStats := stats.Node
+	if reflect.DeepEqual(nodeStats, statsapi.NodeStats{}) {
+		return nil, nil
+	}
+
+	if performanceMetrics == nil {
+		performanceMetrics = &portainer.PerformanceMetrics{}
+	}
+
+	// A zero capacity would turn the division into +Inf or NaN, which is not representable in
+	// JSON, so each percentage is only computed against a positive capacity.
+	cpuCapacity := node.Status.Capacity.Cpu().Value()
+	if cpuCapacity > 0 && nodeStats.CPU != nil && nodeStats.CPU.UsageNanoCores != nil {
+		performanceMetrics.CPUUsage += math.Round(float64(*nodeStats.CPU.UsageNanoCores) / float64(cpuCapacity*1000000000) * 100)
+	}
+
+	memoryCapacity := node.Status.Capacity.Memory().Value()
+	if memoryCapacity > 0 && nodeStats.Memory != nil && nodeStats.Memory.WorkingSetBytes != nil {
+		performanceMetrics.MemoryUsage += math.Round(float64(*nodeStats.Memory.WorkingSetBytes) / float64(memoryCapacity) * 100)
+	}
+
+	if nodeStats.Network != nil && nodeStats.Network.RxBytes != nil && nodeStats.Network.TxBytes != nil {
+		performanceMetrics.NetworkUsage += math.Round((float64(*nodeStats.Network.RxBytes) + float64(*nodeStats.Network.TxBytes)) / 1024 / 1024) // MB
+	}
+
+	return performanceMetrics, nil
+}
+
 // filterLogsByPattern filters the logs by the given patterns and returns a list of logs that match the patterns
 // the logs are returned as a list of maps with the keys "timestamp" and "message"
 func filterLogsByPattern(logBytes []byte, patterns []string) []map[string]string {