From f52314ea1f32153b263f1592c7a7e9b4d8952f45 Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Mon, 30 Oct 2023 10:53:30 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E8=AF=A6=E6=83=85=E7=9B=91?= =?UTF-8?q?=E6=8E=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: f584527e2f8b081846c6aa88798e50b26bb10328 --- api/desc/core/pcm-core.api | 10 +- api/desc/pcm.api | 2 +- api/internal/config/config.go | 10 - .../handler/core/scheduletaskbyyamlhandler.go | 22 +- api/internal/handler/routes.go | 2 +- .../logic/core/scheduletaskbyyamllogic.go | 8 +- api/internal/logic/core/taskdetaillogic.go | 101 +++- api/internal/types/types.go | 8 +- go.mod | 6 +- go.sum | 4 +- pkg/tracker/interface.go | 18 +- pkg/tracker/promql.go | 486 ++++++++++++++++++ pkg/tracker/queryoptions.go | 318 ++++++++++++ pkg/tracker/tracker.go | 86 ++++ pkg/tracker/tracker_test.go | 17 + pkg/tracker/types.go | 174 +++++++ 16 files changed, 1222 insertions(+), 50 deletions(-) create mode 100644 pkg/tracker/promql.go create mode 100644 pkg/tracker/queryoptions.go create mode 100644 pkg/tracker/tracker_test.go create mode 100644 pkg/tracker/types.go diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 977b0605..44f0143b 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -176,11 +176,8 @@ type deleteTaskReq { type ( scheduleTaskByYamlReq { - Name string `yaml:"name"` - synergy string `yaml:"synergy"` Description string `yaml:"description"` - strategy string `yaml:"strategy"` tenantId int64 `yaml:"tenantId"` tasks []TaskYaml `yaml:"tasks"` } @@ -260,7 +257,12 @@ type ( TaskId int64 `path:"taskId"` } taskDetailResp { - + CpuCores float64 `json:"cpuCores"` + CpuRate float64 `json:"cpuRate"` + CpuLimit float64 `json:"cpuLimit"` + MemoryTotal float64 `json:"memoryTotal"` + MemoryRate float64 `json:"memoryRate"` + MemoryLimit float64 `json:"memoryLimit"` } ) type ( diff --git a/api/desc/pcm.api b/api/desc/pcm.api index d879a7b1..32e6b633 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -42,7 +42,7 @@ service pcm { // 任务详情接口 @handler TaskDetailHandler - get /core/taskDetail (taskDetailReq) returns (taskDetailResp) + get /core/taskDetail/:taskId (taskDetailReq) returns (taskDetailResp) @handler JobTotalHandler get /core/jobTotal returns (jobTotalResp) diff --git a/api/internal/config/config.go b/api/internal/config/config.go index 6ec313a9..9897af56 100644 --- a/api/internal/config/config.go +++ b/api/internal/config/config.go @@ -1,7 +1,6 @@ package config import ( - "github.com/zeromicro/go-queue/kq" "github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/stores/cache" "github.com/zeromicro/go-zero/core/stores/redis" @@ -11,21 +10,12 @@ import ( type Config struct { rest.RestConf - KqProducerConf struct { - Brokers []string - HpcTopic string - CloudTopic string - AiTopic string - } DB struct { DataSource string } Redis redis.RedisConf Cache cache.CacheConf LogConf logx.LogConf - HpcConsumerConf kq.KqConf - CloudConsumerConf kq.KqConf - AiConsumerConf kq.KqConf K8sNativeConf zrpc.RpcClientConf ACRpcConf zrpc.RpcClientConf THRpcConf zrpc.RpcClientConf diff --git a/api/internal/handler/core/scheduletaskbyyamlhandler.go b/api/internal/handler/core/scheduletaskbyyamlhandler.go index 46c8b32a..c1549eb4 100644 --- a/api/internal/handler/core/scheduletaskbyyamlhandler.go +++ b/api/internal/handler/core/scheduletaskbyyamlhandler.go @@ -1,6 +1,8 @@ package core import ( + 
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" + "gitlink.org.cn/jcce-pcm/utils/result" "net/http" "github.com/zeromicro/go-zero/rest/httpx" @@ -13,16 +15,22 @@ func ScheduleTaskByYamlHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { var req types.ScheduleTaskByYamlReq if err := httpx.Parse(r, &req); err != nil { - httpx.ErrorCtx(r.Context(), w, err) + result.HttpResult(r, w, nil, err) + return + } + // 解析yaml文件 + _, fileHeader, err := r.FormFile("file") + if err != nil { + result.HttpResult(r, w, nil, err) + return + } + err = utils.Yaml2struct(fileHeader, &req) + if err != nil { + result.HttpResult(r, w, nil, err) return } - l := core.NewScheduleTaskByYamlLogic(r.Context(), svcCtx) resp, err := l.ScheduleTaskByYaml(&req) - if err != nil { - httpx.ErrorCtx(r.Context(), w, err) - } else { - httpx.OkJsonCtx(r.Context(), w, resp) - } + result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index cba46acc..b1c66419 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -47,7 +47,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodGet, - Path: "/core/taskDetail", + Path: "/core/taskDetail/:taskId", Handler: core.TaskDetailHandler(serverCtx), }, { diff --git a/api/internal/logic/core/scheduletaskbyyamllogic.go b/api/internal/logic/core/scheduletaskbyyamllogic.go index c67b30a5..4459682c 100644 --- a/api/internal/logic/core/scheduletaskbyyamllogic.go +++ b/api/internal/logic/core/scheduletaskbyyamllogic.go @@ -2,9 +2,9 @@ package core import ( "context" - "encoding/json" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/constants" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" + "k8s.io/apimachinery/pkg/util/json" "time" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" @@ -28,6 +28,7 @@ func NewScheduleTaskByYamlLogic(ctx context.Context, svcCtx *svc.ServiceContext) } func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYamlReq) (resp *types.ScheduleTaskByYamlResp, err error) { + resp = &types.ScheduleTaskByYamlResp{} bytes, err := json.Marshal(req) if err != nil { return nil, err @@ -55,7 +56,10 @@ func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYa logx.Error(err) return nil, err } - l.svcCtx.RedisClient.Publish(context.Background(), task.TaskType, reqMessage) + publish := l.svcCtx.RedisClient.Publish(context.Background(), task.TaskType, reqMessage) + if publish.Err() != nil { + return nil, publish.Err() + } } resp.TaskId = taskModel.Id return resp, nil diff --git a/api/internal/logic/core/taskdetaillogic.go b/api/internal/logic/core/taskdetaillogic.go index 49fca25a..2d8ac35f 100644 --- a/api/internal/logic/core/taskdetaillogic.go +++ b/api/internal/logic/core/taskdetaillogic.go @@ -2,6 +2,10 @@ package core import ( "context" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/tracker" + "gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes/kubernetesclient" + "time" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/types" @@ -24,7 +28,100 @@ func NewTaskDetailLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskDe } func (l *TaskDetailLogic) TaskDetail(req *types.TaskDetailReq) (resp *types.TaskDetailResp, err error) { - // todo: add your logic here and delete this line + resp = &types.TaskDetailResp{} + var 
clouds []models.Cloud
+	l.svcCtx.DbEngin.Where("task_id = ?", req.TaskId).Find(&clouds)
+	for _, cloud := range clouds {
+		// look up the monitoring (Prometheus) endpoint of the participant cluster
+		var metricsUrl string
+		l.svcCtx.DbEngin.Raw(" SELECT metrics_url FROM `sc_participant_phy_info` WHERE id = ? ", cloud.ParticipantId).Scan(&metricsUrl)
 
-	return
+		// collect the pods that belong to this workload
+		var pods []*kubernetesclient.Pod
+		switch cloud.Kind {
+		case "Job":
+			jobResult, err := l.svcCtx.K8sRpc[cloud.ParticipantId].JobDetail(context.Background(), &kubernetesclient.JobDetailReq{
+				Namespace: cloud.Namespace,
+				Name:      cloud.Name,
+			})
+			if err != nil {
+				return nil, err
+			}
+			// list the pods associated with the job via its controller-uid label
+			uid := jobResult.Job.Metadata.Labels["controller-uid"]
+			LabelSelector := "controller-uid=" + uid
+			podResp, err := l.svcCtx.K8sRpc[cloud.ParticipantId].PodList(context.Background(), &kubernetesclient.PodListReq{
+				ListOptions: &kubernetesclient.ListOptions{
+					LabelSelector: &LabelSelector,
+				},
+			})
+			if err != nil {
+				return nil, err
+			}
+			pods = podResp.PodList.Items
+
+		case "Deployment":
+			deploymentResult, err := l.svcCtx.K8sRpc[cloud.ParticipantId].DeploymentDetail(context.Background(), &kubernetesclient.DeploymentDetailReq{
+				Namespace: cloud.Namespace,
+				Name:      cloud.Name,
+			})
+			if err != nil {
+				return nil, err
+			}
+			// list the pods associated with the deployment via its app label selector
+
+			uid := deploymentResult.Deployment.Spec.Selector.MatchLabels["app"]
+			LabelSelector := "app=" + uid
+			podResp, err := l.svcCtx.K8sRpc[cloud.ParticipantId].PodList(context.Background(), &kubernetesclient.PodListReq{
+				ListOptions: &kubernetesclient.ListOptions{
+					LabelSelector: &LabelSelector,
+				},
+			})
+			if err != nil {
+				return nil, err
+			}
+			pods = podResp.PodList.Items
+		}
+		podsMetrics(metricsUrl, pods, resp)
+
+	}
+	return resp, nil
+}
+
+func podsMetrics(metricsUrl string, pods []*kubernetesclient.Pod, resp *types.TaskDetailResp) {
+	// query the resource usage of each pod from the participant's Prometheus
+	for _, pod := range pods {
+		prometheusClient, _ := tracker.NewPrometheus(metricsUrl)
+		// cpu limit
+		podCpuLimit := prometheusClient.GetNamedMetrics([]string{"pod_cpu_resource_limits"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.CpuLimit = metricAdd(resp.CpuLimit, podCpuLimit)
+		// cpu usage
+		podCpuUsage := prometheusClient.GetNamedMetrics([]string{"pod_cpu_usage"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.CpuCores = metricAdd(resp.CpuCores, podCpuUsage)
+		// memory usage
+		podMemoryUsage := prometheusClient.GetNamedMetrics([]string{"pod_memory_usage"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.MemoryTotal = metricAdd(resp.MemoryTotal, podMemoryUsage)
+		// memory limit
+		podMemoryLimit := prometheusClient.GetNamedMetrics([]string{"pod_memory_resource_limits"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.MemoryLimit = metricAdd(resp.MemoryLimit, podMemoryLimit)
+	}
+}
+
+func metricAdd(z float64, metric []tracker.Metric) float64 {
+	if len(metric) > 0 && len(metric[0].MetricValues) > 0 {
+		z = z + metric[0].MetricValues[0].Sample.Value()
+	}
+	return z
 }
diff --git a/api/internal/types/types.go b/api/internal/types/types.go
index c2558392..555673f2 100644
--- a/api/internal/types/types.go
+++ b/api/internal/types/types.go
@@ -158,9 +158,7 @@ type DeleteTaskReq struct {
 
 type ScheduleTaskByYamlReq struct {
 	Name        string     `yaml:"name"`
-	Synergy     string     `yaml:"synergy"`
 	Description string     `yaml:"description"`
-	Strategy    string     `yaml:"strategy"`
TenantId int64 `yaml:"tenantId"` Tasks []TaskYaml `yaml:"tasks"` } @@ -236,6 +234,12 @@ type TaskDetailReq struct { } type TaskDetailResp struct { + CpuCores float64 `json:"cpuCores"` + CpuRate float64 `json:"cpuRate"` + CpuLimit float64 `json:"cpuLimit"` + MemoryTotal float64 `json:"memoryTotal"` + MemoryRate float64 `json:"memoryRate"` + MemoryLimit float64 `json:"memoryLimit"` } type ListCenterResp struct { diff --git a/go.mod b/go.mod index 1ac1f7ec..c1c81dfd 100644 --- a/go.mod +++ b/go.mod @@ -13,9 +13,11 @@ require ( github.com/go-resty/resty/v2 v2.7.0 github.com/go-sql-driver/mysql v1.7.1 github.com/jinzhu/copier v0.3.5 + github.com/json-iterator/go v1.1.12 github.com/nacos-group/nacos-sdk-go/v2 v2.2.3 github.com/pkg/errors v0.9.1 github.com/prometheus/client_golang v1.16.0 + github.com/prometheus/common v0.44.0 github.com/redis/go-redis/v9 v9.2.1 github.com/robfig/cron/v3 v3.0.1 github.com/shopspring/decimal v1.3.1 @@ -23,7 +25,7 @@ require ( github.com/zeromicro/go-zero v1.5.5 gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 - gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef + gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1 gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231011071802-c6a7637b74e4 gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231024105731-cbdceff549c9 @@ -79,7 +81,6 @@ require ( github.com/jinzhu/now v1.1.5 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/josharian/intern v1.0.0 // indirect - github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.15.15 // indirect github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect @@ -96,7 +97,6 @@ require ( github.com/pelletier/go-toml/v2 v2.1.0 // indirect github.com/pierrec/lz4/v4 v4.1.17 // indirect github.com/prometheus/client_model v0.4.0 // indirect - github.com/prometheus/common v0.44.0 // indirect github.com/prometheus/procfs v0.10.1 // indirect github.com/segmentio/kafka-go v0.4.38 // indirect github.com/spaolacci/murmur3 v1.1.0 // indirect diff --git a/go.sum b/go.sum index 59447806..68c31a2f 100644 --- a/go.sum +++ b/go.sum @@ -1037,8 +1037,8 @@ gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 h1 gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 h1:WDCPqD8IrepGJXankkpG14Ny6inh9AldB0RX9WWa+ck= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30= -gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef h1:s7JfXjka2MhGaDjKMJ57fj0k3XuDB6w+UlYHFLyJlUY= -gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef/go.mod h1:SFpXY1gy1ELYTo4P6EU68nTL2vKu1ZuH7nKecL16hJk= +gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a h1:12PKPeDecCY7IfPp9JZmdMEIoJn6fF1oT4WW4ZKEUFM= +gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a/go.mod h1:xtSfvDUd+jJhqHBBibZ1d9/ud3oN9nxaNYYHwz+CDgY= 
gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1 h1:4Ibzcl4waYiHO3tdbqvcLUWEoV51ZaJhZBi7T518AA8= gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1/go.mod h1:pisJKAI8FRFFUcBaH3Gob+ENXWRM97rpuYmv9s1raag= gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231011071802-c6a7637b74e4 h1:iv78VZ5+j6/VNkEyD/GSmTJ96rpxzpKDUNknAoXsAmg= diff --git a/pkg/tracker/interface.go b/pkg/tracker/interface.go index 21b5015d..061f1d45 100644 --- a/pkg/tracker/interface.go +++ b/pkg/tracker/interface.go @@ -1,25 +1,11 @@ package tracker -/* -Copyright 2020 KubeSphere Authors - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ +import "time" type Interface interface { //GetMetric(expr string, time time.Time) Metric //GetMetricOverTime(expr string, start, end time.Time, step time.Duration) Metric - //GetNamedMetrics(metrics []string, time time.Time, opt QueryOption) []Metric + GetNamedMetrics(metrics []string, time time.Time, opt QueryOption) []Metric //GetNamedMetricsOverTime(metrics []string, start, end time.Time, step time.Duration, opt QueryOption) []Metric //GetMetadata(namespace string) []Metadata //GetMetricLabelSet(expr string, start, end time.Time) []map[string]string diff --git a/pkg/tracker/promql.go b/pkg/tracker/promql.go new file mode 100644 index 00000000..f5f0e432 --- /dev/null +++ b/pkg/tracker/promql.go @@ -0,0 +1,486 @@ +package tracker + +import ( + "fmt" + "strings" +) + +const ( + StatefulSet = "StatefulSet" + DaemonSet = "DaemonSet" + Deployment = "Deployment" +) + +var promQLTemplates = map[string]string{ + //cluster + "cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m", + "cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`, + "cluster_cpu_total": "sum(node:node_num_cpu:sum)", + "cluster_memory_utilisation": ":node_memory_utilisation:", + "cluster_memory_available": "sum(node:node_memory_bytes_available:sum)", + "cluster_memory_total": "sum(node:node_memory_bytes_total:sum)", + "cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)", + "cluster_net_utilisation": ":node_net_utilisation:sum_irate", + "cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)", + "cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)", + "cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)", + "cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)", + "cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)", + "cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)", + "cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`, + "cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`, + "cluster_disk_size_capacity": 
`sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`, + "cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`, + "cluster_disk_inode_total": `sum(node:node_inodes_total:)`, + "cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`, + "cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`, + "cluster_namespace_count": `count(kube_namespace_labels)`, + "cluster_pod_count": `cluster:pod:sum`, + "cluster_pod_quota": `sum(max(kube_node_status_capacity{resource="pods"}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`, + "cluster_pod_utilisation": `cluster:pod_utilization:ratio`, + "cluster_pod_running_count": `cluster:pod_running:count`, + "cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`, + "cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`, + "cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`, + "cluster_node_offline": `cluster:node_offline:sum`, + "cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`, + "cluster_cronjob_count": `sum(kube_cronjob_labels)`, + "cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`, + "cluster_daemonset_count": `sum(kube_daemonset_labels)`, + "cluster_deployment_count": `sum(kube_deployment_labels)`, + "cluster_endpoint_count": `sum(kube_endpoint_labels)`, + "cluster_hpa_count": `sum(kube_horizontalpodautoscaler_labels)`, + "cluster_job_count": `sum(kube_job_labels)`, + "cluster_statefulset_count": `sum(kube_statefulset_labels)`, + "cluster_replicaset_count": `count(kube_replicaset_labels)`, + "cluster_service_count": `sum(kube_service_info)`, + "cluster_secret_count": `sum(kube_secret_info)`, + "cluster_pv_count": `sum(kube_persistentvolume_labels)`, + "cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`, + "cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, + "cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, + "cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, + "cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`, + "cluster_node_offline_ratio": `cluster:node_offline:ratio`, + + //node + "node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}", + "node_cpu_total": "node:node_num_cpu:sum{$1}", + "node_memory_utilisation": "node:node_memory_utilisation:{$1}", + "node_memory_available": "node:node_memory_bytes_available:sum{$1}", + "node_memory_total": "node:node_memory_bytes_total:sum{$1}", + "node_memory_usage_wo_cache": "node:node_memory_bytes_total:sum{$1} - node:node_memory_bytes_available:sum{$1}", + "node_net_utilisation": "node:node_net_utilisation:sum_irate{$1}", + "node_net_bytes_transmitted": "node:node_net_bytes_transmitted:sum_irate{$1}", + "node_net_bytes_received": "node:node_net_bytes_received:sum_irate{$1}", + "node_disk_read_iops": "node:data_volume_iops_reads:sum{$1}", + "node_disk_write_iops": "node:data_volume_iops_writes:sum{$1}", + "node_disk_read_throughput": "node:data_volume_throughput_bytes_read:sum{$1}", + "node_disk_write_throughput": 
"node:data_volume_throughput_bytes_written:sum{$1}", + "node_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`, + "node_disk_size_available": `node:disk_space_available:{$1}`, + "node_disk_size_usage": `sum(max((node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`, + "node_disk_size_utilisation": `node:disk_space_utilization:ratio{$1}`, + "node_disk_inode_total": `node:node_inodes_total:{$1}`, + "node_disk_inode_usage": `node:node_inodes_total:{$1} - node:node_inodes_free:{$1}`, + "node_disk_inode_utilisation": `node:disk_inode_utilization:ratio{$1}`, + "node_pod_count": `node:pod_count:sum{$1}`, + "node_pod_quota": `max(kube_node_status_capacity{resource="pods",$1}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0)`, + "node_pod_utilisation": `node:pod_utilization:ratio{$1}`, + "node_pod_running_count": `node:pod_running:count{$1}`, + "node_pod_succeeded_count": `node:pod_succeeded:count{$1}`, + "node_pod_abnormal_count": `node:pod_abnormal:count{$1}`, + "node_cpu_usage": `round(node:node_cpu_utilisation:avg1m{$1} * node:node_num_cpu:sum{$1}, 0.001)`, + "node_load1": `node:load1:ratio{$1}`, + "node_load5": `node:load5:ratio{$1}`, + "node_load15": `node:load15:ratio{$1}`, + "node_pod_abnormal_ratio": `node:pod_abnormal:ratio{$1}`, + "node_pleg_quantile": `node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{$1}`, + + "node_device_size_usage": `sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`, + "node_device_size_utilisation": `1 - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) / sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`, + + // workspace + "workspace_cpu_usage": `round(sum by (workspace) (namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}), 0.001)`, + "workspace_memory_usage": `sum by (workspace) (namespace:container_memory_usage_bytes:sum{namespace!="", $1})`, + "workspace_memory_usage_wo_cache": `sum by (workspace) (namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1})`, + "workspace_net_bytes_transmitted": `sum by (workspace) (sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) 
(kube_namespace_labels{$1} * 0)`, + "workspace_net_bytes_received": `sum by (workspace) (sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`, + "workspace_pod_count": `sum by (workspace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`, + "workspace_pod_running_count": `sum by (workspace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`, + "workspace_pod_succeeded_count": `sum by (workspace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`, + "workspace_pod_abnormal_count": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_ingresses_extensions_count": `sum by (workspace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_cronjob_count": `sum by (workspace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_pvc_count": `sum by (workspace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_daemonset_count": `sum by (workspace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_deployment_count": `sum by (workspace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_endpoint_count": `sum by (workspace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_hpa_count": `sum by (workspace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_job_count": `sum by (workspace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_statefulset_count": `sum by (workspace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_replicaset_count": `count by (workspace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_service_count": `sum by (workspace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_secret_count": `sum by (workspace) (kube_secret_info{namespace!=""} * on 
(namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + "workspace_pod_abnormal_ratio": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) / sum by (workspace) (kube_pod_status_phase{phase!="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`, + + //namespace + "namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`, + "namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`, + "namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`, + "namespace_net_bytes_transmitted": `sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`, + "namespace_net_bytes_received": `sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`, + "namespace_pod_count": `sum by (namespace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`, + "namespace_pod_running_count": `sum by (namespace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`, + "namespace_pod_succeeded_count": `sum by (namespace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`, + "namespace_pod_abnormal_count": `namespace:pod_abnormal:count{namespace!="", $1}`, + "namespace_pod_abnormal_ratio": `namespace:pod_abnormal:ratio{namespace!="", $1}`, + "namespace_memory_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.memory"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_cpu_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.cpu"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_pod_count_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="count/pods"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_cronjob_count": `sum by (namespace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + 
"namespace_pvc_count": `sum by (namespace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_daemonset_count": `sum by (namespace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_deployment_count": `sum by (namespace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_endpoint_count": `sum by (namespace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_hpa_count": `sum by (namespace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_job_count": `sum by (namespace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_statefulset_count": `sum by (namespace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_replicaset_count": `count by (namespace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_service_count": `sum by (namespace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_secret_count": `sum by (namespace) (kube_secret_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_configmap_count": `sum by (namespace) (kube_configmap_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + "namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, + + // ingress + "ingress_request_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2}[$3])))`, + "ingress_request_4xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[4].*"}[$3])))`, + "ingress_request_5xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[5].*"}[$3])))`, + "ingress_active_connections": `sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{$2,state="active"}[$3]))`, + "ingress_success_rate": `sum(rate(nginx_ingress_controller_requests{$1,$2,status!~"[4-5].*"}[$3])) / sum(rate(nginx_ingress_controller_requests{$1,$2}[$3]))`, + "ingress_request_duration_average": `sum_over_time(nginx_ingress_controller_request_duration_seconds_sum{$1,$2}[$3])/sum_over_time(nginx_ingress_controller_request_duration_seconds_count{$1,$2}[$3])`, + "ingress_request_duration_50percentage": `histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`, + "ingress_request_duration_95percentage": `histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`, + "ingress_request_duration_99percentage": `histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`, + "ingress_request_volume": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])), 0.001)`, + "ingress_request_volume_by_ingress": 
`round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])) by (ingress), 0.001)`, + "ingress_request_network_sent": `sum(irate(nginx_ingress_controller_response_size_sum{$1,$2}[$3]))`, + "ingress_request_network_received": `sum(irate(nginx_ingress_controller_request_size_sum{$1,$2}[$3]))`, + "ingress_request_memory_bytes": `avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{$2})`, + "ingress_request_cpu_usage": `avg(rate(nginx_ingress_controller_nginx_process_cpu_seconds_total{$2}[5m]))`, + + // workload + "workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`, + "workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`, + "workload_memory_usage_wo_cache": `namespace:workload_memory_usage_wo_cache:sum{$1}`, + "workload_net_bytes_transmitted": `namespace:workload_net_bytes_transmitted:sum_irate{$1}`, + "workload_net_bytes_received": `namespace:workload_net_bytes_received:sum_irate{$1}`, + + "workload_deployment_replica": `label_join(sum (label_join(label_replace(kube_deployment_spec_replicas{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_deployment_replica_available": `label_join(sum (label_join(label_replace(kube_deployment_status_replicas_available{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_statefulset_replica": `label_join(sum (label_join(label_replace(kube_statefulset_replicas{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_statefulset_replica_available": `label_join(sum (label_join(label_replace(kube_statefulset_status_replicas_current{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_daemonset_replica": `label_join(sum (label_join(label_replace(kube_daemonset_status_desired_number_scheduled{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_daemonset_replica_available": `label_join(sum (label_join(label_replace(kube_daemonset_status_number_available{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`, + "workload_deployment_unavailable_replicas_ratio": `namespace:deployment_unavailable_replicas:ratio{$1}`, + "workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`, + "workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`, + + // pod + "pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`, + "pod_memory_usage": `sum by (namespace, pod) (container_memory_usage_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + "pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", 
pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + "pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + "pod_net_bytes_received": `sum by (namespace, pod) (irate(container_network_receive_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + "pod_cpu_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="cpu",unit="core"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + "pod_memory_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="memory",unit="byte"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, + + // container + "container_cpu_usage": `round(sum by (namespace, pod, container) (irate(container_cpu_usage_seconds_total{job="kubelet", container!="POD", container!="", image!="", $1}[5m])), 0.001)`, + "container_memory_usage": `sum by (namespace, pod, container) (container_memory_usage_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`, + "container_memory_usage_wo_cache": `sum by (namespace, pod, container) (container_memory_working_set_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`, + "container_processes_usage": `sum by (namespace, pod, container) (container_processes{job="kubelet", container!="POD", container!="", image!="", $1})`, + "container_threads_usage": `sum by (namespace, pod, container) (container_threads {job="kubelet", container!="POD", container!="", image!="", $1})`, + + // pvc + "pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_bytes_total": `max 
by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + "pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, + + // component + "etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`, + "etcd_server_total": `count(up{job="etcd"})`, + "etcd_server_up_total": `etcd:up:sum`, + "etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`, + "etcd_server_is_leader": `label_replace(etcd_server_is_leader, "node_ip", "$1", "instance", "(.*):.*")`, + "etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`, + "etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`, + "etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`, + "etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`, + "etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`, + "etcd_mvcc_db_size": `avg(etcd:etcd_mvcc_db_total_size:sum)`, + "etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`, + "etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`, + "etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`, + "etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`, + "etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`, + "etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`, + "etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`, + "etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`, + "etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`, + "etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`, + + "apiserver_up_sum": `apiserver:up:sum`, + "apiserver_request_rate": `apiserver:apiserver_request_total:sum_irate`, + "apiserver_request_by_verb_rate": `apiserver:apiserver_request_total:sum_verb_irate`, + "apiserver_request_latencies": `apiserver:apiserver_request_duration:avg`, + "apiserver_request_by_verb_latencies": `apiserver:apiserver_request_duration:avg_by_verb`, + + "scheduler_up_sum": `scheduler:up:sum`, + "scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`, + "scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`, + "scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_duration:avg`, + "scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_duration:histogram_quantile`, +} + +func makeExpr(metric string, opts QueryOptions) string { + tmpl := promQLTemplates[metric] + switch opts.Level { + case LevelCluster: + return tmpl + case LevelNode: + return makeNodeMetricExpr(tmpl, opts) + case LevelWorkspace: + return makeWorkspaceMetricExpr(tmpl, opts) + case LevelNamespace: + return makeNamespaceMetricExpr(tmpl, opts) + case LevelWorkload: + return makeWorkloadMetricExpr(metric, tmpl, opts) + 
case LevelPod: + return makePodMetricExpr(tmpl, opts) + case LevelContainer: + return makeContainerMetricExpr(tmpl, opts) + case LevelPVC: + return makePVCMetricExpr(tmpl, opts) + case LevelIngress: + return makeIngressMetricExpr(tmpl, opts) + case LevelComponent: + return tmpl + default: + return tmpl + } +} + +func makeNodeMetricExpr(tmpl string, o QueryOptions) string { + var nodeSelector string + if o.NodeName != "" { + nodeSelector = fmt.Sprintf(`node="%s"`, o.NodeName) + } else { + nodeSelector = fmt.Sprintf(`node=~"%s"`, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", nodeSelector, -1) +} + +func makeWorkspaceMetricExpr(tmpl string, o QueryOptions) string { + var workspaceSelector string + if o.WorkspaceName != "" { + workspaceSelector = fmt.Sprintf(`workspace="%s"`, o.WorkspaceName) + } else { + workspaceSelector = fmt.Sprintf(`workspace=~"%s", workspace!=""`, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", workspaceSelector, -1) +} + +func makeNamespaceMetricExpr(tmpl string, o QueryOptions) string { + var namespaceSelector string + + // For monitoring namespaces in the specific workspace + // GET /workspaces/{workspace}/namespaces + if o.WorkspaceName != "" { + namespaceSelector = fmt.Sprintf(`workspace="%s", namespace=~"%s"`, o.WorkspaceName, o.ResourceFilter) + return strings.Replace(tmpl, "$1", namespaceSelector, -1) + } + + // For monitoring the specific namespaces + // GET /namespaces/{namespace} or + // GET /namespaces + if o.NamespaceName != "" { + namespaceSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName) + } else { + namespaceSelector = fmt.Sprintf(`namespace=~"%s"`, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", namespaceSelector, -1) +} + +func makeWorkloadMetricExpr(metric, tmpl string, o QueryOptions) string { + var kindSelector, workloadSelector string + + switch o.WorkloadKind { + case "deployment": + o.WorkloadKind = Deployment + case "statefulset": + o.WorkloadKind = StatefulSet + case "daemonset": + o.WorkloadKind = DaemonSet + default: + o.WorkloadKind = ".*" + } + workloadSelector = fmt.Sprintf(`namespace="%s", workload=~"%s:(%s)"`, o.NamespaceName, o.WorkloadKind, o.ResourceFilter) + + if strings.Contains(metric, "deployment") { + kindSelector = fmt.Sprintf(`namespace="%s", deployment!="", deployment=~"%s"`, o.NamespaceName, o.ResourceFilter) + } + if strings.Contains(metric, "statefulset") { + kindSelector = fmt.Sprintf(`namespace="%s", statefulset!="", statefulset=~"%s"`, o.NamespaceName, o.ResourceFilter) + } + if strings.Contains(metric, "daemonset") { + kindSelector = fmt.Sprintf(`namespace="%s", daemonset!="", daemonset=~"%s"`, o.NamespaceName, o.ResourceFilter) + } + + return strings.NewReplacer("$1", workloadSelector, "$2", kindSelector).Replace(tmpl) +} + +func makePodMetricExpr(tmpl string, o QueryOptions) string { + var podSelector, workloadSelector string + + // For monitoriong pods of the specific workload + // GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods + if o.WorkloadName != "" { + switch o.WorkloadKind { + case "deployment": + workloadSelector = fmt.Sprintf(`owner_kind="ReplicaSet", owner_name=~"^%s-[^-]{1,10}$"`, o.WorkloadName) + case "statefulset": + workloadSelector = fmt.Sprintf(`owner_kind="StatefulSet", owner_name="%s"`, o.WorkloadName) + case "daemonset": + workloadSelector = fmt.Sprintf(`owner_kind="DaemonSet", owner_name="%s"`, o.WorkloadName) + } + } + + // For monitoring pods in the specific namespace + // GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods or + // 
GET /namespaces/{namespace}/pods/{pod} or + // GET /namespaces/{namespace}/pods + if o.NamespaceName != "" { + if o.PodName != "" { + podSelector = fmt.Sprintf(`pod="%s", namespace="%s"`, o.PodName, o.NamespaceName) + } else { + podSelector = fmt.Sprintf(`pod=~"%s", namespace="%s"`, o.ResourceFilter, o.NamespaceName) + } + } else { + var namespaces, pods []string + if o.NamespacedResourcesFilter != "" { + for _, np := range strings.Split(o.NamespacedResourcesFilter, "|") { + if nparr := strings.SplitN(np, "/", 2); len(nparr) > 1 { + namespaces = append(namespaces, nparr[0]) + pods = append(pods, nparr[1]) + } else { + pods = append(pods, np) + } + } + } + // For monitoring pods on the specific node + // GET /nodes/{node}/pods/{pod} + // GET /nodes/{node}/pods + if o.NodeName != "" { + if o.PodName != "" { + if nparr := strings.SplitN(o.PodName, "/", 2); len(nparr) > 1 { + podSelector = fmt.Sprintf(`namespace="%s",pod="%s", node="%s"`, nparr[0], nparr[1], o.NodeName) + } else { + podSelector = fmt.Sprintf(`pod="%s", node="%s"`, o.PodName, o.NodeName) + } + } else { + var ps []string + ps = append(ps, fmt.Sprintf(`node="%s"`, o.NodeName)) + if o.ResourceFilter != "" { + ps = append(ps, fmt.Sprintf(`pod=~"%s"`, o.ResourceFilter)) + } + + if len(namespaces) > 0 { + ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|"))) + } + if len(pods) > 0 { + ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|"))) + } + podSelector = strings.Join(ps, ",") + } + } else { + // For monitoring pods in the whole cluster + // Get /pods + var ps []string + if len(namespaces) > 0 { + ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|"))) + } + if len(pods) > 0 { + ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|"))) + } + if len(ps) > 0 { + podSelector = strings.Join(ps, ",") + } + } + } + + return strings.NewReplacer("$1", workloadSelector, "$2", podSelector).Replace(tmpl) +} + +func makeContainerMetricExpr(tmpl string, o QueryOptions) string { + var containerSelector string + if o.ContainerName != "" { + containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container="%s"`, o.PodName, o.NamespaceName, o.ContainerName) + } else { + containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container=~"%s"`, o.PodName, o.NamespaceName, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", containerSelector, -1) +} + +func makePVCMetricExpr(tmpl string, o QueryOptions) string { + var pvcSelector string + + // For monitoring persistentvolumeclaims in the specific namespace + // GET /namespaces/{namespace}/persistentvolumeclaims/{persistentvolumeclaim} or + // GET /namespaces/{namespace}/persistentvolumeclaims + if o.NamespaceName != "" { + if o.PersistentVolumeClaimName != "" { + pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim="%s"`, o.NamespaceName, o.PersistentVolumeClaimName) + } else { + pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim=~"%s"`, o.NamespaceName, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", pvcSelector, -1) + } + + // For monitoring persistentvolumeclaims of the specific storageclass + // GET /storageclasses/{storageclass}/persistentvolumeclaims + if o.StorageClassName != "" { + pvcSelector = fmt.Sprintf(`storageclass="%s", persistentvolumeclaim=~"%s"`, o.StorageClassName, o.ResourceFilter) + } + return strings.Replace(tmpl, "$1", pvcSelector, -1) +} + +func makeIngressMetricExpr(tmpl string, o QueryOptions) string { + var ingressSelector string + var jobSelector string + 
duration := "5m" + + // parse Range Vector Selectors metric{key=value}[duration] + if o.Duration != nil { + duration = o.Duration.String() + } + + // job is a reqiuried filter + // GET /namespaces/{namespace}/ingress?job=xxx&pod=xxx + if o.Job != "" { + jobSelector = fmt.Sprintf(`job="%s"`, o.Job) + if o.PodName != "" { + jobSelector = fmt.Sprintf(`%s,controller_pod="%s"`, jobSelector, o.PodName) + } + } + + tmpl = strings.Replace(tmpl, "$1", ingressSelector, -1) + tmpl = strings.Replace(tmpl, "$2", jobSelector, -1) + return strings.Replace(tmpl, "$3", duration, -1) +} diff --git a/pkg/tracker/queryoptions.go b/pkg/tracker/queryoptions.go new file mode 100644 index 00000000..90528a1d --- /dev/null +++ b/pkg/tracker/queryoptions.go @@ -0,0 +1,318 @@ +package tracker + +import ( + "fmt" + "strings" + "time" +) + +type Level int + +const ( + LevelCluster = 1 << iota + LevelNode + LevelWorkspace + LevelNamespace + LevelApplication + LevelOpenpitrix + LevelWorkload + LevelService + LevelPod + LevelContainer + LevelPVC + LevelComponent + LevelIngress +) + +var MeteringLevelMap = map[string]int{ + "LevelCluster": LevelCluster, + "LevelNode": LevelNode, + "LevelWorkspace": LevelWorkspace, + "LevelNamespace": LevelNamespace, + "LevelApplication": LevelApplication, + "LevelWorkload": LevelWorkload, + "LevelService": LevelService, + "LevelPod": LevelPod, + "LevelContainer": LevelContainer, + "LevelPVC": LevelPVC, + "LevelComponent": LevelComponent, +} + +type QueryOption interface { + Apply(*QueryOptions) +} + +type Meteroptions struct { + Start time.Time + End time.Time + Step time.Duration +} + +type QueryOptions struct { + Level Level + + NamespacedResourcesFilter string + QueryType string + ResourceFilter string + NodeName string + WorkspaceName string + NamespaceName string + WorkloadKind string + WorkloadName string + PodName string + ContainerName string + StorageClassName string + PersistentVolumeClaimName string + PVCFilter string + ApplicationName string + ServiceName string + Ingress string + Job string + Duration *time.Duration + MeterOptions *Meteroptions +} + +func NewQueryOptions() *QueryOptions { + return &QueryOptions{} +} + +type ClusterOption struct{} + +func (_ ClusterOption) Apply(o *QueryOptions) { + o.Level = LevelCluster +} + +type NodeOption struct { + ResourceFilter string + NodeName string + PVCFilter string + StorageClassName string + QueryType string +} + +func (no NodeOption) Apply(o *QueryOptions) { + o.Level = LevelNode + o.ResourceFilter = no.ResourceFilter + o.NodeName = no.NodeName + o.PVCFilter = no.PVCFilter + o.StorageClassName = no.StorageClassName + o.QueryType = no.QueryType +} + +type WorkspaceOption struct { + ResourceFilter string + WorkspaceName string + PVCFilter string + StorageClassName string +} + +func (wo WorkspaceOption) Apply(o *QueryOptions) { + o.Level = LevelWorkspace + o.ResourceFilter = wo.ResourceFilter + o.WorkspaceName = wo.WorkspaceName + o.PVCFilter = wo.PVCFilter + o.StorageClassName = wo.StorageClassName +} + +type NamespaceOption struct { + ResourceFilter string + WorkspaceName string + NamespaceName string + PVCFilter string + StorageClassName string +} + +func (no NamespaceOption) Apply(o *QueryOptions) { + o.Level = LevelNamespace + o.ResourceFilter = no.ResourceFilter + o.WorkspaceName = no.WorkspaceName + o.NamespaceName = no.NamespaceName + o.PVCFilter = no.PVCFilter + o.StorageClassName = no.StorageClassName +} + +type ApplicationsOption struct { + NamespaceName string + Applications []string + StorageClassName string +} + +func 
(aso ApplicationsOption) Apply(o *QueryOptions) { + // nothing should be done + //nolint:gosimple + return +} + +type OpenpitrixsOption struct { + Cluster string + NamespaceName string + Openpitrixs []string + StorageClassName string +} + +func (oso OpenpitrixsOption) Apply(o *QueryOptions) { + // nothing should be done + //nolint:gosimple + return +} + +// ApplicationsOption & OpenpitrixsOption share the same ApplicationOption struct +type ApplicationOption struct { + NamespaceName string + Application string + ApplicationComponents []string + StorageClassName string +} + +func (ao ApplicationOption) Apply(o *QueryOptions) { + o.Level = LevelApplication + o.NamespaceName = ao.NamespaceName + o.ApplicationName = ao.Application + o.StorageClassName = ao.StorageClassName + + app_components := strings.Join(ao.ApplicationComponents[:], "|") + + if len(app_components) > 0 { + o.ResourceFilter = fmt.Sprintf(`namespace="%s", workload=~"%s"`, o.NamespaceName, app_components) + } else { + o.ResourceFilter = fmt.Sprintf(`namespace="%s", workload=~"%s"`, o.NamespaceName, ".*") + } +} + +type WorkloadOption struct { + ResourceFilter string + NamespaceName string + WorkloadKind string +} + +func (wo WorkloadOption) Apply(o *QueryOptions) { + o.Level = LevelWorkload + o.ResourceFilter = wo.ResourceFilter + o.NamespaceName = wo.NamespaceName + o.WorkloadKind = wo.WorkloadKind +} + +type ServicesOption struct { + NamespaceName string + Services []string +} + +func (sso ServicesOption) Apply(o *QueryOptions) { + // nothing should be done + //nolint:gosimple + return +} + +type ServiceOption struct { + ResourceFilter string + NamespaceName string + ServiceName string + PodNames []string +} + +func (so ServiceOption) Apply(o *QueryOptions) { + o.Level = LevelService + o.NamespaceName = so.NamespaceName + o.ServiceName = so.ServiceName + + pod_names := strings.Join(so.PodNames, "|") + + if len(pod_names) > 0 { + o.ResourceFilter = fmt.Sprintf(`pod=~"%s", namespace="%s"`, pod_names, o.NamespaceName) + } else { + o.ResourceFilter = fmt.Sprintf(`pod=~"%s", namespace="%s"`, ".*", o.NamespaceName) + } +} + +type PodOption struct { + NamespacedResourcesFilter string + ResourceFilter string + NodeName string + NamespaceName string + WorkloadKind string + WorkloadName string + PodName string +} + +func (po PodOption) Apply(o *QueryOptions) { + o.Level = LevelPod + o.NamespacedResourcesFilter = po.NamespacedResourcesFilter + o.ResourceFilter = po.ResourceFilter + o.NodeName = po.NodeName + o.NamespaceName = po.NamespaceName + o.WorkloadKind = po.WorkloadKind + o.WorkloadName = po.WorkloadName + o.PodName = po.PodName +} + +type ContainerOption struct { + ResourceFilter string + NamespaceName string + PodName string + ContainerName string +} + +func (co ContainerOption) Apply(o *QueryOptions) { + o.Level = LevelContainer + o.ResourceFilter = co.ResourceFilter + o.NamespaceName = co.NamespaceName + o.PodName = co.PodName + o.ContainerName = co.ContainerName +} + +type PVCOption struct { + ResourceFilter string + NamespaceName string + StorageClassName string + PersistentVolumeClaimName string +} + +func (po PVCOption) Apply(o *QueryOptions) { + o.Level = LevelPVC + o.ResourceFilter = po.ResourceFilter + o.NamespaceName = po.NamespaceName + o.StorageClassName = po.StorageClassName + o.PersistentVolumeClaimName = po.PersistentVolumeClaimName + + // for meter + o.PVCFilter = po.PersistentVolumeClaimName +} + +type IngressOption struct { + ResourceFilter string + NamespaceName string + Ingress string + Job string + Pod 
string
+	Duration       *time.Duration
+}
+
+func (no IngressOption) Apply(o *QueryOptions) {
+	o.Level = LevelIngress
+	o.ResourceFilter = no.ResourceFilter
+	o.NamespaceName = no.NamespaceName
+	o.Ingress = no.Ingress
+	o.Job = no.Job
+	o.PodName = no.Pod
+	o.Duration = no.Duration
+}
+
+type ComponentOption struct{}
+
+func (_ ComponentOption) Apply(o *QueryOptions) {
+	o.Level = LevelComponent
+}
+
+type MeterOption struct {
+	Start time.Time
+	End   time.Time
+	Step  time.Duration
+}
+
+func (mo MeterOption) Apply(o *QueryOptions) {
+	o.MeterOptions = &Meteroptions{
+		Start: mo.Start,
+		End:   mo.End,
+		Step:  mo.Step,
+	}
+}
diff --git a/pkg/tracker/tracker.go b/pkg/tracker/tracker.go
index 8fdb0f6a..b13a9e3c 100644
--- a/pkg/tracker/tracker.go
+++ b/pkg/tracker/tracker.go
@@ -1,14 +1,20 @@
 package tracker
 
 import (
+	"context"
 	"github.com/prometheus/client_golang/api"
 	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/model"
+	"strings"
+	"sync"
+	"time"
 )
 
 type prometheus struct {
 	client v1.API
 }
 
+// NewPrometheus initializes the Prometheus client
 func NewPrometheus(address string) (Interface, error) {
 	cfg := api.Config{
 		Address: address,
@@ -17,3 +23,83 @@
 	client, err := api.NewClient(cfg)
 	return prometheus{client: v1.NewAPI(client)}, err
 }
+
+func (p prometheus) GetNamedMetrics(metrics []string, ts time.Time, o QueryOption) []Metric {
+	var res []Metric
+	var mtx sync.Mutex
+	var wg sync.WaitGroup
+
+	opts := NewQueryOptions()
+	o.Apply(opts)
+
+	for _, metric := range metrics {
+		wg.Add(1)
+		go func(metric string) {
+			parsedResp := Metric{MetricName: metric}
+
+			value, _, err := p.client.Query(context.Background(), makeExpr(metric, *opts), ts)
+			if err != nil {
+				parsedResp.Error = err.Error()
+			} else {
+				parsedResp.MetricData = parseQueryResp(value, genMetricFilter(o))
+			}
+
+			mtx.Lock()
+			res = append(res, parsedResp)
+			mtx.Unlock()
+
+			wg.Done()
+		}(metric)
+	}
+
+	wg.Wait()
+
+	return res
+}
+func parseQueryResp(value model.Value, metricFilter func(metric model.Metric) bool) MetricData {
+	res := MetricData{MetricType: MetricTypeVector}
+
+	data, _ := value.(model.Vector)
+
+	for _, v := range data {
+		if metricFilter != nil && !metricFilter(v.Metric) {
+			continue
+		}
+		mv := MetricValue{
+			Metadata: make(map[string]string),
+		}
+
+		for k, v := range v.Metric {
+			mv.Metadata[string(k)] = string(v)
+		}
+
+		mv.Sample = &Point{float64(v.Timestamp) / 1000, float64(v.Value)}
+
+		res.MetricValues = append(res.MetricValues, mv)
+	}
+
+	return res
+}
+
+func genMetricFilter(o QueryOption) func(metric model.Metric) bool {
+	if o != nil {
+		if po, ok := o.(PodOption); ok {
+			if po.NamespacedResourcesFilter != "" {
+				namespacedPodsMap := make(map[string]struct{})
+				for _, s := range strings.Split(po.NamespacedResourcesFilter, "|") {
+					namespacedPodsMap[s] = struct{}{}
+				}
+				return func(metric model.Metric) bool {
+					if len(metric) == 0 {
+						return false
+					}
+					_, ok := namespacedPodsMap[string(metric["namespace"])+"/"+string(metric["pod"])]
+					return ok
+				}
+			}
+		}
+	}
+	return func(metric model.Metric) bool {
+		return true
+	}
+}
diff --git a/pkg/tracker/tracker_test.go b/pkg/tracker/tracker_test.go
new file mode 100644
index 00000000..c0ff3e76
--- /dev/null
+++ b/pkg/tracker/tracker_test.go
@@ -0,0 +1,17 @@
+package tracker
+
+import (
+	"testing"
+	"time"
+)
+
+func TestGetNamedMetrics(t *testing.T) {
+	client, _ := NewPrometheus("http://10.101.15.3:32585")
+	result := 
client.GetNamedMetrics([]string{"pod_cpu_resource_limits"}, time.Now(), PodOption{
+
+		PodName:       "prometheus-k8s-0",
+		NamespaceName: "monitoring-system",
+	})
+	t.Logf("pod_cpu_resource_limits sample: %v", result[0].MetricValues[0].Sample.Value())
+
+}
diff --git a/pkg/tracker/types.go b/pkg/tracker/types.go
new file mode 100644
index 00000000..6309df78
--- /dev/null
+++ b/pkg/tracker/types.go
@@ -0,0 +1,174 @@
+package tracker
+
+import (
+	"errors"
+	"fmt"
+	"strconv"
+	"time"
+
+	jsoniter "github.com/json-iterator/go"
+)
+
+const (
+	MetricTypeMatrix = "matrix"
+	MetricTypeVector = "vector"
+)
+
+type Metadata struct {
+	Metric string `json:"metric,omitempty" description:"metric name"`
+	Type   string `json:"type,omitempty" description:"metric type"`
+	Help   string `json:"help,omitempty" description:"metric description"`
+}
+
+type Metric struct {
+	MetricName string `json:"metric_name,omitempty" description:"metric name, eg. scheduler_up_sum" csv:"metric_name"`
+	MetricData `json:"data,omitempty" description:"actual metric result"`
+	Error      string `json:"error,omitempty" csv:"-"`
+}
+
+type MetricValues []MetricValue
+
+type MetricData struct {
+	MetricType   string `json:"resultType,omitempty" description:"result type, one of matrix, vector" csv:"metric_type"`
+	MetricValues `json:"result,omitempty" description:"metric data including labels, time series and values" csv:"metric_values"`
+}
+
+type DashboardEntity struct {
+	GrafanaDashboardUrl     string `json:"grafanaDashboardUrl,omitempty"`
+	GrafanaDashboardContent string `json:"grafanaDashboardContent,omitempty"`
+	Description             string `json:"description,omitempty"`
+	Namespace               string `json:"namespace,omitempty"`
+}
+
+// The first element is the timestamp, the second is the metric value.
+// eg, [1585658599.195, 0.528]
+type Point [2]float64
+
+type MetricValue struct {
+	Metadata map[string]string `json:"metric,omitempty" description:"time series labels"`
+	// The type of Point is a float64 array with fixed length of 2.
+	// So Point will always be initialized as [0, 0], rather than nil.
+ // To allow empty Sample, we should declare Sample to type *Point + Sample *Point `json:"value,omitempty" description:"time series, values of vector type"` + Series []Point `json:"values,omitempty" description:"time series, values of matrix type"` + ExportSample *ExportPoint `json:"exported_value,omitempty" description:"exported time series, values of vector type"` + ExportedSeries []ExportPoint `json:"exported_values,omitempty" description:"exported time series, values of matrix type"` + + MinValue string `json:"min_value" description:"minimum value from monitor points"` + MaxValue string `json:"max_value" description:"maximum value from monitor points"` + AvgValue string `json:"avg_value" description:"average value from monitor points"` + SumValue string `json:"sum_value" description:"sum value from monitor points"` + Fee string `json:"fee" description:"resource fee"` + ResourceUnit string `json:"resource_unit"` + CurrencyUnit string `json:"currency_unit"` +} + +func (mv *MetricValue) TransferToExportedMetricValue() { + + if mv.Sample != nil { + sample := mv.Sample.transferToExported() + mv.ExportSample = &sample + mv.Sample = nil + } + + for _, item := range mv.Series { + mv.ExportedSeries = append(mv.ExportedSeries, item.transferToExported()) + } + mv.Series = nil + +} + +func (p Point) Timestamp() float64 { + return p[0] +} + +func (p Point) Value() float64 { + return p[1] +} + +func (p Point) transferToExported() ExportPoint { + return ExportPoint{p[0], p[1]} +} + +func (p Point) Add(other Point) Point { + return Point{p[0], p[1] + other[1]} +} + +// MarshalJSON implements json.Marshaler. It will be called when writing JSON to HTTP response +// Inspired by prometheus/client_golang +func (p Point) MarshalJSON() ([]byte, error) { + t, err := jsoniter.Marshal(p.Timestamp()) + if err != nil { + return nil, err + } + v, err := jsoniter.Marshal(strconv.FormatFloat(p.Value(), 'f', -1, 64)) + if err != nil { + return nil, err + } + return []byte(fmt.Sprintf("[%s,%s]", t, v)), nil +} + +// UnmarshalJSON implements json.Unmarshaler. This is for unmarshaling test data. +func (p *Point) UnmarshalJSON(b []byte) error { + var v []interface{} + if err := jsoniter.Unmarshal(b, &v); err != nil { + return err + } + + if v == nil { + return nil + } + + if len(v) != 2 { + return errors.New("unsupported array length") + } + + ts, ok := v[0].(float64) + if !ok { + return errors.New("failed to unmarshal [timestamp]") + } + valstr, ok := v[1].(string) + if !ok { + return errors.New("failed to unmarshal [value]") + } + valf, err := strconv.ParseFloat(valstr, 64) + if err != nil { + return err + } + + p[0] = ts + p[1] = valf + return nil +} + +type CSVPoint struct { + MetricName string `csv:"metric_name"` + Selector string `csv:"selector"` + Time string `csv:"time"` + Value string `csv:"value"` + ResourceUnit string `csv:"unit"` +} + +type ExportPoint [2]float64 + +func (p ExportPoint) Timestamp() string { + return time.Unix(int64(p[0]), 0).Format("2006-01-02 03:04:05 PM") +} + +func (p ExportPoint) Value() float64 { + return p[1] +} + +func (p ExportPoint) Format() string { + return p.Timestamp() + " " + strconv.FormatFloat(p.Value(), 'f', -1, 64) +} + +func (p ExportPoint) TransformToCSVPoint(metricName string, selector string, resourceUnit string) CSVPoint { + return CSVPoint{ + MetricName: metricName, + Selector: selector, + Time: p.Timestamp(), + Value: strconv.FormatFloat(p.Value(), 'f', -1, 64), + ResourceUnit: resourceUnit, + } +}
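
Usage note (illustrative only, not part of the patch): the sketch below shows how coordinator code might consume the new tracker package, assuming a reachable Prometheus endpoint. The address, namespace and pod name are placeholders; the metric name must be one for which pkg/tracker/promql.go defines an expression (the test above uses pod_cpu_resource_limits), and the import path is inferred from the module path used elsewhere in this repository.

	package main

	import (
		"fmt"
		"time"

		"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/tracker"
	)

	func main() {
		// Placeholder address; point this at the cluster's Prometheus service.
		client, err := tracker.NewPrometheus("http://prometheus.example.com:9090")
		if err != nil {
			panic(err)
		}

		// Query the current value of a pod-level metric for one pod.
		// PodOption.NamespacedResourcesFilter (not set here) would further restrict
		// results to "namespace/pod" entries separated by "|", per genMetricFilter.
		metrics := client.GetNamedMetrics(
			[]string{"pod_cpu_resource_limits"},
			time.Now(),
			tracker.PodOption{
				NamespaceName: "monitoring-system",
				PodName:       "prometheus-k8s-0",
			},
		)

		for _, m := range metrics {
			if m.Error != "" {
				fmt.Printf("%s: query failed: %s\n", m.MetricName, m.Error)
				continue
			}
			for _, mv := range m.MetricValues {
				if mv.Sample != nil {
					fmt.Printf("%s %v = %v\n", m.MetricName, mv.Metadata, mv.Sample.Value())
				}
			}
		}
	}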