Task detail monitoring

Former-commit-id: f584527e2f8b081846c6aa88798e50b26bb10328
zhangwei 2023-10-30 10:53:30 +08:00
parent ecd23ba389
commit f52314ea1f
16 changed files with 1222 additions and 50 deletions


@@ -176,11 +176,8 @@ type deleteTaskReq {
 type (
 	scheduleTaskByYamlReq {
 		Name        string     `yaml:"name"`
-		synergy     string     `yaml:"synergy"`
 		Description string     `yaml:"description"`
-		strategy    string     `yaml:"strategy"`
 		tenantId    int64      `yaml:"tenantId"`
 		tasks       []TaskYaml `yaml:"tasks"`
 	}
@@ -260,7 +257,12 @@ type (
 		TaskId int64 `path:"taskId"`
 	}
 	taskDetailResp {
+		CpuCores    float64 `json:"cpuCores"`
+		CpuRate     float64 `json:"cpuRate"`
+		CpuLimit    float64 `json:"cpuLimit"`
+		MemoryTotal float64 `json:"memoryTotal"`
+		MemoryRate  float64 `json:"memoryRate"`
+		MemoryLimit float64 `json:"memoryLimit"`
 	}
 )
 type (


@@ -42,7 +42,7 @@ service pcm {
 	// Task detail endpoint
 	@handler TaskDetailHandler
-	get /core/taskDetail (taskDetailReq) returns (taskDetailResp)
+	get /core/taskDetail/:taskId (taskDetailReq) returns (taskDetailResp)
 	@handler JobTotalHandler
 	get /core/jobTotal returns (jobTotalResp)
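For reference, taskId is now a path segment rather than a request-body field, so a caller addresses one task directly. A minimal client sketch (the base URL and the task id 123 are illustrative placeholders, not values from this commit):

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// GET the detail of a task via the new path-parameter route.
	resp, err := http.Get("http://localhost:8888/core/taskDetail/123")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // cpu/memory usage, limit and rate fields
}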


@@ -1,7 +1,6 @@
 package config

 import (
-	"github.com/zeromicro/go-queue/kq"
 	"github.com/zeromicro/go-zero/core/logx"
 	"github.com/zeromicro/go-zero/core/stores/cache"
 	"github.com/zeromicro/go-zero/core/stores/redis"
@@ -11,21 +10,12 @@ import (
 type Config struct {
 	rest.RestConf
-	KqProducerConf struct {
-		Brokers    []string
-		HpcTopic   string
-		CloudTopic string
-		AiTopic    string
-	}
 	DB struct {
 		DataSource string
 	}
 	Redis             redis.RedisConf
 	Cache             cache.CacheConf
 	LogConf           logx.LogConf
-	HpcConsumerConf   kq.KqConf
-	CloudConsumerConf kq.KqConf
-	AiConsumerConf    kq.KqConf
 	K8sNativeConf     zrpc.RpcClientConf
 	ACRpcConf         zrpc.RpcClientConf
 	THRpcConf         zrpc.RpcClientConf


@@ -1,6 +1,8 @@
 package core

 import (
+	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
+	"gitlink.org.cn/jcce-pcm/utils/result"
 	"net/http"

 	"github.com/zeromicro/go-zero/rest/httpx"
@@ -13,16 +15,22 @@ func ScheduleTaskByYamlHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		var req types.ScheduleTaskByYamlReq
 		if err := httpx.Parse(r, &req); err != nil {
-			httpx.ErrorCtx(r.Context(), w, err)
+			result.HttpResult(r, w, nil, err)
+			return
+		}
+		// parse the uploaded yaml file
+		_, fileHeader, err := r.FormFile("file")
+		if err != nil {
+			result.HttpResult(r, w, nil, err)
+			return
+		}
+		err = utils.Yaml2struct(fileHeader, &req)
+		if err != nil {
+			result.HttpResult(r, w, nil, err)
 			return
 		}
 		l := core.NewScheduleTaskByYamlLogic(r.Context(), svcCtx)
 		resp, err := l.ScheduleTaskByYaml(&req)
-		if err != nil {
-			httpx.ErrorCtx(r.Context(), w, err)
-		} else {
-			httpx.OkJsonCtx(r.Context(), w, resp)
-		}
+		result.HttpResult(r, w, resp, err)
 	}
 }
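The utils.Yaml2struct helper called above lives in pkg/utils and is not part of this diff. A hypothetical minimal version, assuming it simply reads the multipart file header and unmarshals the YAML into the request struct (the function body and the YAML library choice are assumptions, not the commit's implementation):

package utils

import (
	"io"
	"mime/multipart"

	"gopkg.in/yaml.v3"
)

// Yaml2struct (sketch only): open the uploaded file header, read it fully,
// and unmarshal the YAML document into out.
func Yaml2struct(fileHeader *multipart.FileHeader, out interface{}) error {
	f, err := fileHeader.Open()
	if err != nil {
		return err
	}
	defer f.Close()
	data, err := io.ReadAll(f)
	if err != nil {
		return err
	}
	return yaml.Unmarshal(data, out)
}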


@@ -47,7 +47,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
 		},
 		{
 			Method:  http.MethodGet,
-			Path:    "/core/taskDetail",
+			Path:    "/core/taskDetail/:taskId",
 			Handler: core.TaskDetailHandler(serverCtx),
 		},
 		{


@@ -2,9 +2,9 @@ package core
 import (
 	"context"
-	"encoding/json"
 	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/constants"
 	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
+	"k8s.io/apimachinery/pkg/util/json"
 	"time"

 	"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
@@ -28,6 +28,7 @@ func NewScheduleTaskByYamlLogic(ctx context.Context, svcCtx *svc.ServiceContext)
 }
 func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYamlReq) (resp *types.ScheduleTaskByYamlResp, err error) {
+	resp = &types.ScheduleTaskByYamlResp{}
 	bytes, err := json.Marshal(req)
 	if err != nil {
 		return nil, err
@@ -55,7 +56,10 @@ func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYa
 		logx.Error(err)
 		return nil, err
 	}
-	l.svcCtx.RedisClient.Publish(context.Background(), task.TaskType, reqMessage)
+	publish := l.svcCtx.RedisClient.Publish(context.Background(), task.TaskType, reqMessage)
+	if publish.Err() != nil {
+		return nil, publish.Err()
+	}
 	}
 	resp.TaskId = taskModel.Id
 	return resp, nil
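The new error check only covers the publish side; whether the message is acted on depends on a participant subscribing to the same task-type channel. A minimal go-redis v9 subscriber sketch for illustration (the Redis address and the "CLOUD" channel name are placeholders, not taken from this commit):

package main

import (
	"context"
	"fmt"

	"github.com/redis/go-redis/v9"
)

func main() {
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
	// Subscribe on the channel name that Publish uses as task.TaskType, e.g. "CLOUD".
	pubsub := rdb.Subscribe(context.Background(), "CLOUD")
	defer pubsub.Close()
	for msg := range pubsub.Channel() {
		fmt.Println("received scheduled task message:", msg.Payload)
	}
}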


@@ -2,6 +2,10 @@ package core
 import (
 	"context"
+	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
+	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/tracker"
+	"gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes/kubernetesclient"
+	"time"

 	"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
 	"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/types"
@@ -24,7 +28,100 @@ func NewTaskDetailLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskDe
 }
 func (l *TaskDetailLogic) TaskDetail(req *types.TaskDetailReq) (resp *types.TaskDetailResp, err error) {
-	// todo: add your logic here and delete this line
-
-	return
+	resp = &types.TaskDetailResp{}
+	var clouds []models.Cloud
+	l.svcCtx.DbEngin.Where("task_id = ?", req.TaskId).Find(&clouds)
+	for _, cloud := range clouds {
+		// look up the metrics endpoint of this participant
+		var metricsUrl string
+		l.svcCtx.DbEngin.Raw(" SELECT metrics_url FROM `sc_participant_phy_info` WHERE id = ? ", cloud.ParticipantId).Scan(&metricsUrl)
+		var pods []*kubernetesclient.Pod
+		switch cloud.Kind {
+		case "Job":
+			jobResult, err := l.svcCtx.K8sRpc[cloud.ParticipantId].JobDetail(context.Background(), &kubernetesclient.JobDetailReq{
+				Namespace: cloud.Namespace,
+				Name:      cloud.Name,
+			})
+			if err != nil {
+				return nil, err
+			}
+			// list the pods associated with this job
+			uid := jobResult.Job.Metadata.Labels["controller-uid"]
+			LabelSelector := "controller-uid=" + uid
+			podResp, err := l.svcCtx.K8sRpc[cloud.ParticipantId].PodList(context.Background(), &kubernetesclient.PodListReq{
+				ListOptions: &kubernetesclient.ListOptions{
+					LabelSelector: &LabelSelector,
+				},
+			})
+			if err != nil {
+				return nil, err
+			}
+			pods = podResp.PodList.Items
+		case "Deployment":
+			deploymentResult, err := l.svcCtx.K8sRpc[cloud.ParticipantId].DeploymentDetail(context.Background(), &kubernetesclient.DeploymentDetailReq{
+				Namespace: cloud.Namespace,
+				Name:      cloud.Name,
+			})
+			if err != nil {
+				return nil, err
+			}
+			// list the pods associated with this deployment
+			uid := deploymentResult.Deployment.Spec.Selector.MatchLabels["app"]
+			LabelSelector := "app=" + uid
+			podResp, err := l.svcCtx.K8sRpc[cloud.ParticipantId].PodList(context.Background(), &kubernetesclient.PodListReq{
+				ListOptions: &kubernetesclient.ListOptions{
+					LabelSelector: &LabelSelector,
+				},
+			})
+			if err != nil {
+				return nil, err
+			}
+			pods = podResp.PodList.Items
+		}
+		podsMetrics(metricsUrl, pods, resp)
+	}
+	return resp, nil
+}
+
+func podsMetrics(metricsUrl string, pods []*kubernetesclient.Pod, resp *types.TaskDetailResp) {
+	// query resource usage for each pod
+	for _, pod := range pods {
+		prometheusClient, _ := tracker.NewPrometheus(metricsUrl)
+		// cpu resource limit
+		podCpuLimit := prometheusClient.GetNamedMetrics([]string{"pod_cpu_resource_limits"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.CpuLimit = metricAdd(resp.CpuLimit, podCpuLimit)
+		// cpu usage
+		podCpuUsage := prometheusClient.GetNamedMetrics([]string{"pod_cpu_usage"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.CpuCores = metricAdd(resp.CpuCores, podCpuUsage)
+		// memory usage
+		podMemoryUsage := prometheusClient.GetNamedMetrics([]string{"pod_memory_usage"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.MemoryTotal = metricAdd(resp.MemoryTotal, podMemoryUsage)
+		// memory resource limit
+		podMemoryLimit := prometheusClient.GetNamedMetrics([]string{"pod_memory_resource_limits"}, time.Now(), tracker.PodOption{
+			PodName:       *pod.Metadata.Name,
+			NamespaceName: *pod.Metadata.Namespace,
+		})
+		resp.MemoryLimit = metricAdd(resp.MemoryLimit, podMemoryLimit)
+	}
+}
+
+func metricAdd(z float64, metric []tracker.Metric) float64 {
+	if metric[0].MetricValues != nil {
+		z = z + metric[0].MetricValues[0].Sample.Value()
+	}
+	return z
 }
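Note that taskDetailResp also declares CpuRate and MemoryRate, which the logic above never assigns. If they are meant to be utilisation ratios, one plausible follow-up (an assumption, not something this commit does) would run after the cloud loop, before returning resp:

// Assumed derivation, not part of this commit: express usage as a fraction of the limit.
if resp.CpuLimit > 0 {
	resp.CpuRate = resp.CpuCores / resp.CpuLimit
}
if resp.MemoryLimit > 0 {
	resp.MemoryRate = resp.MemoryTotal / resp.MemoryLimit
}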


@@ -158,9 +158,7 @@ type DeleteTaskReq struct {
 type ScheduleTaskByYamlReq struct {
 	Name        string     `yaml:"name"`
-	Synergy     string     `yaml:"synergy"`
 	Description string     `yaml:"description"`
-	Strategy    string     `yaml:"strategy"`
 	TenantId    int64      `yaml:"tenantId"`
 	Tasks       []TaskYaml `yaml:"tasks"`
 }
@@ -236,6 +234,12 @@ type TaskDetailReq struct {
 }

 type TaskDetailResp struct {
+	CpuCores    float64 `json:"cpuCores"`
+	CpuRate     float64 `json:"cpuRate"`
+	CpuLimit    float64 `json:"cpuLimit"`
+	MemoryTotal float64 `json:"memoryTotal"`
+	MemoryRate  float64 `json:"memoryRate"`
+	MemoryLimit float64 `json:"memoryLimit"`
 }

 type ListCenterResp struct {

go.mod

@@ -13,9 +13,11 @@ require (
 	github.com/go-resty/resty/v2 v2.7.0
 	github.com/go-sql-driver/mysql v1.7.1
 	github.com/jinzhu/copier v0.3.5
+	github.com/json-iterator/go v1.1.12
 	github.com/nacos-group/nacos-sdk-go/v2 v2.2.3
 	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.16.0
+	github.com/prometheus/common v0.44.0
 	github.com/redis/go-redis/v9 v9.2.1
 	github.com/robfig/cron/v3 v3.0.1
 	github.com/shopspring/decimal v1.3.1
@@ -23,7 +25,7 @@ require (
 	github.com/zeromicro/go-zero v1.5.5
 	gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31
 	gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835
-	gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef
+	gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a
 	gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1
 	gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231011071802-c6a7637b74e4
 	gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231024105731-cbdceff549c9
@@ -79,7 +81,6 @@ require (
 	github.com/jinzhu/now v1.1.5 // indirect
 	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
-	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/compress v1.15.15 // indirect
 	github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 // indirect
 	github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 // indirect
@@ -96,7 +97,6 @@ require (
 	github.com/pelletier/go-toml/v2 v2.1.0 // indirect
 	github.com/pierrec/lz4/v4 v4.1.17 // indirect
 	github.com/prometheus/client_model v0.4.0 // indirect
-	github.com/prometheus/common v0.44.0 // indirect
 	github.com/prometheus/procfs v0.10.1 // indirect
 	github.com/segmentio/kafka-go v0.4.38 // indirect
 	github.com/spaolacci/murmur3 v1.1.0 // indirect

go.sum

@@ -1037,8 +1037,8 @@ gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 h1
 gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o=
 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 h1:WDCPqD8IrepGJXankkpG14Ny6inh9AldB0RX9WWa+ck=
 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30=
-gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef h1:s7JfXjka2MhGaDjKMJ57fj0k3XuDB6w+UlYHFLyJlUY=
-gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef/go.mod h1:SFpXY1gy1ELYTo4P6EU68nTL2vKu1ZuH7nKecL16hJk=
+gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a h1:12PKPeDecCY7IfPp9JZmdMEIoJn6fF1oT4WW4ZKEUFM=
+gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231027083610-c8a292768f4a/go.mod h1:xtSfvDUd+jJhqHBBibZ1d9/ud3oN9nxaNYYHwz+CDgY=
 gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1 h1:4Ibzcl4waYiHO3tdbqvcLUWEoV51ZaJhZBi7T518AA8=
 gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1/go.mod h1:pisJKAI8FRFFUcBaH3Gob+ENXWRM97rpuYmv9s1raag=
 gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231011071802-c6a7637b74e4 h1:iv78VZ5+j6/VNkEyD/GSmTJ96rpxzpKDUNknAoXsAmg=


@@ -1,25 +1,11 @@
 package tracker

-/*
-Copyright 2020 KubeSphere Authors
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
+import "time"

 type Interface interface {
 	//GetMetric(expr string, time time.Time) Metric
 	//GetMetricOverTime(expr string, start, end time.Time, step time.Duration) Metric
-	//GetNamedMetrics(metrics []string, time time.Time, opt QueryOption) []Metric
+	GetNamedMetrics(metrics []string, time time.Time, opt QueryOption) []Metric
 	//GetNamedMetricsOverTime(metrics []string, start, end time.Time, step time.Duration, opt QueryOption) []Metric
 	//GetMetadata(namespace string) []Metadata
 	//GetMetricLabelSet(expr string, start, end time.Time) []map[string]string

pkg/tracker/promql.go (new file)

@@ -0,0 +1,486 @@
package tracker
import (
"fmt"
"strings"
)
const (
StatefulSet = "StatefulSet"
DaemonSet = "DaemonSet"
Deployment = "Deployment"
)
var promQLTemplates = map[string]string{
//cluster
"cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m",
"cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`,
"cluster_cpu_total": "sum(node:node_num_cpu:sum)",
"cluster_memory_utilisation": ":node_memory_utilisation:",
"cluster_memory_available": "sum(node:node_memory_bytes_available:sum)",
"cluster_memory_total": "sum(node:node_memory_bytes_total:sum)",
"cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)",
"cluster_net_utilisation": ":node_net_utilisation:sum_irate",
"cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)",
"cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)",
"cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)",
"cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)",
"cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)",
"cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)",
"cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`,
"cluster_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
"cluster_disk_inode_total": `sum(node:node_inodes_total:)`,
"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`,
"cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`,
"cluster_namespace_count": `count(kube_namespace_labels)`,
"cluster_pod_count": `cluster:pod:sum`,
"cluster_pod_quota": `sum(max(kube_node_status_capacity{resource="pods"}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
"cluster_pod_utilisation": `cluster:pod_utilization:ratio`,
"cluster_pod_running_count": `cluster:pod_running:count`,
"cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
"cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`,
"cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`,
"cluster_node_offline": `cluster:node_offline:sum`,
"cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`,
"cluster_cronjob_count": `sum(kube_cronjob_labels)`,
"cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`,
"cluster_daemonset_count": `sum(kube_daemonset_labels)`,
"cluster_deployment_count": `sum(kube_deployment_labels)`,
"cluster_endpoint_count": `sum(kube_endpoint_labels)`,
"cluster_hpa_count": `sum(kube_horizontalpodautoscaler_labels)`,
"cluster_job_count": `sum(kube_job_labels)`,
"cluster_statefulset_count": `sum(kube_statefulset_labels)`,
"cluster_replicaset_count": `count(kube_replicaset_labels)`,
"cluster_service_count": `sum(kube_service_info)`,
"cluster_secret_count": `sum(kube_secret_info)`,
"cluster_pv_count": `sum(kube_persistentvolume_labels)`,
"cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`,
"cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
"cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`,
"cluster_node_offline_ratio": `cluster:node_offline:ratio`,
//node
"node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}",
"node_cpu_total": "node:node_num_cpu:sum{$1}",
"node_memory_utilisation": "node:node_memory_utilisation:{$1}",
"node_memory_available": "node:node_memory_bytes_available:sum{$1}",
"node_memory_total": "node:node_memory_bytes_total:sum{$1}",
"node_memory_usage_wo_cache": "node:node_memory_bytes_total:sum{$1} - node:node_memory_bytes_available:sum{$1}",
"node_net_utilisation": "node:node_net_utilisation:sum_irate{$1}",
"node_net_bytes_transmitted": "node:node_net_bytes_transmitted:sum_irate{$1}",
"node_net_bytes_received": "node:node_net_bytes_received:sum_irate{$1}",
"node_disk_read_iops": "node:data_volume_iops_reads:sum{$1}",
"node_disk_write_iops": "node:data_volume_iops_writes:sum{$1}",
"node_disk_read_throughput": "node:data_volume_throughput_bytes_read:sum{$1}",
"node_disk_write_throughput": "node:data_volume_throughput_bytes_written:sum{$1}",
"node_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
"node_disk_size_available": `node:disk_space_available:{$1}`,
"node_disk_size_usage": `sum(max((node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{$1}) by (device, node)) by (node)`,
"node_disk_size_utilisation": `node:disk_space_utilization:ratio{$1}`,
"node_disk_inode_total": `node:node_inodes_total:{$1}`,
"node_disk_inode_usage": `node:node_inodes_total:{$1} - node:node_inodes_free:{$1}`,
"node_disk_inode_utilisation": `node:disk_inode_utilization:ratio{$1}`,
"node_pod_count": `node:pod_count:sum{$1}`,
"node_pod_quota": `max(kube_node_status_capacity{resource="pods",$1}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0)`,
"node_pod_utilisation": `node:pod_utilization:ratio{$1}`,
"node_pod_running_count": `node:pod_running:count{$1}`,
"node_pod_succeeded_count": `node:pod_succeeded:count{$1}`,
"node_pod_abnormal_count": `node:pod_abnormal:count{$1}`,
"node_cpu_usage": `round(node:node_cpu_utilisation:avg1m{$1} * node:node_num_cpu:sum{$1}, 0.001)`,
"node_load1": `node:load1:ratio{$1}`,
"node_load5": `node:load5:ratio{$1}`,
"node_load15": `node:load15:ratio{$1}`,
"node_pod_abnormal_ratio": `node:pod_abnormal:ratio{$1}`,
"node_pleg_quantile": `node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{$1}`,
"node_device_size_usage": `sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`,
"node_device_size_utilisation": `1 - sum by(device, node, host_ip, role) (node_filesystem_avail_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1}) / sum by(device, node, host_ip, role) (node_filesystem_size_bytes{device!~"/dev/loop\\d+",device=~"/dev/.*",job="node-exporter"} * on(namespace, pod) group_left(node, host_ip, role) node_namespace_pod:kube_pod_info:{$1})`,
// workspace
"workspace_cpu_usage": `round(sum by (workspace) (namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}), 0.001)`,
"workspace_memory_usage": `sum by (workspace) (namespace:container_memory_usage_bytes:sum{namespace!="", $1})`,
"workspace_memory_usage_wo_cache": `sum by (workspace) (namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1})`,
"workspace_net_bytes_transmitted": `sum by (workspace) (sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
"workspace_net_bytes_received": `sum by (workspace) (sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
"workspace_pod_count": `sum by (workspace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
"workspace_pod_running_count": `sum by (workspace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
"workspace_pod_succeeded_count": `sum by (workspace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1})) or on(workspace) max by(workspace) (kube_namespace_labels{$1} * 0)`,
"workspace_pod_abnormal_count": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_ingresses_extensions_count": `sum by (workspace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_cronjob_count": `sum by (workspace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_pvc_count": `sum by (workspace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_daemonset_count": `sum by (workspace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_deployment_count": `sum by (workspace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_endpoint_count": `sum by (workspace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_hpa_count": `sum by (workspace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_job_count": `sum by (workspace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_statefulset_count": `sum by (workspace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_replicaset_count": `count by (workspace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_service_count": `sum by (workspace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_secret_count": `sum by (workspace) (kube_secret_info{namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
"workspace_pod_abnormal_ratio": `count by (workspace) ((kube_pod_info{node!=""} unless on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Succeeded"}>0) unless on (pod, namespace) ((kube_pod_status_ready{job="kube-state-metrics", condition="true"}>0) and on (pod, namespace) (kube_pod_status_phase{job="kube-state-metrics", phase="Running"}>0)) unless on (pod, namespace) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", reason="ContainerCreating"}>0)) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) / sum by (workspace) (kube_pod_status_phase{phase!="Succeeded", namespace!=""} * on (namespace) group_left(workspace)(kube_namespace_labels{$1}))`,
//namespace
"namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`,
"namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`,
"namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`,
"namespace_net_bytes_transmitted": `sum by (namespace) (irate(container_network_transmit_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
"namespace_net_bytes_received": `sum by (namespace) (irate(container_network_receive_bytes_total{namespace!="", pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m]) * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
"namespace_pod_count": `sum by (namespace) (kube_pod_status_phase{phase!~"Failed|Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
"namespace_pod_running_count": `sum by (namespace) (kube_pod_status_phase{phase="Running", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
"namespace_pod_succeeded_count": `sum by (namespace) (kube_pod_status_phase{phase="Succeeded", namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1}) or on(namespace) max by(namespace) (kube_namespace_labels{$1} * 0)`,
"namespace_pod_abnormal_count": `namespace:pod_abnormal:count{namespace!="", $1}`,
"namespace_pod_abnormal_ratio": `namespace:pod_abnormal:ratio{namespace!="", $1}`,
"namespace_memory_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.memory"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_cpu_limit_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="limits.cpu"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_pod_count_hard": `min by (namespace) (kube_resourcequota{resourcequota!="quota", type="hard", namespace!="", resource="count/pods"} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_cronjob_count": `sum by (namespace) (kube_cronjob_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_pvc_count": `sum by (namespace) (kube_persistentvolumeclaim_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_daemonset_count": `sum by (namespace) (kube_daemonset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_deployment_count": `sum by (namespace) (kube_deployment_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_endpoint_count": `sum by (namespace) (kube_endpoint_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_hpa_count": `sum by (namespace) (kube_horizontalpodautoscaler_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_job_count": `sum by (namespace) (kube_job_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_statefulset_count": `sum by (namespace) (kube_statefulset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_replicaset_count": `count by (namespace) (kube_replicaset_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_service_count": `sum by (namespace) (kube_service_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_secret_count": `sum by (namespace) (kube_secret_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_configmap_count": `sum by (namespace) (kube_configmap_info{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
"namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
// ingress
"ingress_request_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2}[$3])))`,
"ingress_request_4xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[4].*"}[$3])))`,
"ingress_request_5xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[5].*"}[$3])))`,
"ingress_active_connections": `sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{$2,state="active"}[$3]))`,
"ingress_success_rate": `sum(rate(nginx_ingress_controller_requests{$1,$2,status!~"[4-5].*"}[$3])) / sum(rate(nginx_ingress_controller_requests{$1,$2}[$3]))`,
"ingress_request_duration_average": `sum_over_time(nginx_ingress_controller_request_duration_seconds_sum{$1,$2}[$3])/sum_over_time(nginx_ingress_controller_request_duration_seconds_count{$1,$2}[$3])`,
"ingress_request_duration_50percentage": `histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
"ingress_request_duration_95percentage": `histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
"ingress_request_duration_99percentage": `histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
"ingress_request_volume": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])), 0.001)`,
"ingress_request_volume_by_ingress": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])) by (ingress), 0.001)`,
"ingress_request_network_sent": `sum(irate(nginx_ingress_controller_response_size_sum{$1,$2}[$3]))`,
"ingress_request_network_received": `sum(irate(nginx_ingress_controller_request_size_sum{$1,$2}[$3]))`,
"ingress_request_memory_bytes": `avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{$2})`,
"ingress_request_cpu_usage": `avg(rate(nginx_ingress_controller_nginx_process_cpu_seconds_total{$2}[5m]))`,
// workload
"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
"workload_memory_usage_wo_cache": `namespace:workload_memory_usage_wo_cache:sum{$1}`,
"workload_net_bytes_transmitted": `namespace:workload_net_bytes_transmitted:sum_irate{$1}`,
"workload_net_bytes_received": `namespace:workload_net_bytes_received:sum_irate{$1}`,
"workload_deployment_replica": `label_join(sum (label_join(label_replace(kube_deployment_spec_replicas{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_deployment_replica_available": `label_join(sum (label_join(label_replace(kube_deployment_status_replicas_available{$2}, "owner_kind", "Deployment", "", ""), "workload", "", "deployment")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_statefulset_replica": `label_join(sum (label_join(label_replace(kube_statefulset_replicas{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_statefulset_replica_available": `label_join(sum (label_join(label_replace(kube_statefulset_status_replicas_current{$2}, "owner_kind", "StatefulSet", "", ""), "workload", "", "statefulset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_daemonset_replica": `label_join(sum (label_join(label_replace(kube_daemonset_status_desired_number_scheduled{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_daemonset_replica_available": `label_join(sum (label_join(label_replace(kube_daemonset_status_number_available{$2}, "owner_kind", "DaemonSet", "", ""), "workload", "", "daemonset")) by (namespace, owner_kind, workload), "workload", ":", "owner_kind", "workload")`,
"workload_deployment_unavailable_replicas_ratio": `namespace:deployment_unavailable_replicas:ratio{$1}`,
"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
// pod
"pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
"pod_memory_usage": `sum by (namespace, pod) (container_memory_usage_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_net_bytes_received": `sum by (namespace, pod) (irate(container_network_receive_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_cpu_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="cpu",unit="core"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
"pod_memory_resource_limits": `sum by (namespace, pod) (kube_pod_container_resource_limits{origin_prometheus=~"",resource="memory",unit="byte"}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
// container
"container_cpu_usage": `round(sum by (namespace, pod, container) (irate(container_cpu_usage_seconds_total{job="kubelet", container!="POD", container!="", image!="", $1}[5m])), 0.001)`,
"container_memory_usage": `sum by (namespace, pod, container) (container_memory_usage_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`,
"container_memory_usage_wo_cache": `sum by (namespace, pod, container) (container_memory_working_set_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`,
"container_processes_usage": `sum by (namespace, pod, container) (container_processes{job="kubelet", container!="POD", container!="", image!="", $1})`,
"container_threads_usage": `sum by (namespace, pod, container) (container_threads {job="kubelet", container!="POD", container!="", image!="", $1})`,
// pvc
"pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
"pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
// component
"etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`,
"etcd_server_total": `count(up{job="etcd"})`,
"etcd_server_up_total": `etcd:up:sum`,
"etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`,
"etcd_server_is_leader": `label_replace(etcd_server_is_leader, "node_ip", "$1", "instance", "(.*):.*")`,
"etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`,
"etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`,
"etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`,
"etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`,
"etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`,
"etcd_mvcc_db_size": `avg(etcd:etcd_mvcc_db_total_size:sum)`,
"etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`,
"etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`,
"etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`,
"etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`,
"etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`,
"etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`,
"etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`,
"etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`,
"etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`,
"etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`,
"apiserver_up_sum": `apiserver:up:sum`,
"apiserver_request_rate": `apiserver:apiserver_request_total:sum_irate`,
"apiserver_request_by_verb_rate": `apiserver:apiserver_request_total:sum_verb_irate`,
"apiserver_request_latencies": `apiserver:apiserver_request_duration:avg`,
"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_duration:avg_by_verb`,
"scheduler_up_sum": `scheduler:up:sum`,
"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_duration:avg`,
"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_duration:histogram_quantile`,
}
func makeExpr(metric string, opts QueryOptions) string {
tmpl := promQLTemplates[metric]
switch opts.Level {
case LevelCluster:
return tmpl
case LevelNode:
return makeNodeMetricExpr(tmpl, opts)
case LevelWorkspace:
return makeWorkspaceMetricExpr(tmpl, opts)
case LevelNamespace:
return makeNamespaceMetricExpr(tmpl, opts)
case LevelWorkload:
return makeWorkloadMetricExpr(metric, tmpl, opts)
case LevelPod:
return makePodMetricExpr(tmpl, opts)
case LevelContainer:
return makeContainerMetricExpr(tmpl, opts)
case LevelPVC:
return makePVCMetricExpr(tmpl, opts)
case LevelIngress:
return makeIngressMetricExpr(tmpl, opts)
case LevelComponent:
return tmpl
default:
return tmpl
}
}
func makeNodeMetricExpr(tmpl string, o QueryOptions) string {
var nodeSelector string
if o.NodeName != "" {
nodeSelector = fmt.Sprintf(`node="%s"`, o.NodeName)
} else {
nodeSelector = fmt.Sprintf(`node=~"%s"`, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", nodeSelector, -1)
}
func makeWorkspaceMetricExpr(tmpl string, o QueryOptions) string {
var workspaceSelector string
if o.WorkspaceName != "" {
workspaceSelector = fmt.Sprintf(`workspace="%s"`, o.WorkspaceName)
} else {
workspaceSelector = fmt.Sprintf(`workspace=~"%s", workspace!=""`, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", workspaceSelector, -1)
}
func makeNamespaceMetricExpr(tmpl string, o QueryOptions) string {
var namespaceSelector string
// For monitoring namespaces in the specific workspace
// GET /workspaces/{workspace}/namespaces
if o.WorkspaceName != "" {
namespaceSelector = fmt.Sprintf(`workspace="%s", namespace=~"%s"`, o.WorkspaceName, o.ResourceFilter)
return strings.Replace(tmpl, "$1", namespaceSelector, -1)
}
// For monitoring the specific namespaces
// GET /namespaces/{namespace} or
// GET /namespaces
if o.NamespaceName != "" {
namespaceSelector = fmt.Sprintf(`namespace="%s"`, o.NamespaceName)
} else {
namespaceSelector = fmt.Sprintf(`namespace=~"%s"`, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", namespaceSelector, -1)
}
func makeWorkloadMetricExpr(metric, tmpl string, o QueryOptions) string {
var kindSelector, workloadSelector string
switch o.WorkloadKind {
case "deployment":
o.WorkloadKind = Deployment
case "statefulset":
o.WorkloadKind = StatefulSet
case "daemonset":
o.WorkloadKind = DaemonSet
default:
o.WorkloadKind = ".*"
}
workloadSelector = fmt.Sprintf(`namespace="%s", workload=~"%s:(%s)"`, o.NamespaceName, o.WorkloadKind, o.ResourceFilter)
if strings.Contains(metric, "deployment") {
kindSelector = fmt.Sprintf(`namespace="%s", deployment!="", deployment=~"%s"`, o.NamespaceName, o.ResourceFilter)
}
if strings.Contains(metric, "statefulset") {
kindSelector = fmt.Sprintf(`namespace="%s", statefulset!="", statefulset=~"%s"`, o.NamespaceName, o.ResourceFilter)
}
if strings.Contains(metric, "daemonset") {
kindSelector = fmt.Sprintf(`namespace="%s", daemonset!="", daemonset=~"%s"`, o.NamespaceName, o.ResourceFilter)
}
return strings.NewReplacer("$1", workloadSelector, "$2", kindSelector).Replace(tmpl)
}
func makePodMetricExpr(tmpl string, o QueryOptions) string {
var podSelector, workloadSelector string
// For monitoring pods of the specific workload
// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods
if o.WorkloadName != "" {
switch o.WorkloadKind {
case "deployment":
workloadSelector = fmt.Sprintf(`owner_kind="ReplicaSet", owner_name=~"^%s-[^-]{1,10}$"`, o.WorkloadName)
case "statefulset":
workloadSelector = fmt.Sprintf(`owner_kind="StatefulSet", owner_name="%s"`, o.WorkloadName)
case "daemonset":
workloadSelector = fmt.Sprintf(`owner_kind="DaemonSet", owner_name="%s"`, o.WorkloadName)
}
}
// For monitoring pods in the specific namespace
// GET /namespaces/{namespace}/workloads/{kind}/{workload}/pods or
// GET /namespaces/{namespace}/pods/{pod} or
// GET /namespaces/{namespace}/pods
if o.NamespaceName != "" {
if o.PodName != "" {
podSelector = fmt.Sprintf(`pod="%s", namespace="%s"`, o.PodName, o.NamespaceName)
} else {
podSelector = fmt.Sprintf(`pod=~"%s", namespace="%s"`, o.ResourceFilter, o.NamespaceName)
}
} else {
var namespaces, pods []string
if o.NamespacedResourcesFilter != "" {
for _, np := range strings.Split(o.NamespacedResourcesFilter, "|") {
if nparr := strings.SplitN(np, "/", 2); len(nparr) > 1 {
namespaces = append(namespaces, nparr[0])
pods = append(pods, nparr[1])
} else {
pods = append(pods, np)
}
}
}
// For monitoring pods on the specific node
// GET /nodes/{node}/pods/{pod}
// GET /nodes/{node}/pods
if o.NodeName != "" {
if o.PodName != "" {
if nparr := strings.SplitN(o.PodName, "/", 2); len(nparr) > 1 {
podSelector = fmt.Sprintf(`namespace="%s",pod="%s", node="%s"`, nparr[0], nparr[1], o.NodeName)
} else {
podSelector = fmt.Sprintf(`pod="%s", node="%s"`, o.PodName, o.NodeName)
}
} else {
var ps []string
ps = append(ps, fmt.Sprintf(`node="%s"`, o.NodeName))
if o.ResourceFilter != "" {
ps = append(ps, fmt.Sprintf(`pod=~"%s"`, o.ResourceFilter))
}
if len(namespaces) > 0 {
ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|")))
}
if len(pods) > 0 {
ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|")))
}
podSelector = strings.Join(ps, ",")
}
} else {
// For monitoring pods in the whole cluster
// Get /pods
var ps []string
if len(namespaces) > 0 {
ps = append(ps, fmt.Sprintf(`namespace=~"%s"`, strings.Join(namespaces, "|")))
}
if len(pods) > 0 {
ps = append(ps, fmt.Sprintf(`pod=~"%s"`, strings.Join(pods, "|")))
}
if len(ps) > 0 {
podSelector = strings.Join(ps, ",")
}
}
}
return strings.NewReplacer("$1", workloadSelector, "$2", podSelector).Replace(tmpl)
}
func makeContainerMetricExpr(tmpl string, o QueryOptions) string {
var containerSelector string
if o.ContainerName != "" {
containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container="%s"`, o.PodName, o.NamespaceName, o.ContainerName)
} else {
containerSelector = fmt.Sprintf(`pod="%s", namespace="%s", container=~"%s"`, o.PodName, o.NamespaceName, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", containerSelector, -1)
}
func makePVCMetricExpr(tmpl string, o QueryOptions) string {
var pvcSelector string
// For monitoring persistentvolumeclaims in the specific namespace
// GET /namespaces/{namespace}/persistentvolumeclaims/{persistentvolumeclaim} or
// GET /namespaces/{namespace}/persistentvolumeclaims
if o.NamespaceName != "" {
if o.PersistentVolumeClaimName != "" {
pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim="%s"`, o.NamespaceName, o.PersistentVolumeClaimName)
} else {
pvcSelector = fmt.Sprintf(`namespace="%s", persistentvolumeclaim=~"%s"`, o.NamespaceName, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", pvcSelector, -1)
}
// For monitoring persistentvolumeclaims of the specific storageclass
// GET /storageclasses/{storageclass}/persistentvolumeclaims
if o.StorageClassName != "" {
pvcSelector = fmt.Sprintf(`storageclass="%s", persistentvolumeclaim=~"%s"`, o.StorageClassName, o.ResourceFilter)
}
return strings.Replace(tmpl, "$1", pvcSelector, -1)
}
func makeIngressMetricExpr(tmpl string, o QueryOptions) string {
var ingressSelector string
var jobSelector string
duration := "5m"
// parse Range Vector Selectors metric{key=value}[duration]
if o.Duration != nil {
duration = o.Duration.String()
}
// job is a required filter
// GET /namespaces/{namespace}/ingress?job=xxx&pod=xxx
if o.Job != "" {
jobSelector = fmt.Sprintf(`job="%s"`, o.Job)
if o.PodName != "" {
jobSelector = fmt.Sprintf(`%s,controller_pod="%s"`, jobSelector, o.PodName)
}
}
tmpl = strings.Replace(tmpl, "$1", ingressSelector, -1)
tmpl = strings.Replace(tmpl, "$2", jobSelector, -1)
return strings.Replace(tmpl, "$3", duration, -1)
}
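To show how these templates and selector builders fit together, here is a small usage sketch. It would have to live inside the tracker package, since makeExpr is unexported; the pod and namespace names are placeholders:

// Sketch only: expand the pod_cpu_usage template for a single pod.
func exampleExpandPodCpuUsage() string {
	opts := NewQueryOptions()
	PodOption{PodName: "demo-pod", NamespaceName: "demo-ns"}.Apply(opts)
	// With only PodName and NamespaceName set, makePodMetricExpr leaves the
	// "$1" owner selector empty and fills "$2" with pod/namespace matchers,
	// so the result contains kube_pod_info{pod="demo-pod", namespace="demo-ns"}.
	return makeExpr("pod_cpu_usage", *opts)
}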

pkg/tracker/queryoptions.go (new file)

@@ -0,0 +1,318 @@
package tracker
import (
"fmt"
"strings"
"time"
)
type Level int
const (
LevelCluster = 1 << iota
LevelNode
LevelWorkspace
LevelNamespace
LevelApplication
LevelOpenpitrix
LevelWorkload
LevelService
LevelPod
LevelContainer
LevelPVC
LevelComponent
LevelIngress
)
var MeteringLevelMap = map[string]int{
"LevelCluster": LevelCluster,
"LevelNode": LevelNode,
"LevelWorkspace": LevelWorkspace,
"LevelNamespace": LevelNamespace,
"LevelApplication": LevelApplication,
"LevelWorkload": LevelWorkload,
"LevelService": LevelService,
"LevelPod": LevelPod,
"LevelContainer": LevelContainer,
"LevelPVC": LevelPVC,
"LevelComponent": LevelComponent,
}
type QueryOption interface {
Apply(*QueryOptions)
}
type Meteroptions struct {
Start time.Time
End time.Time
Step time.Duration
}
type QueryOptions struct {
Level Level
NamespacedResourcesFilter string
QueryType string
ResourceFilter string
NodeName string
WorkspaceName string
NamespaceName string
WorkloadKind string
WorkloadName string
PodName string
ContainerName string
StorageClassName string
PersistentVolumeClaimName string
PVCFilter string
ApplicationName string
ServiceName string
Ingress string
Job string
Duration *time.Duration
MeterOptions *Meteroptions
}
func NewQueryOptions() *QueryOptions {
return &QueryOptions{}
}
type ClusterOption struct{}
func (_ ClusterOption) Apply(o *QueryOptions) {
o.Level = LevelCluster
}
type NodeOption struct {
ResourceFilter string
NodeName string
PVCFilter string
StorageClassName string
QueryType string
}
func (no NodeOption) Apply(o *QueryOptions) {
o.Level = LevelNode
o.ResourceFilter = no.ResourceFilter
o.NodeName = no.NodeName
o.PVCFilter = no.PVCFilter
o.StorageClassName = no.StorageClassName
o.QueryType = no.QueryType
}
type WorkspaceOption struct {
ResourceFilter string
WorkspaceName string
PVCFilter string
StorageClassName string
}
func (wo WorkspaceOption) Apply(o *QueryOptions) {
o.Level = LevelWorkspace
o.ResourceFilter = wo.ResourceFilter
o.WorkspaceName = wo.WorkspaceName
o.PVCFilter = wo.PVCFilter
o.StorageClassName = wo.StorageClassName
}
type NamespaceOption struct {
ResourceFilter string
WorkspaceName string
NamespaceName string
PVCFilter string
StorageClassName string
}
func (no NamespaceOption) Apply(o *QueryOptions) {
o.Level = LevelNamespace
o.ResourceFilter = no.ResourceFilter
o.WorkspaceName = no.WorkspaceName
o.NamespaceName = no.NamespaceName
o.PVCFilter = no.PVCFilter
o.StorageClassName = no.StorageClassName
}
type ApplicationsOption struct {
NamespaceName string
Applications []string
StorageClassName string
}
func (aso ApplicationsOption) Apply(o *QueryOptions) {
// nothing should be done
//nolint:gosimple
return
}
type OpenpitrixsOption struct {
Cluster string
NamespaceName string
Openpitrixs []string
StorageClassName string
}
func (oso OpenpitrixsOption) Apply(o *QueryOptions) {
// nothing should be done
//nolint:gosimple
return
}
// ApplicationsOption & OpenpitrixsOption share the same ApplicationOption struct
type ApplicationOption struct {
NamespaceName string
Application string
ApplicationComponents []string
StorageClassName string
}
func (ao ApplicationOption) Apply(o *QueryOptions) {
o.Level = LevelApplication
o.NamespaceName = ao.NamespaceName
o.ApplicationName = ao.Application
o.StorageClassName = ao.StorageClassName
app_components := strings.Join(ao.ApplicationComponents[:], "|")
if len(app_components) > 0 {
o.ResourceFilter = fmt.Sprintf(`namespace="%s", workload=~"%s"`, o.NamespaceName, app_components)
} else {
o.ResourceFilter = fmt.Sprintf(`namespace="%s", workload=~"%s"`, o.NamespaceName, ".*")
}
}
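// For example (illustrative values), ApplicationOption{NamespaceName: "demo",
// Application: "shop", ApplicationComponents: []string{"web", "db"}} produces
// o.ResourceFilter == `namespace="demo", workload=~"web|db"`; with no components
// the workload regex falls back to ".*".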
type WorkloadOption struct {
ResourceFilter string
NamespaceName string
WorkloadKind string
}
func (wo WorkloadOption) Apply(o *QueryOptions) {
o.Level = LevelWorkload
o.ResourceFilter = wo.ResourceFilter
o.NamespaceName = wo.NamespaceName
o.WorkloadKind = wo.WorkloadKind
}
type ServicesOption struct {
NamespaceName string
Services []string
}
func (sso ServicesOption) Apply(o *QueryOptions) {
// nothing should be done
//nolint:gosimple
return
}
type ServiceOption struct {
ResourceFilter string
NamespaceName string
ServiceName string
PodNames []string
}
func (so ServiceOption) Apply(o *QueryOptions) {
o.Level = LevelService
o.NamespaceName = so.NamespaceName
o.ServiceName = so.ServiceName
pod_names := strings.Join(so.PodNames, "|")
if len(pod_names) > 0 {
o.ResourceFilter = fmt.Sprintf(`pod=~"%s", namespace="%s"`, pod_names, o.NamespaceName)
} else {
o.ResourceFilter = fmt.Sprintf(`pod=~"%s", namespace="%s"`, ".*", o.NamespaceName)
}
}
type PodOption struct {
NamespacedResourcesFilter string
ResourceFilter string
NodeName string
NamespaceName string
WorkloadKind string
WorkloadName string
PodName string
}
func (po PodOption) Apply(o *QueryOptions) {
o.Level = LevelPod
o.NamespacedResourcesFilter = po.NamespacedResourcesFilter
o.ResourceFilter = po.ResourceFilter
o.NodeName = po.NodeName
o.NamespaceName = po.NamespaceName
o.WorkloadKind = po.WorkloadKind
o.WorkloadName = po.WorkloadName
o.PodName = po.PodName
}
type ContainerOption struct {
ResourceFilter string
NamespaceName string
PodName string
ContainerName string
}
func (co ContainerOption) Apply(o *QueryOptions) {
o.Level = LevelContainer
o.ResourceFilter = co.ResourceFilter
o.NamespaceName = co.NamespaceName
o.PodName = co.PodName
o.ContainerName = co.ContainerName
}
type PVCOption struct {
ResourceFilter string
NamespaceName string
StorageClassName string
PersistentVolumeClaimName string
}
func (po PVCOption) Apply(o *QueryOptions) {
o.Level = LevelPVC
o.ResourceFilter = po.ResourceFilter
o.NamespaceName = po.NamespaceName
o.StorageClassName = po.StorageClassName
o.PersistentVolumeClaimName = po.PersistentVolumeClaimName
// for meter
o.PVCFilter = po.PersistentVolumeClaimName
}
type IngressOption struct {
ResourceFilter string
NamespaceName string
Ingress string
Job string
Pod string
Duration *time.Duration
}
func (no IngressOption) Apply(o *QueryOptions) {
o.Level = LevelIngress
o.ResourceFilter = no.ResourceFilter
o.NamespaceName = no.NamespaceName
o.Ingress = no.Ingress
o.Job = no.Job
o.PodName = no.Pod
o.Duration = no.Duration
}
type ComponentOption struct{}
func (_ ComponentOption) Apply(o *QueryOptions) {
o.Level = LevelComponent
}
type MeterOption struct {
Start time.Time
End time.Time
Step time.Duration
}
func (mo MeterOption) Apply(o *QueryOptions) {
o.MeterOptions = &Meteroptions{
Start: mo.Start,
End: mo.End,
Step: mo.Step,
}
}
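// Minimal sketch of a metering query setup (start/end are assumed time.Time values):
// MeterOption is combined with a scope option so the window and step accompany the
// usual level filters.
//
//	opts := NewQueryOptions()
//	NamespaceOption{NamespaceName: "demo"}.Apply(opts)
//	MeterOption{Start: start, End: end, Step: time.Hour}.Apply(opts)
//	// opts.MeterOptions now describes an hourly series between start and end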

View File

@@ -1,14 +1,20 @@
package tracker
import (
"context"
"github.com/prometheus/client_golang/api"
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"strings"
"sync"
"time"
)
type prometheus struct {
client v1.API
}
// NewPrometheus initializes a Prometheus API client
func NewPrometheus(address string) (Interface, error) {
cfg := api.Config{
Address: address,
@@ -17,3 +23,83 @@ func NewPrometheus(address string) (Interface, error) {
client, err := api.NewClient(cfg)
return prometheus{client: v1.NewAPI(client)}, err
}
func (p prometheus) GetNamedMetrics(metrics []string, ts time.Time, o QueryOption) []Metric {
var res []Metric
var mtx sync.Mutex
var wg sync.WaitGroup
opts := NewQueryOptions()
o.Apply(opts)
for _, metric := range metrics {
wg.Add(1)
go func(metric string) {
defer wg.Done()
parsedResp := Metric{MetricName: metric}
value, _, err := p.client.Query(context.Background(), makeExpr(metric, *opts), ts)
if err != nil {
parsedResp.Error = err.Error()
} else {
parsedResp.MetricData = parseQueryResp(value, genMetricFilter(o))
}
mtx.Lock()
res = append(res, parsedResp)
mtx.Unlock()
}(metric)
}
wg.Wait()
return res
}
func parseQueryResp(value model.Value, metricFilter func(metric model.Metric) bool) MetricData {
res := MetricData{MetricType: MetricTypeVector}
data, _ := value.(model.Vector)
for _, v := range data {
if metricFilter != nil && !metricFilter(v.Metric) {
continue
}
mv := MetricValue{
Metadata: make(map[string]string),
}
for k, v := range v.Metric {
mv.Metadata[string(k)] = string(v)
}
mv.Sample = &Point{float64(v.Timestamp) / 1000, float64(v.Value)}
res.MetricValues = append(res.MetricValues, mv)
}
return res
}
func genMetricFilter(o QueryOption) func(metric model.Metric) bool {
if o != nil {
if po, ok := o.(PodOption); ok {
if po.NamespacedResourcesFilter != "" {
namespacedPodsMap := make(map[string]struct{})
for _, s := range strings.Split(po.NamespacedResourcesFilter, "|") {
namespacedPodsMap[s] = struct{}{}
}
return func(metric model.Metric) bool {
if len(metric) == 0 {
return false
}
_, ok := namespacedPodsMap[string(metric["namespace"])+"/"+string(metric["pod"])]
return ok
}
}
}
}
return func(metric model.Metric) bool {
return true
}
}
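// Note on the expected filter format (inferred from the code above):
// NamespacedResourcesFilter is a "|"-separated list of "namespace/pod" keys, e.g.
//
//	PodOption{NamespacedResourcesFilter: "monitoring-system/prometheus-k8s-0|kube-system/coredns-abc"}
//
// The returned filter then keeps only samples whose namespace/pod labels match one
// of those keys; any other option, or an empty filter, keeps every sample.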

View File

@@ -0,0 +1,17 @@
package tracker
import (
"testing"
"time"
)
func TestGetNamedMetrics(t *testing.T) {
client, err := NewPrometheus("http://10.101.15.3:32585")
if err != nil {
t.Fatalf("failed to create prometheus client: %v", err)
}
result := client.GetNamedMetrics([]string{"pod_cpu_resource_limits"}, time.Now(), PodOption{
PodName: "prometheus-k8s-0",
NamespaceName: "monitoring-system",
})
if len(result) == 0 || len(result[0].MetricValues) == 0 {
t.Fatal("no metric values returned")
}
t.Log("pod_cpu_resource_limits:", result[0].MetricValues[0].Sample.Value())
}

pkg/tracker/types.go Normal file
View File

@@ -0,0 +1,174 @@
package tracker
import (
"errors"
"fmt"
"strconv"
"time"
jsoniter "github.com/json-iterator/go"
)
const (
MetricTypeMatrix = "matrix"
MetricTypeVector = "vector"
)
type Metadata struct {
Metric string `json:"metric,omitempty" description:"metric name"`
Type string `json:"type,omitempty" description:"metric type"`
Help string `json:"help,omitempty" description:"metric description"`
}
type Metric struct {
MetricName string `json:"metric_name,omitempty" description:"metric name, eg. scheduler_up_sum" csv:"metric_name"`
MetricData `json:"data,omitempty" description:"actual metric result"`
Error string `json:"error,omitempty" csv:"-"`
}
type MetricValues []MetricValue
type MetricData struct {
MetricType string `json:"resultType,omitempty" description:"result type, one of matrix, vector" csv:"metric_type"`
MetricValues `json:"result,omitempty" description:"metric data including labels, time series and values" csv:"metric_values"`
}
type DashboardEntity struct {
GrafanaDashboardUrl string `json:"grafanaDashboardUrl,omitempty"`
GrafanaDashboardContent string `json:"grafanaDashboardContent,omitempty"`
Description string `json:"description,omitempty"`
Namespace string `json:"namespace,omitempty"`
}
// The first element is the timestamp, the second is the metric value.
// e.g. [1585658599.195, 0.528]
type Point [2]float64
type MetricValue struct {
Metadata map[string]string `json:"metric,omitempty" description:"time series labels"`
// The type of Point is a float64 array with a fixed length of 2,
// so a Point value is always initialized as [0, 0] rather than nil.
// To allow an empty Sample, Sample is declared as a *Point.
Sample *Point `json:"value,omitempty" description:"time series, values of vector type"`
Series []Point `json:"values,omitempty" description:"time series, values of matrix type"`
ExportSample *ExportPoint `json:"exported_value,omitempty" description:"exported time series, values of vector type"`
ExportedSeries []ExportPoint `json:"exported_values,omitempty" description:"exported time series, values of matrix type"`
MinValue string `json:"min_value" description:"minimum value from monitor points"`
MaxValue string `json:"max_value" description:"maximum value from monitor points"`
AvgValue string `json:"avg_value" description:"average value from monitor points"`
SumValue string `json:"sum_value" description:"sum value from monitor points"`
Fee string `json:"fee" description:"resource fee"`
ResourceUnit string `json:"resource_unit"`
CurrencyUnit string `json:"currency_unit"`
}
func (mv *MetricValue) TransferToExportedMetricValue() {
if mv.Sample != nil {
sample := mv.Sample.transferToExported()
mv.ExportSample = &sample
mv.Sample = nil
}
for _, item := range mv.Series {
mv.ExportedSeries = append(mv.ExportedSeries, item.transferToExported())
}
mv.Series = nil
}
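// Illustrative sketch: after TransferToExportedMetricValue the raw Sample/Series
// fields are cleared and only the exported variants remain, e.g.
//
//	mv := MetricValue{Sample: &Point{1585658599, 0.5}}
//	mv.TransferToExportedMetricValue()
//	// mv.Sample == nil, mv.ExportSample points at ExportPoint{1585658599, 0.5}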
func (p Point) Timestamp() float64 {
return p[0]
}
func (p Point) Value() float64 {
return p[1]
}
func (p Point) transferToExported() ExportPoint {
return ExportPoint{p[0], p[1]}
}
func (p Point) Add(other Point) Point {
return Point{p[0], p[1] + other[1]}
}
// MarshalJSON implements json.Marshaler. It is called when writing JSON to an HTTP response.
// Inspired by prometheus/client_golang
func (p Point) MarshalJSON() ([]byte, error) {
t, err := jsoniter.Marshal(p.Timestamp())
if err != nil {
return nil, err
}
v, err := jsoniter.Marshal(strconv.FormatFloat(p.Value(), 'f', -1, 64))
if err != nil {
return nil, err
}
return []byte(fmt.Sprintf("[%s,%s]", t, v)), nil
}
// UnmarshalJSON implements json.Unmarshaler. This is for unmarshaling test data.
func (p *Point) UnmarshalJSON(b []byte) error {
var v []interface{}
if err := jsoniter.Unmarshal(b, &v); err != nil {
return err
}
if v == nil {
return nil
}
if len(v) != 2 {
return errors.New("unsupported array length")
}
ts, ok := v[0].(float64)
if !ok {
return errors.New("failed to unmarshal [timestamp]")
}
valstr, ok := v[1].(string)
if !ok {
return errors.New("failed to unmarshal [value]")
}
valf, err := strconv.ParseFloat(valstr, 64)
if err != nil {
return err
}
p[0] = ts
p[1] = valf
return nil
}
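// Round-trip sketch of the wire format the two methods above agree on: the
// timestamp is a JSON number and the value is a quoted string, matching the
// Prometheus HTTP API convention.
//
//	b, _ := jsoniter.Marshal(Point{1585658599.195, 0.528})
//	// b == []byte(`[1585658599.195,"0.528"]`)
//	var p Point
//	_ = jsoniter.Unmarshal(b, &p)
//	// p == Point{1585658599.195, 0.528}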
type CSVPoint struct {
MetricName string `csv:"metric_name"`
Selector string `csv:"selector"`
Time string `csv:"time"`
Value string `csv:"value"`
ResourceUnit string `csv:"unit"`
}
type ExportPoint [2]float64
func (p ExportPoint) Timestamp() string {
return time.Unix(int64(p[0]), 0).Format("2006-01-02 03:04:05 PM")
}
func (p ExportPoint) Value() float64 {
return p[1]
}
func (p ExportPoint) Format() string {
return p.Timestamp() + " " + strconv.FormatFloat(p.Value(), 'f', -1, 64)
}
func (p ExportPoint) TransformToCSVPoint(metricName string, selector string, resourceUnit string) CSVPoint {
return CSVPoint{
MetricName: metricName,
Selector: selector,
Time: p.Timestamp(),
Value: strconv.FormatFloat(p.Value(), 'f', -1, 64),
ResourceUnit: resourceUnit,
}
}
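// Illustrative use of the export helpers (metric name and selector are hypothetical):
//
//	ep := ExportPoint{1585658599, 0.528}
//	row := ep.TransformToCSVPoint("pod_cpu_usage", `pod="nginx-0"`, "core")
//	// row.Time is the human-readable local timestamp, row.Value == "0.528"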