diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api
index 4c25b15c..9ac87a46 100644
--- a/api/desc/core/pcm-core.api
+++ b/api/desc/core/pcm-core.api
@@ -40,7 +40,6 @@ type remoteResp {
 
 type (
 	clustersLoadReq {
-		adapterId   int64  `form:"adapterId"`
 		clusterName string `form:"clusterName"`
 	}
 	clustersLoadResp {
diff --git a/api/desc/monitoring/pcm-monitoring.api b/api/desc/monitoring/pcm-monitoring.api
index edf95b4a..e09efd66 100644
--- a/api/desc/monitoring/pcm-monitoring.api
+++ b/api/desc/monitoring/pcm-monitoring.api
@@ -1,12 +1,34 @@
 syntax = "v1"
 
 type CreateAlertRuleReq {
+	ClusterId   int64  `json:"clusterId"`
 	ClusterName string `json:"clusterName"`
-	Namespace   string `json:"namespace"`
 	Name        string `json:"name"`
-	PromQL      string `json:"PromQL"`
+	PromQL      string `json:"promQL"`
 	Duration    string `json:"duration"`
-	Labels      map[string]string `json:"labels"`
-	Annotations map[string]string `json:"annotations"`
+	Annotations string `json:"annotations,optional"`
 	AlertLevel  string `json:"alertLevel"`
-}
\ No newline at end of file
+	AlertType   string `json:"alertType"`
+}
+
+type (
+	alertRulesResp {
+		Id          int64  `json:"id"`
+		ClusterName string `json:"clusterName"`
+		Name        string `json:"name"`
+		PromQL      string `json:"promQL"`
+		Duration    string `json:"duration"`
+		Annotations string `json:"annotations"`
+		AlertLevel  string `json:"alertLevel"`
+	}
+)
+
+type (
+	nodesLoadTopReq {
+		ClusterName string `form:"clusterName"`
+		Metrics     string `form:"metrics"`
+	}
+	nodesLoadTopResp {
+		data interface{} `json:"data"`
+	}
+)
\ No newline at end of file
diff --git a/api/desc/pcm.api b/api/desc/pcm.api
index 24fd3043..7fefaa36 100644
--- a/api/desc/pcm.api
+++ b/api/desc/pcm.api
@@ -948,9 +948,13 @@ service pcm {
 
 	@doc "alert rules"
 	@handler alertRulesHandler
-	get /monitoring/alert/rules
+	get /monitoring/alert/rule returns (alertRulesResp)
 
 	@doc "cluster resource load"
 	@handler clustersLoadHandler
 	get /monitoring/cluster/load (clustersLoadReq) returns (clustersLoadResp)
+
+	@doc "node resource load"
+	@handler nodesLoadTopHandler
+	get /monitoring/node/top (nodesLoadTopReq) returns (nodesLoadTopResp)
 }
\ No newline at end of file
diff --git a/api/internal/handler/cloud/clusterinfohandler.go b/api/internal/handler/cloud/clusterinfohandler.go
index 50083770..51522ea4 100644
--- a/api/internal/handler/cloud/clusterinfohandler.go
+++ b/api/internal/handler/cloud/clusterinfohandler.go
@@ -1,6 +1,7 @@
 package cloud
 
 import (
+	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
 	"net/http"
 
 	"github.com/zeromicro/go-zero/rest/httpx"
@@ -19,10 +20,6 @@ func ClusterInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
 
 		l := cloud.NewClusterInfoLogic(r.Context(), svcCtx)
 		resp, err := l.ClusterInfo(&req)
-		if err != nil {
-			httpx.ErrorCtx(r.Context(), w, err)
-		} else {
-			httpx.OkJsonCtx(r.Context(), w, resp)
-		}
+		result.HttpResult(r, w, resp, err)
 	}
 }
diff --git a/api/internal/handler/monitoring/alertruleshandler.go b/api/internal/handler/monitoring/alertruleshandler.go
index b058e73e..28cea567 100644
--- a/api/internal/handler/monitoring/alertruleshandler.go
+++ b/api/internal/handler/monitoring/alertruleshandler.go
@@ -1,9 +1,9 @@
 package monitoring
 
 import (
 	"net/http"
 
-	"github.com/zeromicro/go-zero/rest/httpx"
 	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
 	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
 )
@@ -11,11 +11,7 @@ import (
 func AlertRulesHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
 	return func(w http.ResponseWriter, r *http.Request) {
 		l := monitoring.NewAlertRulesLogic(r.Context(), svcCtx)
-		err := l.AlertRules()
-		if err != nil {
-			httpx.ErrorCtx(r.Context(), w, err)
-		} else {
-			httpx.Ok(w)
-		}
+		resp, err := l.AlertRules()
+		result.HttpResult(r, w, resp, err)
 	}
 }
diff --git a/api/internal/handler/monitoring/createalertrulehandler.go b/api/internal/handler/monitoring/createalertrulehandler.go
index 444f60dd..292ca021 100644
--- a/api/internal/handler/monitoring/createalertrulehandler.go
+++ b/api/internal/handler/monitoring/createalertrulehandler.go
@@ -1,6 +1,7 @@
 package monitoring
 
 import (
+	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
 	"net/http"
 
 	"github.com/zeromicro/go-zero/rest/httpx"
@@ -19,10 +20,6 @@ func CreateAlertRuleHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
 
 		l := monitoring.NewCreateAlertRuleLogic(r.Context(), svcCtx)
 		err := l.CreateAlertRule(&req)
-		if err != nil {
-			httpx.ErrorCtx(r.Context(), w, err)
-		} else {
-			httpx.Ok(w)
-		}
+		result.HttpResult(r, w, nil, err)
 	}
 }
diff --git a/api/internal/handler/monitoring/nodesloadtophandler.go b/api/internal/handler/monitoring/nodesloadtophandler.go
new file mode 100644
index 00000000..4c0e5925
--- /dev/null
+++ b/api/internal/handler/monitoring/nodesloadtophandler.go
@@ -0,0 +1,30 @@
+package monitoring
+
+import (
+	"net/http"
+
+	"github.com/zeromicro/go-zero/rest/httpx"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
+)
+
+// NodesLoadTopHandler parses the request into a NodesLoadTopReq, runs
+// NodesLoadTop and writes either the JSON response or the error.
+func NodesLoadTopHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		var req types.NodesLoadTopReq
+		if err := httpx.Parse(r, &req); err != nil {
+			httpx.ErrorCtx(r.Context(), w, err)
+			return
+		}
+
+		l := monitoring.NewNodesLoadTopLogic(r.Context(), svcCtx)
+		resp, err := l.NodesLoadTop(&req)
+		if err != nil {
+			httpx.ErrorCtx(r.Context(), w, err)
+			return
+		}
+		httpx.OkJsonCtx(r.Context(), w, resp)
+	}
+}
diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go
index 45e701c2..9235c609 100644
--- a/api/internal/handler/routes.go
+++ b/api/internal/handler/routes.go
@@ -1189,7 +1189,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
 			},
 			{
 				Method:  http.MethodGet,
-				Path:    "/monitoring/alert/rules",
+				Path:    "/monitoring/alert/rule",
 				Handler: monitoring.AlertRulesHandler(serverCtx),
 			},
 			{
@@ -1197,6 +1197,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
 				Path:    "/monitoring/cluster/load",
 				Handler: monitoring.ClustersLoadHandler(serverCtx),
 			},
+			{
+				Method:  http.MethodGet,
+				Path:    "/monitoring/node/top",
+				Handler: monitoring.NodesLoadTopHandler(serverCtx),
+			},
 		},
 		rest.WithPrefix("/pcm/v1"),
 	)
diff --git a/api/internal/logic/monitoring/alertruleslogic.go b/api/internal/logic/monitoring/alertruleslogic.go
index 3014853b..93d5500c 100644
--- a/api/internal/logic/monitoring/alertruleslogic.go
+++ b/api/internal/logic/monitoring/alertruleslogic.go
@@ -2,10 +2,11 @@ package monitoring
 
 import (
 	"context"
-	"github.com/prometheus/alertmanager/api/v2/client/alert"
+
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
 
 	"github.com/zeromicro/go-zero/core/logx"
-	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
 )
 
 type AlertRulesLogic struct {
@@ -22,13 +23,16 @@ func NewAlertRulesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AlertR
 	}
 }
 
-func (l *AlertRulesLogic) AlertRules() error {
-	// todo: add your logic here and delete this line
-	alerts, err := l.svcCtx.AlertClient.Alert.GetAlerts(&alert.GetAlertsParams{})
-	if err != nil {
-		return err
-	}
-	println(alerts.Error())
-	return nil
-	return nil
-}
+// AlertRules queries alert_rule joined with t_cluster and scans the result
+// into the response. A failed query is returned as an error.
+//
+// NOTE(review): the query groups by rule id and may yield several rows, but
+// the result is scanned into a single AlertRulesResp; confirm the intent.
+func (l *AlertRulesLogic) AlertRules() (resp *types.AlertRulesResp, err error) {
+	resp = &types.AlertRulesResp{}
+	tx := l.svcCtx.DbEngin.Raw("SELECT ar.id,ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id;").Scan(&resp)
+	if tx.Error != nil {
+		return nil, tx.Error
+	}
+	return resp, nil
+}
diff --git a/api/internal/logic/monitoring/clustersloadlogic.go b/api/internal/logic/monitoring/clustersloadlogic.go
index
68f07eb1..ed2a4d07 100644
--- a/api/internal/logic/monitoring/clustersloadlogic.go
+++ b/api/internal/logic/monitoring/clustersloadlogic.go
@@ -28,8 +28,7 @@ func NewClustersLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Clus
 func (l *ClustersLoadLogic) ClustersLoad(req *types.ClustersLoadReq) (resp *types.ClustersLoadResp, err error) {
 	resp = &types.ClustersLoadResp{}
 	metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total"}
-	result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{AdapterId: req.AdapterId, ClusterName: req.ClusterName})
+	result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: req.ClusterName})
 	resp.Data = result
 	return resp, nil
-	return
 }
diff --git a/api/internal/logic/monitoring/createalertrulelogic.go b/api/internal/logic/monitoring/createalertrulelogic.go
index 63dfb95e..828f5635 100644
--- a/api/internal/logic/monitoring/createalertrulelogic.go
+++ b/api/internal/logic/monitoring/createalertrulelogic.go
@@ -3,12 +3,15 @@ package monitoring
 import (
 	"context"
+	"fmt"
 	v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
-	v12 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/intstr"
-
 	"github.com/zeromicro/go-zero/core/logx"
 	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
 	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
+	tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
+	v12 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/json"
 )
 
 type CreateAlertRuleLogic struct {
@@ -25,16 +28,68 @@ func NewCreateAlertRuleLogic(ctx context.Context, svcCtx *svc.ServiceContext) *C
 	}
 }
 
+// RuleSelectorResp is the response envelope of the adapter's rule-selector
+// endpoint; its data field is decoded as a Prometheus object.
+type RuleSelectorResp struct {
+	Code       int           `json:"code"`
+	Msg        string        `json:"msg"`
+	Prometheus v1.Prometheus `json:"data"`
+}
+
+// CreateAlertRule stores the rule in the database, asks the cluster's adapter
+// for the Prometheus rule selector, and applies a matching PrometheusRule
+// through that adapter. It returns an error if any of these steps fails,
+// including an error status returned by the adapter.
 func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) error {
-	// todo: add your logic here and delete this line
+
+	// save to db
+	var alertRule models.AlertRule
+	tool.Convert(req, &alertRule)
+	alertRule.Id = tool.GenSnowflakeID()
+	tx := l.svcCtx.DbEngin.Save(&alertRule)
+	if tx.Error != nil {
+		return tx.Error
+	}
+
+	// query server http url.
+	var server string
+	if err := l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and tc.name = ?", &req.ClusterName).Scan(&server).Error; err != nil {
+		return err
+	}
+
+	// rule selector
+	var ruleSelectorResp RuleSelectorResp
+
+	response, err := l.svcCtx.HttpClient.R().
+		SetQueryParams(map[string]string{
+			"clusterName": req.ClusterName,
+		}).
+		SetResult(&ruleSelectorResp).
+		ForceContentType("application/json").
+		Get(server + "/api/v1/monitoring/rule/selector")
+	if err != nil {
+		return err
+	}
+	if response.IsError() {
+		// err is nil here; report the HTTP failure instead of returning success.
+		return fmt.Errorf("query rule selector for cluster %q: %s", req.ClusterName, response.Status())
+	}
+
+	// RuleSelector may be absent; avoid dereferencing a nil selector.
+	var ruleLabels map[string]string
+	if selector := ruleSelectorResp.Prometheus.Spec.RuleSelector; selector != nil {
+		ruleLabels = selector.MatchLabels
+	}
+
+	// Data Filling
 	ruleDuration := v1.Duration(req.Duration)
 	rule := &v1.PrometheusRule{
+		TypeMeta: v12.TypeMeta{Kind: "PrometheusRule",
+			APIVersion: "monitoring.coreos.com/v1"},
 		ObjectMeta: v12.ObjectMeta{
 			Name:      req.Name,
-			Namespace: req.Namespace,
-			Labels: map[string]string{
-				"release": "prometheus",
-			},
+			Namespace: ruleSelectorResp.Prometheus.ObjectMeta.Namespace,
+			Labels:    ruleLabels,
 		},
 		Spec: v1.PrometheusRuleSpec{
 			Groups: []v1.RuleGroup{
@@ -48,13 +103,41 @@ func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) er
 							Labels: map[string]string{
 								"severity": req.AlertLevel,
 							},
-							Annotations: req.Annotations,
+							Annotations: map[string]string{"description": req.Annotations},
 						},
 					},
 				},
 			},
 		},
 	}
-	println(rule.Kind)
+
+	ruleBytes, err := json.Marshal(rule)
+	if err != nil {
+		return err
+	}
+
+	// create prometheus rule
+	response, err = l.svcCtx.HttpClient.R().
+		SetBody(&OperateStruct{
+			ClusterName: req.ClusterName,
+			YamlString:  string(ruleBytes),
+		}).
+		ForceContentType("application/json").
+		Post(server + "/api/v1/operate/apply")
+	if err != nil {
+		return err
+	}
+	if response.IsError() {
+		return fmt.Errorf("apply prometheus rule %q on cluster %q: %s", req.Name, req.ClusterName, response.Status())
+	}
+
 	return nil
 }
+
+// OperateStruct is the request body posted to the adapter's apply endpoint.
+// NOTE(review): YamlString carries the JSON encoding of the rule; presumably
+// the adapter accepts JSON as YAML — confirm.
+type OperateStruct struct {
+	ClusterName string `json:"clusterName"`
+	YamlString  string `json:"yamlString"`
+}
diff --git a/api/internal/logic/monitoring/nodesloadtoplogic.go b/api/internal/logic/monitoring/nodesloadtoplogic.go
new file mode 100644
index 00000000..7efb6cd4
--- /dev/null
+++ b/api/internal/logic/monitoring/nodesloadtoplogic.go
@@ -0,0 +1,59 @@
+package monitoring
+
+import (
+	"context"
+	"fmt"
+
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
+
+	"github.com/zeromicro/go-zero/core/logx"
+)
+
+// NodesLoadTopLogic serves the node resource load endpoint.
+type NodesLoadTopLogic struct {
+	logx.Logger
+	ctx    context.Context
+	svcCtx *svc.ServiceContext
+}
+
+// NewNodesLoadTopLogic returns a NodesLoadTopLogic bound to ctx and svcCtx.
+func NewNodesLoadTopLogic(ctx context.Context, svcCtx *svc.ServiceContext) *NodesLoadTopLogic {
+	return &NodesLoadTopLogic{
+		Logger: logx.WithContext(ctx),
+		ctx:    ctx,
+		svcCtx: svcCtx,
+	}
+}
+
+// NodesLoadTop forwards the metrics query to the adapter registered for
+// req.ClusterName and decodes the adapter's JSON reply into the response.
+//
+// It returns an error when the adapter lookup fails, when the request cannot
+// be sent, or when the adapter answers with an error status.
+func (l *NodesLoadTopLogic) NodesLoadTop(req *types.NodesLoadTopReq) (resp *types.NodesLoadTopResp, err error) {
+	resp = &types.NodesLoadTopResp{}
+
+	// Resolve the HTTP address of the adapter that manages the cluster.
+	var server string
+	tx := l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and tc.name = ?", &req.ClusterName).Scan(&server)
+	if tx.Error != nil {
+		return nil, tx.Error
+	}
+
+	response, err := l.svcCtx.HttpClient.R().
+		SetQueryParams(map[string]string{
+			"clusterName": req.ClusterName,
+			"metrics":     req.Metrics,
+		}).
+		SetResult(&resp).
+		ForceContentType("application/json").
+		Get(server + "/api/v1/monitoring/node")
+	if err != nil {
+		return nil, err
+	}
+	if response.IsError() {
+		return nil, fmt.Errorf("query node metrics for cluster %q: %s", req.ClusterName, response.Status())
+	}
+	return resp, nil
+}
diff --git a/api/internal/types/types.go b/api/internal/types/types.go
index f1a3978a..d651558b 100644
--- a/api/internal/types/types.go
+++ b/api/internal/types/types.go
@@ -32,7 +32,6 @@ type RemoteResp struct {
 }
 
 type ClustersLoadReq struct {
-	AdapterId   int64  `form:"adapterId"`
 	ClusterName string `form:"clusterName"`
 }
 
@@ -5376,12 +5375,31 @@ type PushResourceInfoReq struct {
 }
 
 type CreateAlertRuleReq struct {
-	ClusterName string            `json:"clusterName"`
-	Namespace   string            `json:"namespace"`
-	Name        string            `json:"name"`
-	PromQL      string            `json:"PromQL"`
-	Duration    string            `json:"duration"`
-	Labels      map[string]string `json:"labels"`
-	Annotations map[string]string `json:"annotations"`
-	AlertLevel  string            `json:"alertLevel"`
+	ClusterId   int64  `json:"clusterId"`
+	ClusterName string `json:"clusterName"`
+	Name        string `json:"name"`
+	PromQL      string `json:"promQL"`
+	Duration    string `json:"duration"`
+	Annotations string `json:"annotations,optional"`
+	AlertLevel  string `json:"alertLevel"`
+	AlertType   string `json:"alertType"`
+}
+
+type AlertRulesResp struct {
+	Id          int64  `json:"id"`
+	ClusterName string `json:"clusterName"`
+	Name        string `json:"name"`
+	PromQL      string `json:"promQL"`
+	Duration    string `json:"duration"`
+	Annotations string `json:"annotations"`
+	AlertLevel  string `json:"alertLevel"`
+}
+
+type NodesLoadTopReq struct {
+	ClusterName string `form:"clusterName"`
+	Metrics     string `form:"metrics"`
+}
+
+type NodesLoadTopResp struct {
+	Data interface{} `json:"data"`
 }
diff --git a/pkg/models/alertrulemodel.go b/pkg/models/alertrulemodel.go
new file mode 100644
index 00000000..b5e47bab
--- /dev/null
+++ b/pkg/models/alertrulemodel.go
@@ -0,0 +1,18 @@
+package models
+
+import (
+	"database/sql"
+)
+
+type AlertRule struct {
+	Id          int64         `db:"id"` // id
+	ClusterId   int64         `db:"cluster_id"`
+	Name        string        `db:"name"` // alert rule name
+	AlertType   string        `db:"alert_type"`
// 节点类型 int64 `db:"cpu_total"` // cpu核数 + PromQL string `db:"prom_ql"` + Duration string `db:"duration"` + AlertLevel string `db:"alert_level"` + Annotations string `db:"annotations"` + CreatedBy sql.NullInt64 `db:"created_by"` // 创建人 + UpdatedBy sql.NullInt64 `db:"updated_by"` // 更新人 +} diff --git a/pkg/tracker/promql.go b/pkg/tracker/promql.go index 1dc07394..013c079f 100644 --- a/pkg/tracker/promql.go +++ b/pkg/tracker/promql.go @@ -19,32 +19,31 @@ import ( "strings" ) -const ( - StatefulSet = "StatefulSet" - DaemonSet = "DaemonSet" - Deployment = "Deployment" -) - var promQLTemplates = map[string]string{ - "cluster_cpu_utilisation": "cluster_cpu_utilisation{$1}", - "cluster_memory_utilisation": "cluster_memory_utilisation{$1}", - "cluster_disk_utilisation": "cluster_disk_utilisation{$1}", - "cluster_cpu_total": "cluster_cpu_total{$1}", - "cluster_memory_total": "cluster_memory_total{$1}", - "cluster_disk_total": "cluster_disk_total{$1}", - "cluster_cpu_avail": "cluster_cpu_total{$1}", - "cluster_memory_avail": "cluster_memory_total{$1}", - "cluster_disk_avail": "cluster_disk_total{$1}", - "center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})", - "center_memory_utilisation": "(sum by (adapter_id)(cluster_memory_total{$1})-sum by (adapter_id)(cluster_memory_avail{$1}))/sum by (adapter_id)(cluster_memory_total{$1})", - "center_disk_utilisation": "(sum by (adapter_id)(cluster_disk_total{$1})-sum by (adapter_id)(cluster_disk_avail{$1}))/sum by (adapter_id)(cluster_disk_total{$1})", - "center_top3": "topk(3,((sum by (adapter_id)(cluster_cpu_total)-sum by (adapter_id)(cluster_cpu_avail))/sum by (adapter_id)(cluster_cpu_total) + (sum by (adapter_id)(cluster_memory_total) - sum by (adapter_id)(cluster_memory_avail))/sum by (adapter_id)(cluster_memory_total) + (sum by (adapter_id)(cluster_disk_total)-sum by (adapter_id)(cluster_disk_avail))/sum by 
(adapter_id)(cluster_disk_total))/3)", + "cluster_cpu_utilisation": "cluster_cpu_utilisation{$1}", + "cluster_memory_utilisation": "cluster_memory_utilisation{$1}", + "cluster_disk_utilisation": "cluster_disk_utilisation{$1}", + "cluster_cpu_total": "cluster_cpu_total{$1}", + "cluster_memory_total": "cluster_memory_total{$1}", + "cluster_disk_total": "cluster_disk_total{$1}", + "cluster_cpu_avail": "cluster_cpu_total{$1}", + "cluster_memory_avail": "cluster_memory_total{$1}", + "cluster_disk_avail": "cluster_disk_total{$1}", + + // center + "center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})", + "center_memory_utilisation": "(sum by (adapter_id)(cluster_memory_total{$1})-sum by (adapter_id)(cluster_memory_avail{$1}))/sum by (adapter_id)(cluster_memory_total{$1})", + "center_disk_utilisation": "(sum by (adapter_id)(cluster_disk_total{$1})-sum by (adapter_id)(cluster_disk_avail{$1}))/sum by (adapter_id)(cluster_disk_total{$1})", + "center_top3": "topk(3,((sum by (adapter_id)(cluster_cpu_total)-sum by (adapter_id)(cluster_cpu_avail))/sum by (adapter_id)(cluster_cpu_total) + (sum by (adapter_id)(cluster_memory_total) - sum by (adapter_id)(cluster_memory_avail))/sum by (adapter_id)(cluster_memory_total) + (sum by (adapter_id)(cluster_disk_total)-sum by (adapter_id)(cluster_disk_avail))/sum by (adapter_id)(cluster_disk_total))/3)", + + // namespace "namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`, "namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`, "namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`, - "controller_cpu_usage_rate": `sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by 
(workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`, - "controller_memory_usage_rate": `sum( container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", container!="", image!=""} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`, + // controller + "controller_cpu_usage_rate": `sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`, + "controller_memory_usage_rate": `sum( container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", container!="", image!=""} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`, // pod "pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`, "pod_cpu_usage_rate": `sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{ $1}) by (pod) / sum(kube_pod_container_resource_limits{ $1,unit="core"}) by (pod)`,