Former-commit-id: 545aa908151fb83153dbe89334f2a6cf1d3baf53
This commit is contained in:
zhangwei 2024-04-07 17:11:35 +08:00
parent cecdf2be3a
commit a8660d5d5b
15 changed files with 257 additions and 78 deletions

View File

@ -40,7 +40,6 @@ type remoteResp {
type (
clustersLoadReq {
adapterId int64 `form:"adapterId"`
clusterName string `form:"clusterName"`
}
clustersLoadResp {

View File

@ -1,12 +1,34 @@
syntax = "v1"
type CreateAlertRuleReq {
CLusterId int64 `json:"clusterId"`
ClusterName string `json:"clusterName"`
Namespace string `json:"namespace"`
Name string `json:"name"`
PromQL string `json:"PromQL"`
PromQL string `json:"promQL"`
Duration string `json:"duration"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
Annotations string `json:"annotations,optional"`
AlertLevel string `json:"alertLevel"`
AlertType string `json:"alertType"`
}
// alertRulesResp is the response type declared for the alert rules route
// (get /monitoring/alert/rule). It carries a stored rule's id, name, PromQL
// expression, duration, annotations and alert level plus the cluster name.
type (
alertRulesResp {
Id int64 `json:"id"`
ClusterName string `json:"clusterName"`
Name string `json:"name"`
PromQL string `json:"promQL"`
Duration string `json:"duration"`
Annotations string `json:"annotations"`
AlertLevel string `json:"alertLevel"`
}
)
// Request/response pair for the node load route (get /monitoring/node/top).
// nodesLoadTopReq is read from form/query parameters; nodesLoadTopResp wraps
// an untyped payload under the "data" key.
type (
nodesLoadTopReq {
ClusterName string `form:"clusterName"`
Metrics string `form:"metrics"`
}
nodesLoadTopResp {
data interface{} `json:"data"`
}
)

View File

@ -948,9 +948,13 @@ service pcm {
@doc "alert rules"
@handler alertRulesHandler
get /monitoring/alert/rules
get /monitoring/alert/rule returns (alertRulesResp)
@doc "cluster resource load"
@handler clustersLoadHandler
get /monitoring/cluster/load (clustersLoadReq) returns (clustersLoadResp)
@doc "node resource load"
@handler nodesLoadTopHandler
get /monitoring/node/top (nodesLoadTopReq) returns (nodesLoadTopResp)
}

View File

@ -1,6 +1,7 @@
package cloud
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
@ -19,10 +20,6 @@ func ClusterInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
l := cloud.NewClusterInfoLogic(r.Context(), svcCtx)
resp, err := l.ClusterInfo(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
result.HttpResult(r, w, resp, err)
}
}

View File

@ -1,9 +1,9 @@
package monitoring
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
@ -11,11 +11,7 @@ import (
func AlertRulesHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := monitoring.NewAlertRulesLogic(r.Context(), svcCtx)
err := l.AlertRules()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.Ok(w)
}
resp, err := l.AlertRules()
result.HttpResult(r, w, resp, err)
}
}

View File

@ -1,6 +1,7 @@
package monitoring
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
@ -19,10 +20,6 @@ func CreateAlertRuleHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
l := monitoring.NewCreateAlertRuleLogic(r.Context(), svcCtx)
err := l.CreateAlertRule(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.Ok(w)
}
result.HttpResult(r, w, nil, err)
}
}

View File

@ -0,0 +1,28 @@
package monitoring
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
// NodesLoadTopHandler builds the HTTP handler for the node load endpoint.
//
// The handler parses the incoming request into types.NodesLoadTopReq,
// delegates to NodesLoadTopLogic.NodesLoadTop and writes either the error or
// the JSON-encoded response back to the client.
func NodesLoadTopHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		ctx := r.Context()

		// Reject requests whose parameters cannot be bound.
		var req types.NodesLoadTopReq
		if parseErr := httpx.Parse(r, &req); parseErr != nil {
			httpx.ErrorCtx(ctx, w, parseErr)
			return
		}

		resp, err := monitoring.NewNodesLoadTopLogic(ctx, svcCtx).NodesLoadTop(&req)
		if err != nil {
			httpx.ErrorCtx(ctx, w, err)
			return
		}
		httpx.OkJsonCtx(ctx, w, resp)
	}
}

View File

@ -1189,7 +1189,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
},
{
Method: http.MethodGet,
Path: "/monitoring/alert/rules",
Path: "/monitoring/alert/rule",
Handler: monitoring.AlertRulesHandler(serverCtx),
},
{
@ -1197,6 +1197,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/monitoring/cluster/load",
Handler: monitoring.ClustersLoadHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/node/top",
Handler: monitoring.NodesLoadTopHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)

View File

@ -2,10 +2,11 @@ package monitoring
import (
"context"
"github.com/prometheus/alertmanager/api/v2/client/alert"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
type AlertRulesLogic struct {
@ -22,13 +23,8 @@ func NewAlertRulesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AlertR
}
}
func (l *AlertRulesLogic) AlertRules() error {
// todo: add your logic here and delete this line
alerts, err := l.svcCtx.AlertClient.Alert.GetAlerts(&alert.GetAlertsParams{})
if err != nil {
return err
}
println(alerts.Error())
return nil
return nil
// AlertRules reads alert rule data joined with the owning cluster name from
// the database and returns it as an AlertRulesResp.
//
// Soft-deleted rules and clusters (non-NULL deleted_at) are excluded by the
// query. A database error is returned to the caller instead of being
// silently dropped, in which case resp is nil.
//
// NOTE(review): the query groups by rule id and can therefore yield several
// rows, while the destination is a single AlertRulesResp — confirm whether a
// list response is intended.
func (l *AlertRulesLogic) AlertRules() (resp *types.AlertRulesResp, err error) {
	resp = &types.AlertRulesResp{}
	tx := l.svcCtx.DbEngin.Raw("SELECT ar.id,ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id;").Scan(&resp)
	if tx.Error != nil {
		return nil, tx.Error
	}
	return resp, nil
}

View File

@ -28,8 +28,7 @@ func NewClustersLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Clus
func (l *ClustersLoadLogic) ClustersLoad(req *types.ClustersLoadReq) (resp *types.ClustersLoadResp, err error) {
resp = &types.ClustersLoadResp{}
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total"}
result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{AdapterId: req.AdapterId, ClusterName: req.ClusterName})
result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: req.ClusterName})
resp.Data = result
return resp, nil
return
}

View File

@ -3,12 +3,14 @@ package monitoring
import (
"context"
v1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
v12 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
v12 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/json"
)
type CreateAlertRuleLogic struct {
@ -25,16 +27,49 @@ func NewCreateAlertRuleLogic(ctx context.Context, svcCtx *svc.ServiceContext) *C
}
}
// RuleSelectorResp is the decoded JSON body of the adapter's
// "/api/v1/monitoring/rule/selector" endpoint. CreateAlertRule reads the
// Prometheus object's namespace and rule-selector match labels from it.
type RuleSelectorResp struct {
Code int `json:"code"` // value of the "code" field of the reply
Msg string `json:"msg"` // value of the "msg" field of the reply
Prometheus v1.Prometheus `json:"data"` // Prometheus resource carried under the "data" key
}
func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) error {
// todo: add your logic here and delete this line
// save to db
var alertRule models.AlertRule
tool.Convert(req, &alertRule)
alertRule.Id = tool.GenSnowflakeID()
tx := l.svcCtx.DbEngin.Save(&alertRule)
if tx.Error != nil {
return tx.Error
}
// query server http url.
var server string
l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and tc.name = ?", &req.ClusterName).Scan(&server)
// rule selector
var ruleSelectorResp RuleSelectorResp
response, err := l.svcCtx.HttpClient.R().
SetQueryParams(map[string]string{
"clusterName": req.ClusterName,
}).
SetResult(&ruleSelectorResp).
ForceContentType("application/json").
Get(server + "/api/v1/monitoring/rule/selector")
if err != nil || response.IsError() {
return err
}
// Data Filling
ruleDuration := v1.Duration(req.Duration)
rule := &v1.PrometheusRule{
TypeMeta: v12.TypeMeta{Kind: "PrometheusRule",
APIVersion: "monitoring.coreos.com/v1"},
ObjectMeta: v12.ObjectMeta{
Name: req.Name,
Namespace: req.Namespace,
Labels: map[string]string{
"release": "prometheus",
},
Namespace: ruleSelectorResp.Prometheus.ObjectMeta.Namespace,
Labels: ruleSelectorResp.Prometheus.Spec.RuleSelector.MatchLabels,
},
Spec: v1.PrometheusRuleSpec{
Groups: []v1.RuleGroup{
@ -48,13 +83,35 @@ func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) er
Labels: map[string]string{
"severity": req.AlertLevel,
},
Annotations: req.Annotations,
Annotations: map[string]string{"description": req.Annotations},
},
},
},
},
},
}
println(rule.Kind)
ruleBytes, err := json.Marshal(rule)
if err != nil {
return err
}
// create prometheus rule
response, err = l.svcCtx.HttpClient.R().
SetBody(&OperateStruct{
ClusterName: req.ClusterName,
YamlString: string(ruleBytes),
}).
ForceContentType("application/json").
Post(server + "/api/v1/operate/apply")
if err != nil || response.IsError() {
return err
}
return nil
}
// OperateStruct is the JSON request body posted to the adapter's
// "/api/v1/operate/apply" endpoint.
type OperateStruct struct {
ClusterName string `json:"clusterName"` // name of the target cluster
YamlString string `json:"yamlString"` // manifest to apply; CreateAlertRule fills it with the JSON-marshalled PrometheusRule
}

View File

@ -0,0 +1,44 @@
package monitoring
import (
	"context"
	"fmt"

	"github.com/zeromicro/go-zero/core/logx"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
// NodesLoadTopLogic holds the per-request state used by the node load
// endpoint: a logger, the request context and the shared service context.
type NodesLoadTopLogic struct {
logx.Logger
ctx context.Context // context passed to NewNodesLoadTopLogic
svcCtx *svc.ServiceContext // shared dependencies (DbEngin, HttpClient are used by NodesLoadTop)
}
// NewNodesLoadTopLogic creates a NodesLoadTopLogic bound to ctx and svcCtx,
// with a logger derived from ctx.
func NewNodesLoadTopLogic(ctx context.Context, svcCtx *svc.ServiceContext) *NodesLoadTopLogic {
	logic := new(NodesLoadTopLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// NodesLoadTop fetches node load metrics for a cluster from the adapter
// server that manages it.
//
// It looks up the adapter's server URL by req.ClusterName, then issues
// GET {server}/api/v1/monitoring/node with the cluster name and the requested
// metrics as query parameters, decoding the JSON reply into the response.
//
// An error is returned (and resp is nil) when the database lookup fails, when
// no adapter server is registered for the cluster, when the HTTP request
// fails, or when the adapter answers with an error status. Previously all of
// these failures were swallowed and an empty response was returned.
func (l *NodesLoadTopLogic) NodesLoadTop(req *types.NodesLoadTopReq) (resp *types.NodesLoadTopResp, err error) {
	var server string
	tx := l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and tc.name = ?", &req.ClusterName).Scan(&server)
	if tx.Error != nil {
		return nil, tx.Error
	}
	// Without a server the request URL would be a bare path and could never succeed.
	if server == "" {
		return nil, fmt.Errorf("no adapter server found for cluster %q", req.ClusterName)
	}

	resp = &types.NodesLoadTopResp{}
	response, err := l.svcCtx.HttpClient.R().
		SetQueryParams(map[string]string{
			"clusterName": req.ClusterName,
			"metrics":     req.Metrics,
		}).
		SetResult(&resp).
		ForceContentType("application/json").
		Get(server + "/api/v1/monitoring/node")
	if err != nil {
		return nil, err
	}
	if response.IsError() {
		return nil, fmt.Errorf("query node load from %s: unexpected status %s", server, response.Status())
	}
	return resp, nil
}

View File

@ -32,7 +32,6 @@ type RemoteResp struct {
}
type ClustersLoadReq struct {
AdapterId int64 `form:"adapterId"`
ClusterName string `form:"clusterName"`
}
@ -5376,12 +5375,31 @@ type PushResourceInfoReq struct {
}
type CreateAlertRuleReq struct {
ClusterName string `json:"clusterName"`
Namespace string `json:"namespace"`
Name string `json:"name"`
PromQL string `json:"PromQL"`
Duration string `json:"duration"`
Labels map[string]string `json:"labels"`
Annotations map[string]string `json:"annotations"`
AlertLevel string `json:"alertLevel"`
CLusterId int64 `json:"clusterId"`
ClusterName string `json:"clusterName"`
Name string `json:"name"`
PromQL string `json:"promQL"`
Duration string `json:"duration"`
Annotations string `json:"annotations,optional"`
AlertLevel string `json:"alertLevel"`
AlertType string `json:"alertType"`
}
// AlertRulesResp is the JSON response type returned by AlertRulesLogic.AlertRules.
type AlertRulesResp struct {
Id int64 `json:"id"`
ClusterName string `json:"clusterName"`
Name string `json:"name"`
PromQL string `json:"promQL"`
Duration string `json:"duration"`
Annotations string `json:"annotations"`
AlertLevel string `json:"alertLevel"`
}
// NodesLoadTopReq carries the form parameters of the node load endpoint.
type NodesLoadTopReq struct {
ClusterName string `form:"clusterName"` // cluster whose adapter is queried
Metrics string `form:"metrics"` // forwarded unchanged as the "metrics" query parameter
}
// NodesLoadTopResp is the JSON response of the node load endpoint.
type NodesLoadTopResp struct {
Data interface{} `json:"data"` // untyped payload decoded from the adapter's reply
}

View File

@ -0,0 +1,18 @@
package models
import (
"database/sql"
)
// AlertRule is the model for a stored alert rule; each field carries a `db`
// tag naming its column.
type AlertRule struct {
Id int64 `db:"id"` // id
ClusterId int64 `db:"cluster_id"`
Name string `db:"name"` // name (original comment said "node name"; presumably the alert rule name — confirm)
AlertType string `db:"alert_type"` // alert type; NOTE(review): the previous comment ("node type ... cpu cores") looked copied from another model
PromQL string `db:"prom_ql"`
Duration string `db:"duration"`
AlertLevel string `db:"alert_level"`
Annotations string `db:"annotations"`
CreatedBy sql.NullInt64 `db:"created_by"` // creator
UpdatedBy sql.NullInt64 `db:"updated_by"` // last updater
}

View File

@ -19,32 +19,31 @@ import (
"strings"
)
const (
StatefulSet = "StatefulSet"
DaemonSet = "DaemonSet"
Deployment = "Deployment"
)
var promQLTemplates = map[string]string{
"cluster_cpu_utilisation": "cluster_cpu_utilisation{$1}",
"cluster_memory_utilisation": "cluster_memory_utilisation{$1}",
"cluster_disk_utilisation": "cluster_disk_utilisation{$1}",
"cluster_cpu_total": "cluster_cpu_total{$1}",
"cluster_memory_total": "cluster_memory_total{$1}",
"cluster_disk_total": "cluster_disk_total{$1}",
"cluster_cpu_avail": "cluster_cpu_total{$1}",
"cluster_memory_avail": "cluster_memory_total{$1}",
"cluster_disk_avail": "cluster_disk_total{$1}",
"center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})",
"center_memory_utilisation": "(sum by (adapter_id)(cluster_memory_total{$1})-sum by (adapter_id)(cluster_memory_avail{$1}))/sum by (adapter_id)(cluster_memory_total{$1})",
"center_disk_utilisation": "(sum by (adapter_id)(cluster_disk_total{$1})-sum by (adapter_id)(cluster_disk_avail{$1}))/sum by (adapter_id)(cluster_disk_total{$1})",
"center_top3": "topk(3,((sum by (adapter_id)(cluster_cpu_total)-sum by (adapter_id)(cluster_cpu_avail))/sum by (adapter_id)(cluster_cpu_total) + (sum by (adapter_id)(cluster_memory_total) - sum by (adapter_id)(cluster_memory_avail))/sum by (adapter_id)(cluster_memory_total) + (sum by (adapter_id)(cluster_disk_total)-sum by (adapter_id)(cluster_disk_avail))/sum by (adapter_id)(cluster_disk_total))/3)",
"cluster_cpu_utilisation": "cluster_cpu_utilisation{$1}",
"cluster_memory_utilisation": "cluster_memory_utilisation{$1}",
"cluster_disk_utilisation": "cluster_disk_utilisation{$1}",
"cluster_cpu_total": "cluster_cpu_total{$1}",
"cluster_memory_total": "cluster_memory_total{$1}",
"cluster_disk_total": "cluster_disk_total{$1}",
"cluster_cpu_avail": "cluster_cpu_total{$1}",
"cluster_memory_avail": "cluster_memory_total{$1}",
"cluster_disk_avail": "cluster_disk_total{$1}",
// center
"center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})",
"center_memory_utilisation": "(sum by (adapter_id)(cluster_memory_total{$1})-sum by (adapter_id)(cluster_memory_avail{$1}))/sum by (adapter_id)(cluster_memory_total{$1})",
"center_disk_utilisation": "(sum by (adapter_id)(cluster_disk_total{$1})-sum by (adapter_id)(cluster_disk_avail{$1}))/sum by (adapter_id)(cluster_disk_total{$1})",
"center_top3": "topk(3,((sum by (adapter_id)(cluster_cpu_total)-sum by (adapter_id)(cluster_cpu_avail))/sum by (adapter_id)(cluster_cpu_total) + (sum by (adapter_id)(cluster_memory_total) - sum by (adapter_id)(cluster_memory_avail))/sum by (adapter_id)(cluster_memory_total) + (sum by (adapter_id)(cluster_disk_total)-sum by (adapter_id)(cluster_disk_avail))/sum by (adapter_id)(cluster_disk_total))/3)",
// namespace
"namespace_cpu_usage": `round(namespace:container_cpu_usage_seconds_total:sum_rate{namespace!="", $1}, 0.001)`,
"namespace_memory_usage": `namespace:container_memory_usage_bytes:sum{namespace!="", $1}`,
"namespace_memory_usage_wo_cache": `namespace:container_memory_usage_bytes_wo_cache:sum{namespace!="", $1}`,
"controller_cpu_usage_rate": `sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`,
"controller_memory_usage_rate": `sum( container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", container!="", image!=""} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`,
// controller
"controller_cpu_usage_rate": `sum( node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="cpu"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`,
"controller_memory_usage_rate": `sum( container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", container!="", image!=""} * on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{$1}) by (workload)/sum( kube_pod_container_resource_limits{job="kube-state-metrics", resource="memory"}* on(namespace,pod) group_left(workload) namespace_workload_pod:kube_pod_owner:relabel{ }) by (workload)`,
// pod
"pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
"pod_cpu_usage_rate": `sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{ $1}) by (pod) / sum(kube_pod_container_resource_limits{ $1,unit="core"}) by (pod)`,