From b3aa276ff7aba8e1f6b1e7fedb64a91a706a676f Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Tue, 9 Apr 2024 10:09:26 +0800 Subject: [PATCH 1/6] =?UTF-8?q?=E5=91=8A=E8=AD=A6=E6=B6=88=E6=81=AF?= =?UTF-8?q?=E6=9F=A5=E8=AF=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: aba3c3cff2e2e0a92dbeff9f6dd1b3dd6da2eb36 --- api/desc/monitoring/pcm-monitoring.api | 6 +- api/desc/pcm.api | 4 ++ .../handler/monitoring/alertlisthandler.go | 17 +++++ .../handler/monitoring/clustersloadhandler.go | 7 +-- api/internal/handler/routes.go | 5 ++ .../logic/monitoring/alertlistlogic.go | 63 +++++++++++++++++++ .../logic/monitoring/createalertrulelogic.go | 1 + api/internal/types/types.go | 4 ++ 8 files changed, 101 insertions(+), 6 deletions(-) create mode 100644 api/internal/handler/monitoring/alertlisthandler.go create mode 100644 api/internal/logic/monitoring/alertlistlogic.go diff --git a/api/desc/monitoring/pcm-monitoring.api b/api/desc/monitoring/pcm-monitoring.api index c5ff3f5a..45c9b585 100644 --- a/api/desc/monitoring/pcm-monitoring.api +++ b/api/desc/monitoring/pcm-monitoring.api @@ -37,4 +37,8 @@ type ( data interface{} `json:"data"` msg string `json:"msg"` } -) \ No newline at end of file +) + +type alertListResp { + alertMap map[string]interface{} `json:"alertMap"` +} \ No newline at end of file diff --git a/api/desc/pcm.api b/api/desc/pcm.api index ee7ffd74..0c9725a7 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -961,4 +961,8 @@ service pcm { @doc "node resource load" @handler nodesLoadTopHandler get /monitoring/node/top (nodesLoadTopReq) returns (nodesLoadTopResp) + + @doc "alert list" + @handler alertListHandler + get /monitoring/alert/list returns (alertListResp) } \ No newline at end of file diff --git a/api/internal/handler/monitoring/alertlisthandler.go b/api/internal/handler/monitoring/alertlisthandler.go new file mode 100644 index 00000000..53c82a9b --- /dev/null +++ b/api/internal/handler/monitoring/alertlisthandler.go @@ -0,0 +1,17 @@ +package monitoring + +import ( + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" + "net/http" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" +) + +func AlertListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + l := monitoring.NewAlertListLogic(r.Context(), svcCtx) + resp, err := l.AlertList() + result.HttpResult(r, w, resp, err) + } +} diff --git a/api/internal/handler/monitoring/clustersloadhandler.go b/api/internal/handler/monitoring/clustersloadhandler.go index e0fba0d0..6758c46a 100644 --- a/api/internal/handler/monitoring/clustersloadhandler.go +++ b/api/internal/handler/monitoring/clustersloadhandler.go @@ -1,6 +1,7 @@ package monitoring import ( + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "net/http" "github.com/zeromicro/go-zero/rest/httpx" @@ -19,10 +20,6 @@ func ClustersLoadHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { l := monitoring.NewClustersLoadLogic(r.Context(), svcCtx) resp, err := l.ClustersLoad(&req) - if err != nil { - httpx.ErrorCtx(r.Context(), w, err) - } else { - httpx.OkJsonCtx(r.Context(), w, resp) - } + result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index c4d6163d..3acfb456 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1207,6 +1207,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/monitoring/node/top", Handler: monitoring.NodesLoadTopHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/monitoring/alert/list", + Handler: monitoring.AlertListHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/logic/monitoring/alertlistlogic.go b/api/internal/logic/monitoring/alertlistlogic.go new file mode 100644 index 00000000..81c168e2 --- /dev/null +++ b/api/internal/logic/monitoring/alertlistlogic.go @@ -0,0 +1,63 @@ +package monitoring + +import ( + "context" + "github.com/pkg/errors" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type AlertListLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewAlertListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AlertListLogic { + return &AlertListLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +type AlertListResp struct { + Mode int `json:"code"` + Msg string `json:"msg"` + Data map[string][]*v1.Alert `json:"data"` +} + +func (l *AlertListLogic) AlertList() (resp *types.AlertListResp, err error) { + // todo: add your logic here and delete this line + resp = &types.AlertListResp{} + + // query server http url. + var serverArray []string + l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes'").Scan(&serverArray) + + result := make(map[string][]*v1.Alert) + for _, server := range serverArray { + alertListResp := AlertListResp{} + response, err := l.svcCtx.HttpClient.R(). + SetResult(&alertListResp). + ForceContentType("application/json"). + Get(server + "/api/v1/alert/rule/list") + if err != nil { + logx.Error(response) + return nil, err + } + if response.IsError() { + return nil, errors.New(response.String()) + } + for k, v := range alertListResp.Data { + result[k] = v + } + } + tool.Convert(result, &resp.AlertMap) + return resp, nil +} diff --git a/api/internal/logic/monitoring/createalertrulelogic.go b/api/internal/logic/monitoring/createalertrulelogic.go index 828f5635..fadf50c3 100644 --- a/api/internal/logic/monitoring/createalertrulelogic.go +++ b/api/internal/logic/monitoring/createalertrulelogic.go @@ -59,6 +59,7 @@ func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) er ForceContentType("application/json"). Get(server + "/api/v1/monitoring/rule/selector") if err != nil || response.IsError() { + logx.Error(response) return err } // Data Filling diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 274ccf66..ba19e5ea 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5468,3 +5468,7 @@ type NodesLoadTopResp struct { Data interface{} `json:"data"` Msg string `json:"msg"` } + +type AlertListResp struct { + AlertMap map[string]interface{} `json:"alertMap"` +} From f2b871bba2aaaf5df5227c3dd3bd0a85fae1b770 Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Tue, 9 Apr 2024 15:12:44 +0800 Subject: [PATCH 2/6] node load Former-commit-id: 59a88853d68daacd2a0c7dc8627935b49ddf1300 --- api/internal/types/types.go | 117 ++++++++++++++++++++++++++++++++++++ pkg/tracker/promql.go | 4 +- 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 55c82f1e..45d56633 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5316,6 +5316,123 @@ type AiAlgorithmsResp struct { Algorithms []string `json:"algorithms"` } +type PullTaskInfoReq struct { + AdapterId int64 `form:"adapterId"` +} + +type PullTaskInfoResp struct { + HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` + CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` + AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` + VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` +} + +type HpcInfo struct { + Id int64 `json:"id"` // id + TaskId int64 `json:"task_id"` // 任务id + JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id + ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + ClusterType string `json:"cluster_type"` // 执行任务的集群类型 + Name string `json:"name"` // 名称 + Status string `json:"status"` // 状态 + CmdScript string `json:"cmd_script"` + StartTime string `json:"start_time"` // 开始时间 + RunningTime int64 `json:"running_time"` // 运行时间 + DerivedEs string `json:"derived_es"` + Cluster string `json:"cluster"` + BlockId int64 `json:"block_id"` + AllocNodes int64 `json:"alloc_nodes"` + AllocCpu int64 `json:"alloc_cpu"` + CardCount int64 `json:"card_count"` // 卡数 + Version string `json:"version"` + Account string `json:"account"` + WorkDir string `json:"work_dir"` // 工作路径 + AssocId int64 `json:"assoc_id"` + ExitCode int64 `json:"exit_code"` + WallTime string `json:"wall_time"` // 最大运行时间 + Result string `json:"result"` // 运行结果 + DeletedAt string `json:"deleted_at"` // 删除时间 + YamlString string `json:"yaml_string"` + AppType string `json:"app_type"` // 应用类型 + AppName string `json:"app_name"` // 应用名称 + Queue string `json:"queue"` // 队列名称 + SubmitType string `json:"submit_type"` // cmd(命令行模式) + NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") + StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j + StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j + StdInput string `json:"std_input"` + Environment string `json:"environment"` + DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) + CreatedBy int64 `json:"created_by"` // 创建人 + CreatedTime string `json:"created_time"` // 创建时间 + UpdatedBy int64 `json:"updated_by"` // 更新人 + UpdatedTime string `json:"updated_time"` // 更新时间 +} + +type CloudInfo struct { + Participant int64 `json:"participant,omitempty"` + Id int64 `json:"id,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ApiVersion string `json:"apiVersion,omitempty"` + Kind string `json:"kind,omitempty"` + Namespace string `json:"namespace,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + YamlString string `json:"yamlString,omitempty"` +} + +type AiInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ProjectId string `json:"project_id,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + JobId string `json:"jobId,omitempty"` + CreateTime string `json:"createTime,omitempty"` + ImageUrl string `json:"imageUrl,omitempty"` + Command string `json:"command,omitempty"` + FlavorId string `json:"flavorId,omitempty"` + SubscriptionId string `json:"subscriptionId,omitempty"` + ItemVersionId string `json:"itemVersionId,omitempty"` +} + +type VmInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + Name string `json:"name,omitempty"` + FlavorRef string `json:"flavor_ref,omitempty"` + ImageRef string `json:"image_ref,omitempty"` + NetworkUuid string `json:"network_uuid,omitempty"` + BlockUuid string `json:"block_uuid,omitempty"` + SourceType string `json:"source_type,omitempty"` + DeleteOnTermination bool `json:"delete_on_termination,omitempty"` + State string `json:"state,omitempty"` +} + +type PushTaskInfoReq struct { + AdapterId int64 `json:"adapterId"` + HpcInfoList []*HpcInfo `json:"hpcInfoList"` + CloudInfoList []*CloudInfo `json:"cloudInfoList"` + AiInfoList []*AiInfo `json:"aiInfoList"` + VmInfoList []*VmInfo `json:"vmInfoList"` +} + +type PushTaskInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type PushResourceInfoReq struct { + AdapterId int64 `json:"adapterId"` +} + type CreateAlertRuleReq struct { CLusterId int64 `json:"clusterId"` ClusterName string `json:"clusterName"` diff --git a/pkg/tracker/promql.go b/pkg/tracker/promql.go index 013c079f..eef13139 100644 --- a/pkg/tracker/promql.go +++ b/pkg/tracker/promql.go @@ -95,8 +95,8 @@ func makeExpr(metric string, opts QueryOptions) string { func makeClusterMetricExpr(tmpl string, o QueryOptions) string { var clusterSelector string - if o.AdapterId != 0 && o.ClusterName != "" { - clusterSelector = fmt.Sprintf(`adapter_id="%d",cluster_name="%s"`, o.AdapterId, o.ClusterName) + if o.ClusterName != "" { + clusterSelector = fmt.Sprintf(`cluster_name="%s"`, o.ClusterName) } return strings.Replace(tmpl, "$1", clusterSelector, -1) From 7cbdf8d91b541149ed2cb7dd69ba4017339d0029 Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Thu, 11 Apr 2024 18:46:12 +0800 Subject: [PATCH 3/6] add metrics target Former-commit-id: 3d7f18b273207725cedbe3007b8acfdaacd8cee8 --- api/desc/core/pcm-core.api | 1 + api/desc/monitoring/pcm-monitoring.api | 15 ++++++++++++--- api/desc/pcm.api | 4 ++-- .../handler/monitoring/alertlisthandler.go | 10 +++++++++- .../handler/monitoring/alertruleshandler.go | 10 +++++++++- api/internal/logic/core/syncclusterloadlogic.go | 2 ++ api/internal/logic/monitoring/alertlistlogic.go | 4 ++-- api/internal/logic/monitoring/alertruleslogic.go | 4 ++-- api/internal/types/types.go | 10 ++++++++++ pkg/tracker/promql.go | 6 +++--- pkg/tracker/tracker.go | 9 +++++---- 11 files changed, 57 insertions(+), 18 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 6f215733..41a71a75 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -63,6 +63,7 @@ type ( DiskAvail float64 `json:"diskAvail"` DiskTotal float64 `json:"diskTotal"` DiskUtilisation float64 `json:"diskUtilisation"` + PodsUtilisation float64 `json:"podsUtilisation"` } ) diff --git a/api/desc/monitoring/pcm-monitoring.api b/api/desc/monitoring/pcm-monitoring.api index 45c9b585..bcb2d982 100644 --- a/api/desc/monitoring/pcm-monitoring.api +++ b/api/desc/monitoring/pcm-monitoring.api @@ -12,6 +12,9 @@ type CreateAlertRuleReq { } type ( + AlertRulesReq { + AlertType string `form:"alertType"` + } AlertRulesResp { alertRules []AlertRule `json:"alertRules"` } @@ -20,6 +23,7 @@ type ( Id int64 `json:"id"` ClusterName string `json:"clusterName"` Name string `json:"name"` + AlertType string `json:"alertType"` PromQL string `json:"promQL"` Duration string `json:"duration"` Annotations string `json:"annotations"` @@ -39,6 +43,11 @@ type ( } ) -type alertListResp { - alertMap map[string]interface{} `json:"alertMap"` -} \ No newline at end of file +type ( + alertListReq { + alertType string `form:"alertType"` + } + alertListResp { + alertMap map[string]interface{} `json:"alertMap"` + } +) \ No newline at end of file diff --git a/api/desc/pcm.api b/api/desc/pcm.api index f43442ab..951d9abb 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -952,7 +952,7 @@ service pcm { @doc "alert rules" @handler alertRulesHandler - get /monitoring/alert/rule returns (AlertRulesResp) + get /monitoring/alert/rule (AlertRulesReq)returns (AlertRulesResp) @doc "cluster resource load" @handler clustersLoadHandler @@ -964,5 +964,5 @@ service pcm { @doc "alert list" @handler alertListHandler - get /monitoring/alert/list returns (alertListResp) + get /monitoring/alert/list (alertListReq) returns (alertListResp) } \ No newline at end of file diff --git a/api/internal/handler/monitoring/alertlisthandler.go b/api/internal/handler/monitoring/alertlisthandler.go index 53c82a9b..227e45cb 100644 --- a/api/internal/handler/monitoring/alertlisthandler.go +++ b/api/internal/handler/monitoring/alertlisthandler.go @@ -4,14 +4,22 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "net/http" + "github.com/zeromicro/go-zero/rest/httpx" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" ) func AlertListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { + var req types.AlertListReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + l := monitoring.NewAlertListLogic(r.Context(), svcCtx) - resp, err := l.AlertList() + resp, err := l.AlertList(&req) result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/handler/monitoring/alertruleshandler.go b/api/internal/handler/monitoring/alertruleshandler.go index 28cea567..dd7c8607 100644 --- a/api/internal/handler/monitoring/alertruleshandler.go +++ b/api/internal/handler/monitoring/alertruleshandler.go @@ -4,14 +4,22 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "net/http" + "github.com/zeromicro/go-zero/rest/httpx" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" ) func AlertRulesHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { + var req types.AlertRulesReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + l := monitoring.NewAlertRulesLogic(r.Context(), svcCtx) - resp, err := l.AlertRules() + resp, err := l.AlertRules(&req) result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/logic/core/syncclusterloadlogic.go b/api/internal/logic/core/syncclusterloadlogic.go index f605521d..971f8fd0 100644 --- a/api/internal/logic/core/syncclusterloadlogic.go +++ b/api/internal/logic/core/syncclusterloadlogic.go @@ -38,6 +38,8 @@ func (l *SyncClusterLoadLogic) SyncClusterLoad(req *types.SyncClusterLoadReq) er tracker.ClusterDiskUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskUtilisation) tracker.ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail) tracker.ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal) + + tracker.ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation) } } return nil diff --git a/api/internal/logic/monitoring/alertlistlogic.go b/api/internal/logic/monitoring/alertlistlogic.go index 81c168e2..fed39a58 100644 --- a/api/internal/logic/monitoring/alertlistlogic.go +++ b/api/internal/logic/monitoring/alertlistlogic.go @@ -32,13 +32,13 @@ type AlertListResp struct { Data map[string][]*v1.Alert `json:"data"` } -func (l *AlertListLogic) AlertList() (resp *types.AlertListResp, err error) { +func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertListResp, err error) { // todo: add your logic here and delete this line resp = &types.AlertListResp{} // query server http url. var serverArray []string - l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes'").Scan(&serverArray) + l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ?", req.AlertType).Scan(&serverArray) result := make(map[string][]*v1.Alert) for _, server := range serverArray { diff --git a/api/internal/logic/monitoring/alertruleslogic.go b/api/internal/logic/monitoring/alertruleslogic.go index ee9eb1dc..e821897a 100644 --- a/api/internal/logic/monitoring/alertruleslogic.go +++ b/api/internal/logic/monitoring/alertruleslogic.go @@ -23,10 +23,10 @@ func NewAlertRulesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AlertR } } -func (l *AlertRulesLogic) AlertRules() (resp *types.AlertRulesResp, err error) { +func (l *AlertRulesLogic) AlertRules(req *types.AlertRulesReq) (resp *types.AlertRulesResp, err error) { resp = &types.AlertRulesResp{} var alertRules []types.AlertRule - l.svcCtx.DbEngin.Raw("SELECT ar.id,ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id").Scan(&alertRules) + l.svcCtx.DbEngin.Raw("SELECT ar.id,ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.alert_type = ? AND ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id", req.AlertType).Scan(&alertRules) resp.AlertRules = alertRules return resp, nil } diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 45d56633..5a095435 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -55,6 +55,7 @@ type ClusterLoadRecord struct { DiskAvail float64 `json:"diskAvail"` DiskTotal float64 `json:"diskTotal"` DiskUtilisation float64 `json:"diskUtilisation"` + PodsUtilisation float64 `json:"podsUtilisation"` } type GetClusterListReq struct { @@ -5444,6 +5445,10 @@ type CreateAlertRuleReq struct { AlertType string `json:"alertType"` } +type AlertRulesReq struct { + AlertType string `form:"alertType"` +} + type AlertRulesResp struct { AlertRules []AlertRule `json:"alertRules"` } @@ -5452,6 +5457,7 @@ type AlertRule struct { Id int64 `json:"id"` ClusterName string `json:"clusterName"` Name string `json:"name"` + AlertType string `json:"alertType"` PromQL string `json:"promQL"` Duration string `json:"duration"` Annotations string `json:"annotations"` @@ -5469,6 +5475,10 @@ type NodesLoadTopResp struct { Msg string `json:"msg"` } +type AlertListReq struct { + AlertType string `form:"alertType"` +} + type AlertListResp struct { AlertMap map[string]interface{} `json:"alertMap"` } diff --git a/pkg/tracker/promql.go b/pkg/tracker/promql.go index eef13139..6d8e5662 100644 --- a/pkg/tracker/promql.go +++ b/pkg/tracker/promql.go @@ -27,9 +27,9 @@ var promQLTemplates = map[string]string{ "cluster_cpu_total": "cluster_cpu_total{$1}", "cluster_memory_total": "cluster_memory_total{$1}", "cluster_disk_total": "cluster_disk_total{$1}", - "cluster_cpu_avail": "cluster_cpu_total{$1}", - "cluster_memory_avail": "cluster_memory_total{$1}", - "cluster_disk_avail": "cluster_disk_total{$1}", + "cluster_cpu_avail": "cluster_cpu_avail{$1}", + "cluster_memory_avail": "cluster_memory_avail{$1}", + "cluster_disk_avail": "cluster_disk_avail{$1}", // center "center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})", diff --git a/pkg/tracker/tracker.go b/pkg/tracker/tracker.go index b9b0058e..13796e1b 100644 --- a/pkg/tracker/tracker.go +++ b/pkg/tracker/tracker.go @@ -66,6 +66,10 @@ var ( Name: "cluster_disk_total", Help: "Cluster Disk Total.", }, []string{"cluster_name", "adapter_id"}) + ClusterPodUtilisationGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_pod_utilisation", + Help: "Cluster Pod Utilisation.", + }, []string{"cluster_name", "adapter_id"}) metrics = []prometheus.Collector{ ClusterCpuUtilisationGauge, @@ -77,6 +81,7 @@ var ( ClusterDiskUtilisationGauge, ClusterDiskAvailGauge, ClusterDiskTotalGauge, + ClusterPodUtilisationGauge, } ) @@ -275,7 +280,3 @@ func (p Prometheus) GetRawData(expr string, o QueryOption) (model.Value, error) } return value, nil } - -func AddAlertRule() { - -} From 79749b3ed71ee15db3f5d4e87724d6ba870bfd4c Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Thu, 18 Apr 2024 15:54:32 +0800 Subject: [PATCH 4/6] alert list Former-commit-id: 454efc36db52ba10f611d5624aa651015801ed28 --- api/desc/monitoring/pcm-monitoring.api | 10 ++++- api/desc/pcm.api | 4 ++ .../monitoring/syncclusteralerthandler.go | 28 ++++++++++++ api/internal/handler/routes.go | 5 +++ .../logic/monitoring/alertlistlogic.go | 44 +++++++++---------- .../logic/monitoring/clustersloadlogic.go | 2 +- .../logic/monitoring/syncclusteralertlogic.go | 44 +++++++++++++++++++ api/internal/svc/servicecontext.go | 1 + api/internal/types/types.go | 8 +++- pkg/tracker/promql.go | 1 + 10 files changed, 119 insertions(+), 28 deletions(-) create mode 100644 api/internal/handler/monitoring/syncclusteralerthandler.go create mode 100644 api/internal/logic/monitoring/syncclusteralertlogic.go diff --git a/api/desc/monitoring/pcm-monitoring.api b/api/desc/monitoring/pcm-monitoring.api index bcb2d982..a70f3bf5 100644 --- a/api/desc/monitoring/pcm-monitoring.api +++ b/api/desc/monitoring/pcm-monitoring.api @@ -1,7 +1,7 @@ syntax = "v1" type CreateAlertRuleReq { - CLusterId int64 `json:"clusterId"` + CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` Name string `json:"name"` PromQL string `json:"promQL"` @@ -46,8 +46,14 @@ type ( type ( alertListReq { alertType string `form:"alertType"` + adapterId string `form:"adapterId,optional"` + clusterId string `form:"clusterId,optional"` } alertListResp { alertMap map[string]interface{} `json:"alertMap"` } -) \ No newline at end of file +) + +type SyncClusterAlertReq { + AlertRecordsMap map[string]interface{} `json:"alertRecordsMap"` +} \ No newline at end of file diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 84956825..9180680e 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -969,4 +969,8 @@ service pcm { @doc "alert list" @handler alertListHandler get /monitoring/alert/list (alertListReq) returns (alertListResp) + + @doc "Synchronize Cluster alert Information" + @handler syncClusterAlertHandler + post /core/syncClusterAlert (SyncClusterAlertReq) } \ No newline at end of file diff --git a/api/internal/handler/monitoring/syncclusteralerthandler.go b/api/internal/handler/monitoring/syncclusteralerthandler.go new file mode 100644 index 00000000..8aac80f6 --- /dev/null +++ b/api/internal/handler/monitoring/syncclusteralerthandler.go @@ -0,0 +1,28 @@ +package monitoring + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func SyncClusterAlertHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.SyncClusterAlertReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := monitoring.NewSyncClusterAlertLogic(r.Context(), svcCtx) + err := l.SyncClusterAlert(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.Ok(w) + } + } +} diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 56d6c5ec..810a8957 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1217,6 +1217,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/monitoring/alert/list", Handler: monitoring.AlertListHandler(serverCtx), }, + { + Method: http.MethodPost, + Path: "/core/syncClusterAlert", + Handler: monitoring.SyncClusterAlertHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/logic/monitoring/alertlistlogic.go b/api/internal/logic/monitoring/alertlistlogic.go index fed39a58..e7d7a665 100644 --- a/api/internal/logic/monitoring/alertlistlogic.go +++ b/api/internal/logic/monitoring/alertlistlogic.go @@ -2,9 +2,9 @@ package monitoring import ( "context" - "github.com/pkg/errors" + "fmt" v1 "github.com/prometheus/client_golang/api/prometheus/v1" - tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" + "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" @@ -33,31 +33,27 @@ type AlertListResp struct { } func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertListResp, err error) { - // todo: add your logic here and delete this line - resp = &types.AlertListResp{} + resp = &types.AlertListResp{ + AlertMap: make(map[string]interface{}), + } // query server http url. - var serverArray []string - l.svcCtx.DbEngin.Raw("select ta.server from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ?", req.AlertType).Scan(&serverArray) + var clusterArray []string - result := make(map[string][]*v1.Alert) - for _, server := range serverArray { - alertListResp := AlertListResp{} - response, err := l.svcCtx.HttpClient.R(). - SetResult(&alertListResp). - ForceContentType("application/json"). - Get(server + "/api/v1/alert/rule/list") - if err != nil { - logx.Error(response) - return nil, err - } - if response.IsError() { - return nil, errors.New(response.String()) - } - for k, v := range alertListResp.Data { - result[k] = v - } + sql := "select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = ?" + if len(req.AdapterId) > 0 { + sql = fmt.Sprintf("select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = %s", req.AdapterId) + } + if len(req.ClusterId) > 0 { + sql = fmt.Sprintf("select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and tc.id = %s", req.ClusterId) + } + l.svcCtx.DbEngin.Raw(sql, req.AlertType).Scan(&clusterArray) + + for _, clusterName := range clusterArray { + getResult := l.svcCtx.RedisClient.Get(l.ctx, clusterName) + var alerts []v1.Alert + json.Unmarshal([]byte(getResult.Val()), &alerts) + resp.AlertMap[clusterName] = alerts } - tool.Convert(result, &resp.AlertMap) return resp, nil } diff --git a/api/internal/logic/monitoring/clustersloadlogic.go b/api/internal/logic/monitoring/clustersloadlogic.go index ed2a4d07..184cd5ed 100644 --- a/api/internal/logic/monitoring/clustersloadlogic.go +++ b/api/internal/logic/monitoring/clustersloadlogic.go @@ -27,7 +27,7 @@ func NewClustersLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Clus func (l *ClustersLoadLogic) ClustersLoad(req *types.ClustersLoadReq) (resp *types.ClustersLoadResp, err error) { resp = &types.ClustersLoadResp{} - metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total"} + metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation"} result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: req.ClusterName}) resp.Data = result return resp, nil diff --git a/api/internal/logic/monitoring/syncclusteralertlogic.go b/api/internal/logic/monitoring/syncclusteralertlogic.go new file mode 100644 index 00000000..11e8eef3 --- /dev/null +++ b/api/internal/logic/monitoring/syncclusteralertlogic.go @@ -0,0 +1,44 @@ +package monitoring + +import ( + "context" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "k8s.io/apimachinery/pkg/util/json" + "time" + + "github.com/zeromicro/go-zero/core/logx" +) + +type SyncClusterAlertLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewSyncClusterAlertLogic(ctx context.Context, svcCtx *svc.ServiceContext) *SyncClusterAlertLogic { + return &SyncClusterAlertLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *SyncClusterAlertLogic) SyncClusterAlert(req *types.SyncClusterAlertReq) error { + + if len(req.AlertRecordsMap) != 0 { + for k, v := range req.AlertRecordsMap { + bytes, err := json.Marshal(v) + if err != nil { + return err + } + setCmd := l.svcCtx.RedisClient.Set(l.ctx, k, bytes, 1*time.Minute) + if setCmd.Err() != nil { + logx.Error(setCmd.Err()) + } + + } + } + + return nil +} diff --git a/api/internal/svc/servicecontext.go b/api/internal/svc/servicecontext.go index ee6fc50f..dfd3f102 100644 --- a/api/internal/svc/servicecontext.go +++ b/api/internal/svc/servicecontext.go @@ -72,6 +72,7 @@ func NewServiceContext(c config.Config) *ServiceContext { panic("InitSnowflake err") } httpClient := resty.New() + httpClient.SetTimeout(1 * time.Second) alertClient := tracker.NewAlertClient(c.Monitoring.AlertUrl) if err != nil { logx.Errorf("InitPrometheus err: %v", err) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index da9a25ac..727b1eaf 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5453,7 +5453,7 @@ type PushResourceInfoReq struct { } type CreateAlertRuleReq struct { - CLusterId int64 `json:"clusterId"` + CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` Name string `json:"name"` PromQL string `json:"promQL"` @@ -5495,8 +5495,14 @@ type NodesLoadTopResp struct { type AlertListReq struct { AlertType string `form:"alertType"` + AdapterId string `form:"adapterId,optional"` + ClusterId string `form:"clusterId,optional"` } type AlertListResp struct { AlertMap map[string]interface{} `json:"alertMap"` } + +type SyncClusterAlertReq struct { + AlertRecordsMap map[string]interface{} `json:"alertRecordsMap"` +} diff --git a/pkg/tracker/promql.go b/pkg/tracker/promql.go index 6d8e5662..ce84be43 100644 --- a/pkg/tracker/promql.go +++ b/pkg/tracker/promql.go @@ -30,6 +30,7 @@ var promQLTemplates = map[string]string{ "cluster_cpu_avail": "cluster_cpu_avail{$1}", "cluster_memory_avail": "cluster_memory_avail{$1}", "cluster_disk_avail": "cluster_disk_avail{$1}", + "cluster_pod_utilisation": "cluster_pod_utilisation{$1}", // center "center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})", From e07d030f7b695e75011c2958c10076dee8212f6e Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Fri, 19 Apr 2024 16:53:02 +0800 Subject: [PATCH 5/6] Sync Cluster Load Former-commit-id: 0eac6041b3b479dbc541d07c9367c5e55a1c2ebf --- api/desc/core/pcm-core.api | 42 +-- .../logic/core/syncclusterloadlogic.go | 2 + .../logic/monitoring/alertlistlogic.go | 15 +- api/internal/types/types.go | 354 ++++++++++-------- pkg/tracker/tracker.go | 10 + 5 files changed, 227 insertions(+), 196 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 07c0e74c..b661712c 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -52,35 +52,21 @@ type ( clusterLoadRecords []ClusterLoadRecord `json:"clusterLoadRecords"` } ClusterLoadRecord { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - PodsUtilisation float64 `json:"podsUtilisation"` + AdapterId int64 `json:"adapterId,optional"` + ClusterName string `json:"clusterName,optional"` + CpuAvail float64 `json:"cpuAvail,optional"` + CpuTotal float64 `json:"cpuTotal,optional"` + CpuUtilisation float64 `json:"cpuUtilisation,optional"` + MemoryAvail float64 `json:"memoryAvail,optional"` + MemoryUtilisation float64 `json:"memoryUtilisation,optional"` + MemoryTotal float64 `json:"memoryTotal,optional"` + DiskAvail float64 `json:"diskAvail,optional"` + DiskTotal float64 `json:"diskTotal,optional"` + DiskUtilisation float64 `json:"diskUtilisation,optional"` + PodsUtilisation float64 `json:"podsUtilisation,optional"` + PodsCount int64 `json:"podsCount,optional"` + PodsTotal int64 `json:"podsTotal,optional"` } - syncClusterLoadReq { - ClusterLoadRecords []ClusterLoadRecord `json:"clusterLoadRecords"` - } - ClusterLoadRecord { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - } ) type ( diff --git a/api/internal/logic/core/syncclusterloadlogic.go b/api/internal/logic/core/syncclusterloadlogic.go index 971f8fd0..abad70e5 100644 --- a/api/internal/logic/core/syncclusterloadlogic.go +++ b/api/internal/logic/core/syncclusterloadlogic.go @@ -40,6 +40,8 @@ func (l *SyncClusterLoadLogic) SyncClusterLoad(req *types.SyncClusterLoadReq) er tracker.ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal) tracker.ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation) + tracker.ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount)) + tracker.ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal)) } } return nil diff --git a/api/internal/logic/monitoring/alertlistlogic.go b/api/internal/logic/monitoring/alertlistlogic.go index e7d7a665..099e72d7 100644 --- a/api/internal/logic/monitoring/alertlistlogic.go +++ b/api/internal/logic/monitoring/alertlistlogic.go @@ -4,10 +4,9 @@ import ( "context" "fmt" v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "k8s.io/apimachinery/pkg/util/json" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" ) @@ -40,7 +39,7 @@ func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertLi // query server http url. var clusterArray []string - sql := "select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = ?" + sql := "select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ?" if len(req.AdapterId) > 0 { sql = fmt.Sprintf("select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = %s", req.AdapterId) } @@ -51,9 +50,13 @@ func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertLi for _, clusterName := range clusterArray { getResult := l.svcCtx.RedisClient.Get(l.ctx, clusterName) - var alerts []v1.Alert - json.Unmarshal([]byte(getResult.Val()), &alerts) - resp.AlertMap[clusterName] = alerts + if len(getResult.Val()) != 0 { + var alerts []v1.Alert + json.Unmarshal([]byte(getResult.Val()), &alerts) + + resp.AlertMap[clusterName] = alerts + } + } return resp, nil } diff --git a/api/internal/types/types.go b/api/internal/types/types.go index d36629e4..ee39cb00 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -44,18 +44,20 @@ type SyncClusterLoadReq struct { } type ClusterLoadRecord struct { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - PodsUtilisation float64 `json:"podsUtilisation"` + AdapterId int64 `json:"adapterId,optional"` + ClusterName string `json:"clusterName,optional"` + CpuAvail float64 `json:"cpuAvail,optional"` + CpuTotal float64 `json:"cpuTotal,optional"` + CpuUtilisation float64 `json:"cpuUtilisation,optional"` + MemoryAvail float64 `json:"memoryAvail,optional"` + MemoryUtilisation float64 `json:"memoryUtilisation,optional"` + MemoryTotal float64 `json:"memoryTotal,optional"` + DiskAvail float64 `json:"diskAvail,optional"` + DiskTotal float64 `json:"diskTotal,optional"` + DiskUtilisation float64 `json:"diskUtilisation,optional"` + PodsUtilisation float64 `json:"podsUtilisation,optional"` + PodsCount int64 `json:"podsCount,optional"` + PodsTotal int64 `json:"podsTotal,optional"` } type GetClusterListReq struct { @@ -848,6 +850,184 @@ type PageResult struct { PageSize int `json:"pageSize,omitempty"` } +type HpcInfo struct { + Id int64 `json:"id"` // id + TaskId int64 `json:"task_id"` // 任务id + JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id + ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + ClusterType string `json:"cluster_type"` // 执行任务的集群类型 + Name string `json:"name"` // 名称 + Status string `json:"status"` // 状态 + CmdScript string `json:"cmd_script"` + StartTime string `json:"start_time"` // 开始时间 + RunningTime int64 `json:"running_time"` // 运行时间 + DerivedEs string `json:"derived_es"` + Cluster string `json:"cluster"` + BlockId int64 `json:"block_id"` + AllocNodes int64 `json:"alloc_nodes"` + AllocCpu int64 `json:"alloc_cpu"` + CardCount int64 `json:"card_count"` // 卡数 + Version string `json:"version"` + Account string `json:"account"` + WorkDir string `json:"work_dir"` // 工作路径 + AssocId int64 `json:"assoc_id"` + ExitCode int64 `json:"exit_code"` + WallTime string `json:"wall_time"` // 最大运行时间 + Result string `json:"result"` // 运行结果 + DeletedAt string `json:"deleted_at"` // 删除时间 + YamlString string `json:"yaml_string"` + AppType string `json:"app_type"` // 应用类型 + AppName string `json:"app_name"` // 应用名称 + Queue string `json:"queue"` // 队列名称 + SubmitType string `json:"submit_type"` // cmd(命令行模式) + NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") + StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j + StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j + StdInput string `json:"std_input"` + Environment string `json:"environment"` + DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) + CreatedBy int64 `json:"created_by"` // 创建人 + CreatedTime string `json:"created_time"` // 创建时间 + UpdatedBy int64 `json:"updated_by"` // 更新人 + UpdatedTime string `json:"updated_time"` // 更新时间 +} + +type CloudInfo struct { + Participant int64 `json:"participant,omitempty"` + Id int64 `json:"id,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ApiVersion string `json:"apiVersion,omitempty"` + Kind string `json:"kind,omitempty"` + Namespace string `json:"namespace,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + YamlString string `json:"yamlString,omitempty"` +} + +type AiInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ProjectId string `json:"project_id,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + JobId string `json:"jobId,omitempty"` + CreateTime string `json:"createTime,omitempty"` + ImageUrl string `json:"imageUrl,omitempty"` + Command string `json:"command,omitempty"` + FlavorId string `json:"flavorId,omitempty"` + SubscriptionId string `json:"subscriptionId,omitempty"` + ItemVersionId string `json:"itemVersionId,omitempty"` +} + +type VmInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + Name string `json:"name,omitempty"` + FlavorRef string `json:"flavor_ref,omitempty"` + ImageRef string `json:"image_ref,omitempty"` + NetworkUuid string `json:"network_uuid,omitempty"` + BlockUuid string `json:"block_uuid,omitempty"` + SourceType string `json:"source_type,omitempty"` + DeleteOnTermination bool `json:"delete_on_termination,omitempty"` + Status string `json:"status,omitempty"` + MinCount string `json:"min_count,omitempty"` + Platform string `json:"platform,omitempty"` + Uuid string `json:"uuid,omitempty"` +} + +type PullTaskInfoReq struct { + AdapterId int64 `form:"adapterId"` +} + +type PullTaskInfoResp struct { + HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` + CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` + AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` + VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` +} + +type PushTaskInfoReq struct { + AdapterId int64 `json:"adapterId"` + HpcInfoList []*HpcInfo `json:"hpcInfoList"` + CloudInfoList []*CloudInfo `json:"cloudInfoList"` + AiInfoList []*AiInfo `json:"aiInfoList"` + VmInfoList []*VmInfo `json:"vmInfoList"` +} + +type PushTaskInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type PushResourceInfoReq struct { + AdapterId int64 `json:"adapterId"` + ResourceStats []ResourceStats `json:"resourceStats"` +} + +type PushResourceInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type NoticeInfo struct { + AdapterId int64 `json:"adapterId"` + AdapterName string `json:"adapterName"` + ClusterId int64 `json:"clusterId"` + ClusterName string `json:"clusterName"` + NoticeType string `json:"noticeType"` + TaskName string `json:"taskName"` + Incident string `json:"incident"` +} + +type ListNoticeReq struct { +} + +type ListNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` + Data []NoticeInfo `json:"data"` +} + +type PushNoticeReq struct { + NoticeInfo NoticeInfo `json:"noticeInfo"` +} + +type PushNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type ResourceStats struct { + ClusterId int64 `json:"clusterId"` + Name string `json:"name"` + CpuCoreAvail int64 `json:"cpuCoreAvail"` + CpuCoreTotal int64 `json:"cpuCoreTotal"` + MemAvail float64 `json:"memAvail"` + MemTotal float64 `json:"memTotal"` + DiskAvail float64 `json:"diskAvail"` + DiskTotal float64 `json:"diskTotal"` + GpuAvail int64 `json:"gpuAvail"` + CardsAvail []*Card `json:"cardsAvail"` + CpuCoreHours float64 `json:"cpuCoreHours"` + Balance float64 `json:"balance"` +} + +type Card struct { + Platform string `json:"platform"` + Type string `json:"type"` + Name string `json:"name"` + TOpsAtFp16 float64 `json:"TOpsAtFp16"` + CardHours float64 `json:"cardHours"` + CardNum int32 `json:"cardNum"` +} + type CommitHpcTaskReq struct { Name string `json:"name"` // paratera:jobName Description string `json:"description,optional"` @@ -5324,156 +5504,6 @@ type AiAlgorithmsResp struct { Algorithms []string `json:"algorithms"` } -type PullTaskInfoReq struct { - AdapterId int64 `form:"adapterId"` -} - -type PullTaskInfoResp struct { - HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` - CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` - AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` - VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` -} - -type HpcInfo struct { - Id int64 `json:"id"` // id - TaskId int64 `json:"task_id"` // 任务id - JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) - AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `json:"cluster_id"` // 执行任务的集群id - ClusterType string `json:"cluster_type"` // 执行任务的集群类型 - Name string `json:"name"` // 名称 - Status string `json:"status"` // 状态 - CmdScript string `json:"cmd_script"` - StartTime string `json:"start_time"` // 开始时间 - RunningTime int64 `json:"running_time"` // 运行时间 - DerivedEs string `json:"derived_es"` - Cluster string `json:"cluster"` - BlockId int64 `json:"block_id"` - AllocNodes int64 `json:"alloc_nodes"` - AllocCpu int64 `json:"alloc_cpu"` - CardCount int64 `json:"card_count"` // 卡数 - Version string `json:"version"` - Account string `json:"account"` - WorkDir string `json:"work_dir"` // 工作路径 - AssocId int64 `json:"assoc_id"` - ExitCode int64 `json:"exit_code"` - WallTime string `json:"wall_time"` // 最大运行时间 - Result string `json:"result"` // 运行结果 - DeletedAt string `json:"deleted_at"` // 删除时间 - YamlString string `json:"yaml_string"` - AppType string `json:"app_type"` // 应用类型 - AppName string `json:"app_name"` // 应用名称 - Queue string `json:"queue"` // 队列名称 - SubmitType string `json:"submit_type"` // cmd(命令行模式) - NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") - StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j - StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j - StdInput string `json:"std_input"` - Environment string `json:"environment"` - DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) - CreatedBy int64 `json:"created_by"` // 创建人 - CreatedTime string `json:"created_time"` // 创建时间 - UpdatedBy int64 `json:"updated_by"` // 更新人 - UpdatedTime string `json:"updated_time"` // 更新时间 -} - -type CloudInfo struct { - Participant int64 `json:"participant,omitempty"` - Id int64 `json:"id,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ApiVersion string `json:"apiVersion,omitempty"` - Kind string `json:"kind,omitempty"` - Namespace string `json:"namespace,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - YamlString string `json:"yamlString,omitempty"` -} - -type AiInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ProjectId string `json:"project_id,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - JobId string `json:"jobId,omitempty"` - CreateTime string `json:"createTime,omitempty"` - ImageUrl string `json:"imageUrl,omitempty"` - Command string `json:"command,omitempty"` - FlavorId string `json:"flavorId,omitempty"` - SubscriptionId string `json:"subscriptionId,omitempty"` - ItemVersionId string `json:"itemVersionId,omitempty"` -} - -type VmInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - Name string `json:"name,omitempty"` - FlavorRef string `json:"flavor_ref,omitempty"` - ImageRef string `json:"image_ref,omitempty"` - NetworkUuid string `json:"network_uuid,omitempty"` - BlockUuid string `json:"block_uuid,omitempty"` - SourceType string `json:"source_type,omitempty"` - DeleteOnTermination bool `json:"delete_on_termination,omitempty"` - Status string `json:"status,omitempty"` - MinCount string `json:"min_count,omitempty"` - Platform string `json:"platform,omitempty"` - Uuid string `json:"uuid,omitempty"` -} - -type PushTaskInfoReq struct { - AdapterId int64 `json:"adapterId"` - HpcInfoList []*HpcInfo `json:"hpcInfoList"` - CloudInfoList []*CloudInfo `json:"cloudInfoList"` - AiInfoList []*AiInfo `json:"aiInfoList"` - VmInfoList []*VmInfo `json:"vmInfoList"` -} - -type PushTaskInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type PushResourceInfoReq struct { - AdapterId int64 `json:"adapterId"` - ResourceStats []ResourceStats `json:"resourceStats"` -} - -type PushResourceInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type ResourceStats struct { - ClusterId int64 `json:"clusterId"` - Name string `json:"name"` - CpuCoreAvail int64 `json:"cpuCoreAvail"` - CpuCoreTotal int64 `json:"cpuCoreTotal"` - MemAvail float64 `json:"memAvail"` - MemTotal float64 `json:"memTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - GpuAvail int64 `json:"gpuAvail"` - CardsAvail []*Card `json:"cardsAvail"` - CpuCoreHours float64 `json:"cpuCoreHours"` - Balance float64 `json:"balance"` -} - -type Card struct { - Platform string `json:"platform"` - Type string `json:"type"` - Name string `json:"name"` - TOpsAtFp16 float64 `json:"TOpsAtFp16"` - CardHours float64 `json:"cardHours"` - CardNum int32 `json:"cardNum"` -} - type CreateAlertRuleReq struct { CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` diff --git a/pkg/tracker/tracker.go b/pkg/tracker/tracker.go index 13796e1b..fdc094c0 100644 --- a/pkg/tracker/tracker.go +++ b/pkg/tracker/tracker.go @@ -70,6 +70,14 @@ var ( Name: "cluster_pod_utilisation", Help: "Cluster Pod Utilisation.", }, []string{"cluster_name", "adapter_id"}) + ClusterPodCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_pod_count", + Help: "Cluster Pod Count.", + }, []string{"cluster_name", "adapter_id"}) + ClusterPodTotalGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_pod_total", + Help: "Cluster Pod total.", + }, []string{"cluster_name", "adapter_id"}) metrics = []prometheus.Collector{ ClusterCpuUtilisationGauge, @@ -82,6 +90,8 @@ var ( ClusterDiskAvailGauge, ClusterDiskTotalGauge, ClusterPodUtilisationGauge, + ClusterPodCountGauge, + ClusterPodTotalGauge, } ) From 8ea6d19eb4fb1030dc2d3b2ea2a58585574b8d00 Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Fri, 19 Apr 2024 16:54:52 +0800 Subject: [PATCH 6/6] merge Former-commit-id: 7a02b3ac98c0e725bf0ea601112067dc9903e36c --- api/internal/types/types.go | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index ce180789..a0c697dd 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -82,6 +82,26 @@ type Region struct { RunningJobs int64 `json:"runningJobs"` } +type GeneralTaskReq struct { + Name string `json:"name"` + ComputeType string `json:"computeType"` + TemplateId string `json:"templateId"` + AdapterId string `json:"adapterId"` + ClusterIds []string `json:"clusterIds"` + Strategy Strategy `json:"strategy"` + ReqBody []string `json:"reqBody"` +} + +type Strategy struct { + Name string `json:"name"` + StaticWeightList []StaticWeightList `json:"staticWeightList"` +} + +type StaticWeightList struct { + ClusterName string `json:"clusterName"` + Weight int `json:"weight"` +} + type DeleteTaskReq struct { Id int64 `path:"id"` } @@ -1116,9 +1136,9 @@ type HpcResourceReq struct { } type HpcResourceResp struct { - Code int32 `json:"code"` - Msg string `json:"msg"` - Data HPCResource `json:"data"` + Code int32 `json:"code"` + Msg string `json:"msg"` + HPCResource HPCResource `json:"hpcResource"` } type HPCResource struct {