From e07d030f7b695e75011c2958c10076dee8212f6e Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Fri, 19 Apr 2024 16:53:02 +0800 Subject: [PATCH] Sync Cluster Load Former-commit-id: 0eac6041b3b479dbc541d07c9367c5e55a1c2ebf --- api/desc/core/pcm-core.api | 42 +-- .../logic/core/syncclusterloadlogic.go | 2 + .../logic/monitoring/alertlistlogic.go | 15 +- api/internal/types/types.go | 354 ++++++++++-------- pkg/tracker/tracker.go | 10 + 5 files changed, 227 insertions(+), 196 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 07c0e74c..b661712c 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -52,35 +52,21 @@ type ( clusterLoadRecords []ClusterLoadRecord `json:"clusterLoadRecords"` } ClusterLoadRecord { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - PodsUtilisation float64 `json:"podsUtilisation"` + AdapterId int64 `json:"adapterId,optional"` + ClusterName string `json:"clusterName,optional"` + CpuAvail float64 `json:"cpuAvail,optional"` + CpuTotal float64 `json:"cpuTotal,optional"` + CpuUtilisation float64 `json:"cpuUtilisation,optional"` + MemoryAvail float64 `json:"memoryAvail,optional"` + MemoryUtilisation float64 `json:"memoryUtilisation,optional"` + MemoryTotal float64 `json:"memoryTotal,optional"` + DiskAvail float64 `json:"diskAvail,optional"` + DiskTotal float64 `json:"diskTotal,optional"` + DiskUtilisation float64 `json:"diskUtilisation,optional"` + PodsUtilisation float64 `json:"podsUtilisation,optional"` + PodsCount int64 `json:"podsCount,optional"` + PodsTotal int64 `json:"podsTotal,optional"` } - syncClusterLoadReq { - ClusterLoadRecords []ClusterLoadRecord `json:"clusterLoadRecords"` - } - ClusterLoadRecord { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - } ) type ( diff --git a/api/internal/logic/core/syncclusterloadlogic.go b/api/internal/logic/core/syncclusterloadlogic.go index 971f8fd0..abad70e5 100644 --- a/api/internal/logic/core/syncclusterloadlogic.go +++ b/api/internal/logic/core/syncclusterloadlogic.go @@ -40,6 +40,8 @@ func (l *SyncClusterLoadLogic) SyncClusterLoad(req *types.SyncClusterLoadReq) er tracker.ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal) tracker.ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation) + tracker.ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount)) + tracker.ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal)) } } return nil diff --git a/api/internal/logic/monitoring/alertlistlogic.go b/api/internal/logic/monitoring/alertlistlogic.go index e7d7a665..099e72d7 100644 --- a/api/internal/logic/monitoring/alertlistlogic.go +++ b/api/internal/logic/monitoring/alertlistlogic.go @@ -4,10 +4,9 @@ import ( "context" "fmt" v1 "github.com/prometheus/client_golang/api/prometheus/v1" - "k8s.io/apimachinery/pkg/util/json" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" ) @@ -40,7 +39,7 @@ func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertLi // query server http url. var clusterArray []string - sql := "select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = ?" + sql := "select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ?" if len(req.AdapterId) > 0 { sql = fmt.Sprintf("select distinct tc.name from t_adapter ta,t_cluster tc where ta.id = tc.adapter_id and label = 'kubernetes' and ta.type = ? and ta.id = %s", req.AdapterId) } @@ -51,9 +50,13 @@ func (l *AlertListLogic) AlertList(req *types.AlertListReq) (resp *types.AlertLi for _, clusterName := range clusterArray { getResult := l.svcCtx.RedisClient.Get(l.ctx, clusterName) - var alerts []v1.Alert - json.Unmarshal([]byte(getResult.Val()), &alerts) - resp.AlertMap[clusterName] = alerts + if len(getResult.Val()) != 0 { + var alerts []v1.Alert + json.Unmarshal([]byte(getResult.Val()), &alerts) + + resp.AlertMap[clusterName] = alerts + } + } return resp, nil } diff --git a/api/internal/types/types.go b/api/internal/types/types.go index d36629e4..ee39cb00 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -44,18 +44,20 @@ type SyncClusterLoadReq struct { } type ClusterLoadRecord struct { - AdapterId int64 `json:"adapterId"` - ClusterName string `json:"clusterName"` - CpuAvail float64 `json:"cpuAvail"` - CpuTotal float64 `json:"cpuTotal"` - CpuUtilisation float64 `json:"cpuUtilisation"` - MemoryAvail float64 `json:"memoryAvail"` - MemoryUtilisation float64 `json:"memoryUtilisation"` - MemoryTotal float64 `json:"memoryTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - DiskUtilisation float64 `json:"diskUtilisation"` - PodsUtilisation float64 `json:"podsUtilisation"` + AdapterId int64 `json:"adapterId,optional"` + ClusterName string `json:"clusterName,optional"` + CpuAvail float64 `json:"cpuAvail,optional"` + CpuTotal float64 `json:"cpuTotal,optional"` + CpuUtilisation float64 `json:"cpuUtilisation,optional"` + MemoryAvail float64 `json:"memoryAvail,optional"` + MemoryUtilisation float64 `json:"memoryUtilisation,optional"` + MemoryTotal float64 `json:"memoryTotal,optional"` + DiskAvail float64 `json:"diskAvail,optional"` + DiskTotal float64 `json:"diskTotal,optional"` + DiskUtilisation float64 `json:"diskUtilisation,optional"` + PodsUtilisation float64 `json:"podsUtilisation,optional"` + PodsCount int64 `json:"podsCount,optional"` + PodsTotal int64 `json:"podsTotal,optional"` } type GetClusterListReq struct { @@ -848,6 +850,184 @@ type PageResult struct { PageSize int `json:"pageSize,omitempty"` } +type HpcInfo struct { + Id int64 `json:"id"` // id + TaskId int64 `json:"task_id"` // 任务id + JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id + ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + ClusterType string `json:"cluster_type"` // 执行任务的集群类型 + Name string `json:"name"` // 名称 + Status string `json:"status"` // 状态 + CmdScript string `json:"cmd_script"` + StartTime string `json:"start_time"` // 开始时间 + RunningTime int64 `json:"running_time"` // 运行时间 + DerivedEs string `json:"derived_es"` + Cluster string `json:"cluster"` + BlockId int64 `json:"block_id"` + AllocNodes int64 `json:"alloc_nodes"` + AllocCpu int64 `json:"alloc_cpu"` + CardCount int64 `json:"card_count"` // 卡数 + Version string `json:"version"` + Account string `json:"account"` + WorkDir string `json:"work_dir"` // 工作路径 + AssocId int64 `json:"assoc_id"` + ExitCode int64 `json:"exit_code"` + WallTime string `json:"wall_time"` // 最大运行时间 + Result string `json:"result"` // 运行结果 + DeletedAt string `json:"deleted_at"` // 删除时间 + YamlString string `json:"yaml_string"` + AppType string `json:"app_type"` // 应用类型 + AppName string `json:"app_name"` // 应用名称 + Queue string `json:"queue"` // 队列名称 + SubmitType string `json:"submit_type"` // cmd(命令行模式) + NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") + StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j + StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j + StdInput string `json:"std_input"` + Environment string `json:"environment"` + DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) + CreatedBy int64 `json:"created_by"` // 创建人 + CreatedTime string `json:"created_time"` // 创建时间 + UpdatedBy int64 `json:"updated_by"` // 更新人 + UpdatedTime string `json:"updated_time"` // 更新时间 +} + +type CloudInfo struct { + Participant int64 `json:"participant,omitempty"` + Id int64 `json:"id,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ApiVersion string `json:"apiVersion,omitempty"` + Kind string `json:"kind,omitempty"` + Namespace string `json:"namespace,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + YamlString string `json:"yamlString,omitempty"` +} + +type AiInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ProjectId string `json:"project_id,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + JobId string `json:"jobId,omitempty"` + CreateTime string `json:"createTime,omitempty"` + ImageUrl string `json:"imageUrl,omitempty"` + Command string `json:"command,omitempty"` + FlavorId string `json:"flavorId,omitempty"` + SubscriptionId string `json:"subscriptionId,omitempty"` + ItemVersionId string `json:"itemVersionId,omitempty"` +} + +type VmInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + Name string `json:"name,omitempty"` + FlavorRef string `json:"flavor_ref,omitempty"` + ImageRef string `json:"image_ref,omitempty"` + NetworkUuid string `json:"network_uuid,omitempty"` + BlockUuid string `json:"block_uuid,omitempty"` + SourceType string `json:"source_type,omitempty"` + DeleteOnTermination bool `json:"delete_on_termination,omitempty"` + Status string `json:"status,omitempty"` + MinCount string `json:"min_count,omitempty"` + Platform string `json:"platform,omitempty"` + Uuid string `json:"uuid,omitempty"` +} + +type PullTaskInfoReq struct { + AdapterId int64 `form:"adapterId"` +} + +type PullTaskInfoResp struct { + HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` + CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` + AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` + VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` +} + +type PushTaskInfoReq struct { + AdapterId int64 `json:"adapterId"` + HpcInfoList []*HpcInfo `json:"hpcInfoList"` + CloudInfoList []*CloudInfo `json:"cloudInfoList"` + AiInfoList []*AiInfo `json:"aiInfoList"` + VmInfoList []*VmInfo `json:"vmInfoList"` +} + +type PushTaskInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type PushResourceInfoReq struct { + AdapterId int64 `json:"adapterId"` + ResourceStats []ResourceStats `json:"resourceStats"` +} + +type PushResourceInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type NoticeInfo struct { + AdapterId int64 `json:"adapterId"` + AdapterName string `json:"adapterName"` + ClusterId int64 `json:"clusterId"` + ClusterName string `json:"clusterName"` + NoticeType string `json:"noticeType"` + TaskName string `json:"taskName"` + Incident string `json:"incident"` +} + +type ListNoticeReq struct { +} + +type ListNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` + Data []NoticeInfo `json:"data"` +} + +type PushNoticeReq struct { + NoticeInfo NoticeInfo `json:"noticeInfo"` +} + +type PushNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type ResourceStats struct { + ClusterId int64 `json:"clusterId"` + Name string `json:"name"` + CpuCoreAvail int64 `json:"cpuCoreAvail"` + CpuCoreTotal int64 `json:"cpuCoreTotal"` + MemAvail float64 `json:"memAvail"` + MemTotal float64 `json:"memTotal"` + DiskAvail float64 `json:"diskAvail"` + DiskTotal float64 `json:"diskTotal"` + GpuAvail int64 `json:"gpuAvail"` + CardsAvail []*Card `json:"cardsAvail"` + CpuCoreHours float64 `json:"cpuCoreHours"` + Balance float64 `json:"balance"` +} + +type Card struct { + Platform string `json:"platform"` + Type string `json:"type"` + Name string `json:"name"` + TOpsAtFp16 float64 `json:"TOpsAtFp16"` + CardHours float64 `json:"cardHours"` + CardNum int32 `json:"cardNum"` +} + type CommitHpcTaskReq struct { Name string `json:"name"` // paratera:jobName Description string `json:"description,optional"` @@ -5324,156 +5504,6 @@ type AiAlgorithmsResp struct { Algorithms []string `json:"algorithms"` } -type PullTaskInfoReq struct { - AdapterId int64 `form:"adapterId"` -} - -type PullTaskInfoResp struct { - HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` - CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` - AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` - VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` -} - -type HpcInfo struct { - Id int64 `json:"id"` // id - TaskId int64 `json:"task_id"` // 任务id - JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) - AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `json:"cluster_id"` // 执行任务的集群id - ClusterType string `json:"cluster_type"` // 执行任务的集群类型 - Name string `json:"name"` // 名称 - Status string `json:"status"` // 状态 - CmdScript string `json:"cmd_script"` - StartTime string `json:"start_time"` // 开始时间 - RunningTime int64 `json:"running_time"` // 运行时间 - DerivedEs string `json:"derived_es"` - Cluster string `json:"cluster"` - BlockId int64 `json:"block_id"` - AllocNodes int64 `json:"alloc_nodes"` - AllocCpu int64 `json:"alloc_cpu"` - CardCount int64 `json:"card_count"` // 卡数 - Version string `json:"version"` - Account string `json:"account"` - WorkDir string `json:"work_dir"` // 工作路径 - AssocId int64 `json:"assoc_id"` - ExitCode int64 `json:"exit_code"` - WallTime string `json:"wall_time"` // 最大运行时间 - Result string `json:"result"` // 运行结果 - DeletedAt string `json:"deleted_at"` // 删除时间 - YamlString string `json:"yaml_string"` - AppType string `json:"app_type"` // 应用类型 - AppName string `json:"app_name"` // 应用名称 - Queue string `json:"queue"` // 队列名称 - SubmitType string `json:"submit_type"` // cmd(命令行模式) - NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") - StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j - StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j - StdInput string `json:"std_input"` - Environment string `json:"environment"` - DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) - CreatedBy int64 `json:"created_by"` // 创建人 - CreatedTime string `json:"created_time"` // 创建时间 - UpdatedBy int64 `json:"updated_by"` // 更新人 - UpdatedTime string `json:"updated_time"` // 更新时间 -} - -type CloudInfo struct { - Participant int64 `json:"participant,omitempty"` - Id int64 `json:"id,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ApiVersion string `json:"apiVersion,omitempty"` - Kind string `json:"kind,omitempty"` - Namespace string `json:"namespace,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - YamlString string `json:"yamlString,omitempty"` -} - -type AiInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ProjectId string `json:"project_id,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - JobId string `json:"jobId,omitempty"` - CreateTime string `json:"createTime,omitempty"` - ImageUrl string `json:"imageUrl,omitempty"` - Command string `json:"command,omitempty"` - FlavorId string `json:"flavorId,omitempty"` - SubscriptionId string `json:"subscriptionId,omitempty"` - ItemVersionId string `json:"itemVersionId,omitempty"` -} - -type VmInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - Name string `json:"name,omitempty"` - FlavorRef string `json:"flavor_ref,omitempty"` - ImageRef string `json:"image_ref,omitempty"` - NetworkUuid string `json:"network_uuid,omitempty"` - BlockUuid string `json:"block_uuid,omitempty"` - SourceType string `json:"source_type,omitempty"` - DeleteOnTermination bool `json:"delete_on_termination,omitempty"` - Status string `json:"status,omitempty"` - MinCount string `json:"min_count,omitempty"` - Platform string `json:"platform,omitempty"` - Uuid string `json:"uuid,omitempty"` -} - -type PushTaskInfoReq struct { - AdapterId int64 `json:"adapterId"` - HpcInfoList []*HpcInfo `json:"hpcInfoList"` - CloudInfoList []*CloudInfo `json:"cloudInfoList"` - AiInfoList []*AiInfo `json:"aiInfoList"` - VmInfoList []*VmInfo `json:"vmInfoList"` -} - -type PushTaskInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type PushResourceInfoReq struct { - AdapterId int64 `json:"adapterId"` - ResourceStats []ResourceStats `json:"resourceStats"` -} - -type PushResourceInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type ResourceStats struct { - ClusterId int64 `json:"clusterId"` - Name string `json:"name"` - CpuCoreAvail int64 `json:"cpuCoreAvail"` - CpuCoreTotal int64 `json:"cpuCoreTotal"` - MemAvail float64 `json:"memAvail"` - MemTotal float64 `json:"memTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - GpuAvail int64 `json:"gpuAvail"` - CardsAvail []*Card `json:"cardsAvail"` - CpuCoreHours float64 `json:"cpuCoreHours"` - Balance float64 `json:"balance"` -} - -type Card struct { - Platform string `json:"platform"` - Type string `json:"type"` - Name string `json:"name"` - TOpsAtFp16 float64 `json:"TOpsAtFp16"` - CardHours float64 `json:"cardHours"` - CardNum int32 `json:"cardNum"` -} - type CreateAlertRuleReq struct { CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` diff --git a/pkg/tracker/tracker.go b/pkg/tracker/tracker.go index 13796e1b..fdc094c0 100644 --- a/pkg/tracker/tracker.go +++ b/pkg/tracker/tracker.go @@ -70,6 +70,14 @@ var ( Name: "cluster_pod_utilisation", Help: "Cluster Pod Utilisation.", }, []string{"cluster_name", "adapter_id"}) + ClusterPodCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_pod_count", + Help: "Cluster Pod Count.", + }, []string{"cluster_name", "adapter_id"}) + ClusterPodTotalGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cluster_pod_total", + Help: "Cluster Pod total.", + }, []string{"cluster_name", "adapter_id"}) metrics = []prometheus.Collector{ ClusterCpuUtilisationGauge, @@ -82,6 +90,8 @@ var ( ClusterDiskAvailGauge, ClusterDiskTotalGauge, ClusterPodUtilisationGauge, + ClusterPodCountGauge, + ClusterPodTotalGauge, } )