diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api
index e67496da..6affef39 100644
--- a/api/desc/core/pcm-core.api
+++ b/api/desc/core/pcm-core.api
@@ -179,10 +179,12 @@ type (
 		Name        string     `yaml:"name"`
 		Description string     `yaml:"description"`
 		tenantId    int64      `yaml:"tenantId"`
+		nsID        string     `yaml:"nsID"`
 		tasks       []TaskYaml `yaml:"tasks"`
 	}
 	TaskYaml {
 		TaskId        int64             `yaml:"taskId"`
+		nsID          string            `yaml:"nsID"`
 		taskType      string            `yaml:"taskType"`
 		participantId int64             `yaml:"participantId"`
 		matchLabels   map[string]string `yaml:"matchLabels"`
diff --git a/api/internal/logic/cloud/controllermetricslogic.go b/api/internal/logic/cloud/controllermetricslogic.go
index 87c941d8..05a3dbab 100644
--- a/api/internal/logic/cloud/controllermetricslogic.go
+++ b/api/internal/logic/cloud/controllermetricslogic.go
@@ -26,9 +26,8 @@ func NewControllerMetricsLogic(ctx context.Context, svcCtx *svc.ServiceContext)
 
 func (l *ControllerMetricsLogic) ControllerMetrics(req *types.ControllerMetricsReq) (resp *types.ControllerMetricsResp, err error) {
 	resp = &types.ControllerMetricsResp{}
-	metrics := l.svcCtx.PromClient[req.ParticipantId].GetNamedMetricsByTime([]string{"pod_cpu_usage", "pod_memory_usage_wo_cache"}, req.Start, req.End, 10*time.Minute, tracker.ControllerOption{
-		PodsName:  req.Pods,
-		Namespace: req.Namespace,
+	metrics := l.svcCtx.PromClient[req.ParticipantId].GetNamedMetricsByTime(req.Metrics, req.Start, req.End, 10*time.Minute, tracker.ControllerOption{
+		PodsName: req.Pods,
 	})
 	resp.Data = metrics
 	return resp, nil
diff --git a/api/internal/logic/core/scheduletaskbyyamllogic.go b/api/internal/logic/core/scheduletaskbyyamllogic.go
index 10147457..70e5cc1a 100644
--- a/api/internal/logic/core/scheduletaskbyyamllogic.go
+++ b/api/internal/logic/core/scheduletaskbyyamllogic.go
@@ -54,6 +54,7 @@ func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYa
 		Name:       req.Name,
 		YamlString: string(bytes),
 		CommitTime: time.Now(),
+		NsID:       req.NsID,
 	}
 	// Save the task record to the database
 	tx := l.svcCtx.DbEngin.Create(&taskModel)
@@ -63,6 +64,7 @@ func (l *ScheduleTaskByYamlLogic) ScheduleTaskByYaml(req *types.ScheduleTaskByYa
 
 	// Iterate over the subtasks and enqueue them
 	for _, task := range req.Tasks {
+		task.NsID = req.NsID
 		task.TaskId = taskModel.Id
 		// Marshal the task into a message body
 		reqMessage, err := json.Marshal(task)
diff --git a/api/internal/types/types.go b/api/internal/types/types.go
index 0f4e49cc..1503edcc 100644
--- a/api/internal/types/types.go
+++ b/api/internal/types/types.go
@@ -160,11 +160,13 @@ type ScheduleTaskByYamlReq struct {
 	Name        string     `yaml:"name"`
 	Description string     `yaml:"description"`
 	TenantId    int64      `yaml:"tenantId"`
+	NsID        string     `yaml:"nsID"`
 	Tasks       []TaskYaml `yaml:"tasks"`
 }
 
 type TaskYaml struct {
 	TaskId        int64             `yaml:"taskId"`
+	NsID          string            `yaml:"nsID"`
 	TaskType      string            `yaml:"taskType"`
 	ParticipantId int64             `yaml:"participantId"`
 	MatchLabels   map[string]string `yaml:"matchLabels"`
@@ -3326,12 +3328,13 @@ type ShowNodeDetailsResp struct {
 }
 
 type ControllerMetricsReq struct {
-	ParticipantId int64  `form:"participantId"`
-	Namespace     string `form:"namespace"`
-	Pods          string `form:"pods"`
-	Steps         string `form:"steps"`
-	Start         string `form:"start"`
-	End           string `form:"end"`
+	Metrics       []string `form:"metrics"`
+	ParticipantId int64    `form:"participantId"`
+	Namespace     string   `form:"namespace"`
+	Pods          string   `form:"pods"`
+	Steps         string   `form:"steps"`
+	Start         string   `form:"start"`
+	End           string   `form:"end"`
 }
 
 type ControllerMetricsResp struct {
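`ControllerMetricsReq` now carries the template names to evaluate, so callers pick the metrics (for example the new `controller_*_rate` templates added below) instead of the previously hard-coded `pod_cpu_usage` / `pod_memory_usage_wo_cache` pair. A minimal sketch of building such a request follows; the route path is a placeholder, and how go-zero binds a `[]string` form field (repeated parameter vs. comma-separated) should be checked against the handler in use:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	q := url.Values{}
	// Caller-selected templates; names must match keys in promQLTemplates.
	q.Add("metrics", "controller_cpu_usage_rate")
	q.Add("metrics", "controller_memory_usage_rate")
	q.Set("participantId", "1")
	q.Set("pods", "nginx-.*")    // regex, expanded to pod=~"nginx-.*" by the tracker
	q.Set("start", "1700000000") // range bounds are passed through as plain strings
	q.Set("end", "1700003600")

	// Placeholder path for the controller-metrics route.
	fmt.Println("/api/v1/cloud/controllerMetrics?" + q.Encode())
}
```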
diff --git a/api/pkg/response/TaskInfo.go b/api/pkg/response/TaskInfo.go
index 7868fd38..85b60267 100644
--- a/api/pkg/response/TaskInfo.go
+++ b/api/pkg/response/TaskInfo.go
@@ -18,6 +18,7 @@ import "fmt"
 
 type TaskInfo struct {
 	TaskId        int64             `json:"taskId,optional"`
+	NsID          string            `json:"nsID"`
 	TaskType      string            `json:"taskType,optional"`
 	MatchLabels   map[string]string `json:"matchLabels"`
 	ParticipantId int64             `json:"participantId"`
diff --git a/pkg/models/cloudmodel_gen.go b/pkg/models/cloudmodel_gen.go
index 17191649..1a81d6fc 100644
--- a/pkg/models/cloudmodel_gen.go
+++ b/pkg/models/cloudmodel_gen.go
@@ -48,6 +48,7 @@ type (
 		DeletedAt  gorm.DeletedAt `gorm:"index"`
 		YamlString string         `db:"yaml_string"`
 		Result     string         `db:"result"` // run result
+		NsID       string         `db:"ns_id"`
 	}
 )
 
diff --git a/pkg/models/taskmodel_gen.go b/pkg/models/taskmodel_gen.go
index 470abbe6..3562c287 100644
--- a/pkg/models/taskmodel_gen.go
+++ b/pkg/models/taskmodel_gen.go
@@ -48,7 +48,7 @@ type (
 		YamlString string         `db:"yaml_string"`
 		Result     string         `db:"result"` // job result
 		DeletedAt  gorm.DeletedAt `gorm:"index"`
-		TenantId   int64          `db:"tenant_id"`
+		NsID       string         `db:"ns_id"`
 	}
 )
 
diff --git a/pkg/scheduler/cloudScheduler.go b/pkg/scheduler/cloudScheduler.go
index 09711982..ade1c5f3 100644
--- a/pkg/scheduler/cloudScheduler.go
+++ b/pkg/scheduler/cloudScheduler.go
@@ -64,6 +64,7 @@ func (cs *cloudScheduler) getNewStructForDb(task *response.TaskInfo, participant
 	cloud := cs.UnMarshalK8sStruct(string(bytes), task.TaskId)
 	cloud.Id = utils.GenSnowflakeID()
 	cloud.YamlString = string(bytes)
+	cloud.NsID = task.NsID
 	cloud.ParticipantId = participantId
 	return cloud, nil
 }
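Note that `taskmodel_gen.go` swaps `TenantId int64` (`tenant_id`) for `NsID string` (`ns_id`), so existing task tables need a matching schema change. A hedged sketch of one way to reconcile an existing database with GORM — the local `Task` type, the DSN, and the decision to leave `tenant_id` in place are assumptions, not part of this diff:

```go
package main

import (
	"gorm.io/driver/mysql"
	"gorm.io/gorm"
)

// Local stand-in for the Task model in pkg/models; only the
// fields visible in the diff are included.
type Task struct {
	Id         int64          `gorm:"primarykey"`
	YamlString string         `gorm:"column:yaml_string"`
	Result     string         `gorm:"column:result"`
	DeletedAt  gorm.DeletedAt `gorm:"index"`
	NsID       string         `gorm:"column:ns_id"`
}

func main() {
	// Placeholder DSN; use the service's real configuration.
	db, err := gorm.Open(mysql.Open("user:pass@tcp(127.0.0.1:3306)/pcm?parseTime=true"), &gorm.Config{})
	if err != nil {
		panic(err)
	}
	// AutoMigrate adds the missing ns_id column. It does not drop or
	// backfill from tenant_id; that cleanup has to be handled separately.
	if err := db.AutoMigrate(&Task{}); err != nil {
		panic(err)
	}
}
```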
"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`, - "cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`, - "cluster_namespace_count": `count(kube_namespace_labels)`, - "cluster_pod_count": `cluster:pod:sum`, - "cluster_pod_quota": `sum(max(kube_node_status_capacity{resource="pods"}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`, - "cluster_pod_utilisation": `cluster:pod_utilization:ratio`, - "cluster_pod_running_count": `cluster:pod_running:count`, - "cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`, - "cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`, - "cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`, - "cluster_node_offline": `cluster:node_offline:sum`, - "cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`, - "cluster_cronjob_count": `sum(kube_cronjob_labels)`, - "cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`, - "cluster_daemonset_count": `sum(kube_daemonset_labels)`, - "cluster_deployment_count": `sum(kube_deployment_labels)`, - "cluster_endpoint_count": `sum(kube_endpoint_labels)`, - "cluster_hpa_count": `sum(kube_horizontalpodautoscaler_labels)`, - "cluster_job_count": `sum(kube_job_labels)`, - "cluster_statefulset_count": `sum(kube_statefulset_labels)`, - "cluster_replicaset_count": `count(kube_replicaset_labels)`, - "cluster_service_count": `sum(kube_service_info)`, - "cluster_secret_count": `sum(kube_secret_info)`, - "cluster_pv_count": `sum(kube_persistentvolume_labels)`, - "cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`, - "cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, - "cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, - "cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`, - "cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`, - "cluster_node_offline_ratio": `cluster:node_offline:ratio`, - //node "node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}", "node_cpu_total": "node:node_num_cpu:sum{$1}", @@ -167,23 +116,6 @@ var promQLTemplates = map[string]string{ "namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, "namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`, - // ingress - "ingress_request_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2}[$3])))`, - "ingress_request_4xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[4].*"}[$3])))`, - "ingress_request_5xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[5].*"}[$3])))`, - "ingress_active_connections": `sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{$2,state="active"}[$3]))`, - "ingress_success_rate": `sum(rate(nginx_ingress_controller_requests{$1,$2,status!~"[4-5].*"}[$3])) / sum(rate(nginx_ingress_controller_requests{$1,$2}[$3]))`, - "ingress_request_duration_average": 
-	"ingress_request_duration_average": `sum_over_time(nginx_ingress_controller_request_duration_seconds_sum{$1,$2}[$3])/sum_over_time(nginx_ingress_controller_request_duration_seconds_count{$1,$2}[$3])`,
-	"ingress_request_duration_50percentage": `histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
-	"ingress_request_duration_95percentage": `histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
-	"ingress_request_duration_99percentage": `histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
-	"ingress_request_volume": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])), 0.001)`,
-	"ingress_request_volume_by_ingress": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])) by (ingress), 0.001)`,
-	"ingress_request_network_sent": `sum(irate(nginx_ingress_controller_response_size_sum{$1,$2}[$3]))`,
-	"ingress_request_network_received": `sum(irate(nginx_ingress_controller_request_size_sum{$1,$2}[$3]))`,
-	"ingress_request_memory_bytes": `avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{$2})`,
-	"ingress_request_cpu_usage": `avg(rate(nginx_ingress_controller_nginx_process_cpu_seconds_total{$2}[5m]))`,
-
 	// workload
 	"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
 	"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
@@ -201,8 +133,12 @@ var promQLTemplates = map[string]string{
 	"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
 	"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
 
+	"controller_cpu_usage_rate": `round(sum by (owner_name) (sum by (owner_name, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", $1, image!=""}[5m]))/ sum by (owner_name,pod) (kube_pod_container_resource_limits{resource="cpu"}))/count(kube_pod_info{$2}) by (owner_name),0.0001)`,
+	"controller_memory_usage_rate": `round(sum by (owner_name) (sum by (owner_name, pod) (irate(container_memory_usage_bytes{job="kubelet", $1, image!=""}[5m]))/ sum by (owner_name,pod) (kube_pod_container_resource_limits{resource="memory"}))/count(kube_pod_info{$2}) by (owner_name),0.0001)`,
 	// pod
 	"pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
+	"pod_cpu_usage_rate": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m]))/sum by (namespace,pod) (kube_pod_container_resource_limits{resource="cpu"}) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.0001)`,
+	"pod_memory_usage_rate": `round(sum by (namespace, pod) (irate(container_memory_usage_bytes{job="kubelet", pod!="", image!=""}[5m]))/sum by (namespace,pod) (kube_pod_container_resource_limits{resource="memory"}) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.0001)`,
 	"pod_memory_usage": `sum by (namespace, pod) (container_memory_usage_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
 	"pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
 	"pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
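The two `controller_*_rate` templates place `$1` inside the usage selector and `$2` inside the `kube_pod_info` divisor. After the `makeControllerMetricExpr` change later in this diff, both placeholders receive the same pod matcher. A minimal, runnable sketch of that expansion, with the template trimmed for readability (the real one also divides by the pods' resource limits before averaging per owner):

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Trimmed stand-in for controller_cpu_usage_rate.
	tmpl := `sum by (owner_name) (irate(container_cpu_usage_seconds_total{job="kubelet", $1, image!=""}[5m])) / count(kube_pod_info{$2}) by (owner_name)`

	// Same substitution as the updated makeControllerMetricExpr: $1 and $2
	// both become the pod matcher, so numerator and divisor cover the same pods.
	podName := fmt.Sprintf(`pod=~"%s"`, "nginx-.*")
	fmt.Println(strings.NewReplacer("$1", podName, "$2", podName).Replace(tmpl))
}
```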
"pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, "pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`, @@ -216,51 +152,6 @@ var promQLTemplates = map[string]string{ "container_memory_usage_wo_cache": `sum by (namespace, pod, container) (container_memory_working_set_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`, "container_processes_usage": `sum by (namespace, pod, container) (container_processes{job="kubelet", container!="POD", container!="", image!="", $1})`, "container_threads_usage": `sum by (namespace, pod, container) (container_threads {job="kubelet", container!="POD", container!="", image!="", $1})`, - - // pvc - "pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - "pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`, - - // component - "etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`, - "etcd_server_total": `count(up{job="etcd"})`, - "etcd_server_up_total": `etcd:up:sum`, - "etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`, - "etcd_server_is_leader": `label_replace(etcd_server_is_leader, "node_ip", "$1", "instance", "(.*):.*")`, - "etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`, - "etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`, - "etcd_server_proposals_applied_rate": 
-
-	"apiserver_up_sum": `apiserver:up:sum`,
-	"apiserver_request_rate": `apiserver:apiserver_request_total:sum_irate`,
-	"apiserver_request_by_verb_rate": `apiserver:apiserver_request_total:sum_verb_irate`,
-	"apiserver_request_latencies": `apiserver:apiserver_request_duration:avg`,
-	"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_duration:avg_by_verb`,
-
-	"scheduler_up_sum": `scheduler:up:sum`,
-	"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
-	"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
-	"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_duration:avg`,
-	"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_duration:histogram_quantile`,
 }
 
 func makeExpr(metric string, opts QueryOptions) string {
@@ -333,11 +224,10 @@ func makeNamespaceMetricExpr(tmpl string, o QueryOptions) string {
 }
 
 func makeControllerMetricExpr(tmpl string, o QueryOptions) string {
-	var namespace, podName string
+	var podName string
 
-	namespace = fmt.Sprintf(`namespace="%s"`, o.Namespace)
 	podName = fmt.Sprintf(`pod=~"%s"`, o.PodName)
-	return strings.NewReplacer("$1", namespace, "$2", podName).Replace(tmpl)
+	return strings.NewReplacer("$1", podName, "$2", podName).Replace(tmpl)
 }
 
 func makePodMetricExpr(tmpl string, o QueryOptions) string {
diff --git a/pkg/tracker/queryoptions.go b/pkg/tracker/queryoptions.go
index c290d4aa..18ff013f 100644
--- a/pkg/tracker/queryoptions.go
+++ b/pkg/tracker/queryoptions.go
@@ -254,6 +254,7 @@ type ControllerOption struct {
 	Namespace string
 	Kind      string
 	OwnerName string
+	Level     string
 }
 
 func (po PodOption) Apply(o *QueryOptions) {
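Two behavioral notes on the promql.go change: `makeControllerMetricExpr` no longer emits a `namespace="..."` filter, so `ControllerMetricsReq.Namespace` is effectively unused on this path, and the new `Level` field is added to `ControllerOption` but nothing in this diff reads it yet. The tracker wires options into queries through an Apply-style pattern (see the `PodOption.Apply` context line above). A sketch of how `ControllerOption`, including `Level`, would feed `QueryOptions`; the local types and the exact field mapping inside the real `Apply` are assumptions:

```go
package main

import "fmt"

// Local stand-ins for the tracker's types.
type QueryOptions struct {
	Namespace string
	PodName   string
	Level     string
}

type Option interface {
	Apply(o *QueryOptions)
}

type ControllerOption struct {
	PodsName  string
	Namespace string
	Kind      string
	OwnerName string
	Level     string // newly added; no effect on expression building yet
}

func (co ControllerOption) Apply(o *QueryOptions) {
	// Assumed mapping: PodsName drives the pod=~ matcher that the updated
	// makeControllerMetricExpr reads from QueryOptions.PodName.
	o.PodName = co.PodsName
	o.Namespace = co.Namespace
	o.Level = co.Level
}

func main() {
	var q QueryOptions
	ControllerOption{PodsName: "nginx-.*", Level: "controller"}.Apply(&q)
	fmt.Printf("%+v\n", q)
}
```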