@@ -26,57 +26,6 @@ const (
)

var promQLTemplates = map[string]string{
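	// NOTE: $1, $2 and $3 in the expressions below are placeholders. Helpers such
	// as makeControllerMetricExpr (defined further down) substitute them with
	// label selectors built from the QueryOptions; where a template uses [$3],
	// the placeholder stands for the query range duration.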
	// cluster
	"cluster_cpu_utilisation": ":node_cpu_utilisation:avg1m",
	"cluster_cpu_usage": `round(:node_cpu_utilisation:avg1m * sum(node:node_num_cpu:sum), 0.001)`,
	"cluster_cpu_total": "sum(node:node_num_cpu:sum)",
	"cluster_memory_utilisation": ":node_memory_utilisation:",
	"cluster_memory_available": "sum(node:node_memory_bytes_available:sum)",
	"cluster_memory_total": "sum(node:node_memory_bytes_total:sum)",
	"cluster_memory_usage_wo_cache": "sum(node:node_memory_bytes_total:sum) - sum(node:node_memory_bytes_available:sum)",
	"cluster_net_utilisation": ":node_net_utilisation:sum_irate",
	"cluster_net_bytes_transmitted": "sum(node:node_net_bytes_transmitted:sum_irate)",
	"cluster_net_bytes_received": "sum(node:node_net_bytes_received:sum_irate)",
	"cluster_disk_read_iops": "sum(node:data_volume_iops_reads:sum)",
	"cluster_disk_write_iops": "sum(node:data_volume_iops_writes:sum)",
	"cluster_disk_read_throughput": "sum(node:data_volume_throughput_bytes_read:sum)",
	"cluster_disk_write_throughput": "sum(node:data_volume_throughput_bytes_written:sum)",
	"cluster_disk_size_usage": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"} - node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_size_utilisation": `cluster:disk_utilization:ratio`,
	"cluster_disk_size_capacity": `sum(max(node_filesystem_size_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_size_available": `sum(max(node_filesystem_avail_bytes{device=~"/dev/.*", device!~"/dev/loop\\d+", job="node-exporter"}) by (device, instance))`,
	"cluster_disk_inode_total": `sum(node:node_inodes_total:)`,
	"cluster_disk_inode_usage": `sum(node:node_inodes_total:) - sum(node:node_inodes_free:)`,
	"cluster_disk_inode_utilisation": `cluster:disk_inode_utilization:ratio`,
	"cluster_namespace_count": `count(kube_namespace_labels)`,
	"cluster_pod_count": `cluster:pod:sum`,
	"cluster_pod_quota": `sum(max(kube_node_status_capacity{resource="pods"}) by (node) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
	"cluster_pod_utilisation": `cluster:pod_utilization:ratio`,
	"cluster_pod_running_count": `cluster:pod_running:count`,
	"cluster_pod_succeeded_count": `count(kube_pod_info unless on (pod) (kube_pod_status_phase{phase=~"Failed|Pending|Unknown|Running"} > 0) unless on (node) (kube_node_status_condition{condition="Ready",status=~"unknown|false"} > 0))`,
	"cluster_pod_abnormal_count": `cluster:pod_abnormal:sum`,
	"cluster_node_online": `sum(kube_node_status_condition{condition="Ready",status="true"})`,
	"cluster_node_offline": `cluster:node_offline:sum`,
	"cluster_node_total": `sum(kube_node_status_condition{condition="Ready"})`,
	"cluster_cronjob_count": `sum(kube_cronjob_labels)`,
	"cluster_pvc_count": `sum(kube_persistentvolumeclaim_info)`,
	"cluster_daemonset_count": `sum(kube_daemonset_labels)`,
	"cluster_deployment_count": `sum(kube_deployment_labels)`,
	"cluster_endpoint_count": `sum(kube_endpoint_labels)`,
	"cluster_hpa_count": `sum(kube_horizontalpodautoscaler_labels)`,
	"cluster_job_count": `sum(kube_job_labels)`,
	"cluster_statefulset_count": `sum(kube_statefulset_labels)`,
	"cluster_replicaset_count": `count(kube_replicaset_labels)`,
	"cluster_service_count": `sum(kube_service_info)`,
	"cluster_secret_count": `sum(kube_secret_info)`,
	"cluster_pv_count": `sum(kube_persistentvolume_labels)`,
	"cluster_ingresses_extensions_count": `sum(kube_ingress_labels)`,
	"cluster_load1": `sum(node_load1{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_load5": `sum(node_load5{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_load15": `sum(node_load15{job="node-exporter"}) / sum(node:node_num_cpu:sum)`,
	"cluster_pod_abnormal_ratio": `cluster:pod_abnormal:ratio`,
	"cluster_node_offline_ratio": `cluster:node_offline:ratio`,

	// node
	"node_cpu_utilisation": "node:node_cpu_utilisation:avg1m{$1}",
	"node_cpu_total": "node:node_num_cpu:sum{$1}",
@@ -167,23 +116,6 @@ var promQLTemplates = map[string]string{
"namespace_ingresses_extensions_count": `sum by (namespace) (kube_ingress_labels{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
|
|
|
|
|
"namespace_s2ibuilder_count": `sum by (namespace) (s2i_s2ibuilder_created{namespace!=""} * on (namespace) group_left(workspace) kube_namespace_labels{$1})`,
|
|
|
|
|
|
|
|
|
|
// ingress
|
|
|
|
|
"ingress_request_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2}[$3])))`,
|
|
|
|
|
"ingress_request_4xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[4].*"}[$3])))`,
|
|
|
|
|
"ingress_request_5xx_count": `round(sum(increase(nginx_ingress_controller_requests{$1,$2,status=~"[5].*"}[$3])))`,
|
|
|
|
|
"ingress_active_connections": `sum(avg_over_time(nginx_ingress_controller_nginx_process_connections{$2,state="active"}[$3]))`,
|
|
|
|
|
"ingress_success_rate": `sum(rate(nginx_ingress_controller_requests{$1,$2,status!~"[4-5].*"}[$3])) / sum(rate(nginx_ingress_controller_requests{$1,$2}[$3]))`,
|
|
|
|
|
"ingress_request_duration_average": `sum_over_time(nginx_ingress_controller_request_duration_seconds_sum{$1,$2}[$3])/sum_over_time(nginx_ingress_controller_request_duration_seconds_count{$1,$2}[$3])`,
|
|
|
|
|
"ingress_request_duration_50percentage": `histogram_quantile(0.50, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
|
|
|
|
|
"ingress_request_duration_95percentage": `histogram_quantile(0.95, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
|
|
|
|
|
"ingress_request_duration_99percentage": `histogram_quantile(0.99, sum by (le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{$1,$2}[$3])))`,
|
|
|
|
|
"ingress_request_volume": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])), 0.001)`,
|
|
|
|
|
"ingress_request_volume_by_ingress": `round(sum(irate(nginx_ingress_controller_requests{$1,$2}[$3])) by (ingress), 0.001)`,
|
|
|
|
|
"ingress_request_network_sent": `sum(irate(nginx_ingress_controller_response_size_sum{$1,$2}[$3]))`,
|
|
|
|
|
"ingress_request_network_received": `sum(irate(nginx_ingress_controller_request_size_sum{$1,$2}[$3]))`,
|
|
|
|
|
"ingress_request_memory_bytes": `avg(nginx_ingress_controller_nginx_process_resident_memory_bytes{$2})`,
|
|
|
|
|
"ingress_request_cpu_usage": `avg(rate(nginx_ingress_controller_nginx_process_cpu_seconds_total{$2}[5m]))`,
|
|
|
|
|
|
|
|
|
|
// workload
|
|
|
|
|
"workload_cpu_usage": `round(namespace:workload_cpu_usage:sum{$1}, 0.001)`,
|
|
|
|
|
"workload_memory_usage": `namespace:workload_memory_usage:sum{$1}`,
@@ -201,8 +133,12 @@ var promQLTemplates = map[string]string{
"workload_daemonset_unavailable_replicas_ratio": `namespace:daemonset_unavailable_replicas:ratio{$1}`,
|
|
|
|
|
"workload_statefulset_unavailable_replicas_ratio": `namespace:statefulset_unavailable_replicas:ratio{$1}`,
|
|
|
|
|
|
|
|
|
|
"controller_cpu_usage_rate": `round(sum by (owner_name) (sum by (owner_name, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", $1, image!=""}[5m]))/ sum by (owner_name,pod) (kube_pod_container_resource_limits{resource="cpu"}))/count(kube_pod_info{$2}) by (owner_name),0.0001)`,
|
|
|
|
|
"controller_memory_usage_rate": `round(sum by (owner_name) (sum by (owner_name, pod) (irate(container_memory_usage_bytes{job="kubelet", $1, image!=""}[5m]))/ sum by (owner_name,pod) (kube_pod_container_resource_limits{resource="memory"}))/count(kube_pod_info{$2}) by (owner_name),0.0001)`,
|
|
|
|
|
// pod
|
|
|
|
|
"pod_cpu_usage": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m])) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.001)`,
|
|
|
|
|
"pod_cpu_usage_rate": `round(sum by (namespace, pod) (irate(container_cpu_usage_seconds_total{job="kubelet", pod!="", image!=""}[5m]))/sum by (namespace,pod) (kube_pod_container_resource_limits{resource="cpu"}) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.0001)`,
|
|
|
|
|
"pod_memory_usage_rate": `round(sum by (namespace, pod) (irate(container_memory_usage_bytes{job="kubelet", pod!="", image!=""}[5m]))/sum by (namespace,pod) (kube_pod_container_resource_limits{resource="memory"}) * on (namespace, pod) group_left(owner_kind,owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}, 0.0001)`,
|
|
|
|
|
"pod_memory_usage": `sum by (namespace, pod) (container_memory_usage_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
|
|
|
|
"pod_memory_usage_wo_cache": `sum by (namespace, pod) (container_memory_working_set_bytes{job="kubelet", pod!="", image!=""}) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
|
|
|
|
|
"pod_net_bytes_transmitted": `sum by (namespace, pod) (irate(container_network_transmit_bytes_total{pod!="", interface!~"^(cali.+|tunl.+|dummy.+|kube.+|flannel.+|cni.+|docker.+|veth.+|lo.*)", job="kubelet"}[5m])) * on (namespace, pod) group_left(owner_kind, owner_name) kube_pod_owner{$1} * on (namespace, pod) group_left(node) kube_pod_info{$2}`,
@@ -216,51 +152,6 @@ var promQLTemplates = map[string]string{
"container_memory_usage_wo_cache": `sum by (namespace, pod, container) (container_memory_working_set_bytes{job="kubelet", container!="POD", container!="", image!="", $1})`,
|
|
|
|
|
"container_processes_usage": `sum by (namespace, pod, container) (container_processes{job="kubelet", container!="POD", container!="", image!="", $1})`,
|
|
|
|
|
"container_threads_usage": `sum by (namespace, pod, container) (container_threads {job="kubelet", container!="POD", container!="", image!="", $1})`,
|
|
|
|
|
|
|
|
|
|
// pvc
|
|
|
|
|
"pvc_inodes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_free) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_inodes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_inodes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_inodes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_inodes_used / kubelet_volume_stats_inodes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_bytes_available": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_available_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_bytes_used": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_bytes_total": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
"pvc_bytes_utilisation": `max by (namespace, persistentvolumeclaim) (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) * on (namespace, persistentvolumeclaim) group_left (storageclass) kube_persistentvolumeclaim_info{$1}`,
|
|
|
|
|
|
|
|
|
|
// component
|
|
|
|
|
"etcd_server_list": `label_replace(up{job="etcd"}, "node_ip", "$1", "instance", "(.*):.*")`,
|
|
|
|
|
"etcd_server_total": `count(up{job="etcd"})`,
|
|
|
|
|
"etcd_server_up_total": `etcd:up:sum`,
|
|
|
|
|
"etcd_server_has_leader": `label_replace(etcd_server_has_leader, "node_ip", "$1", "instance", "(.*):.*")`,
|
|
|
|
|
"etcd_server_is_leader": `label_replace(etcd_server_is_leader, "node_ip", "$1", "instance", "(.*):.*")`,
|
|
|
|
|
"etcd_server_leader_changes": `label_replace(etcd:etcd_server_leader_changes_seen:sum_changes, "node_ip", "$1", "node", "(.*)")`,
|
|
|
|
|
"etcd_server_proposals_failed_rate": `avg(etcd:etcd_server_proposals_failed:sum_irate)`,
|
|
|
|
|
"etcd_server_proposals_applied_rate": `avg(etcd:etcd_server_proposals_applied:sum_irate)`,
|
|
|
|
|
"etcd_server_proposals_committed_rate": `avg(etcd:etcd_server_proposals_committed:sum_irate)`,
|
|
|
|
|
"etcd_server_proposals_pending_count": `avg(etcd:etcd_server_proposals_pending:sum)`,
|
|
|
|
|
"etcd_mvcc_db_size": `avg(etcd:etcd_mvcc_db_total_size:sum)`,
|
|
|
|
|
"etcd_network_client_grpc_received_bytes": `sum(etcd:etcd_network_client_grpc_received_bytes:sum_irate)`,
|
|
|
|
|
"etcd_network_client_grpc_sent_bytes": `sum(etcd:etcd_network_client_grpc_sent_bytes:sum_irate)`,
|
|
|
|
|
"etcd_grpc_call_rate": `sum(etcd:grpc_server_started:sum_irate)`,
|
|
|
|
|
"etcd_grpc_call_failed_rate": `sum(etcd:grpc_server_handled:sum_irate)`,
|
|
|
|
|
"etcd_grpc_server_msg_received_rate": `sum(etcd:grpc_server_msg_received:sum_irate)`,
|
|
|
|
|
"etcd_grpc_server_msg_sent_rate": `sum(etcd:grpc_server_msg_sent:sum_irate)`,
|
|
|
|
|
"etcd_disk_wal_fsync_duration": `avg(etcd:etcd_disk_wal_fsync_duration:avg)`,
|
|
|
|
|
"etcd_disk_wal_fsync_duration_quantile": `avg(etcd:etcd_disk_wal_fsync_duration:histogram_quantile) by (quantile)`,
|
|
|
|
|
"etcd_disk_backend_commit_duration": `avg(etcd:etcd_disk_backend_commit_duration:avg)`,
|
|
|
|
|
"etcd_disk_backend_commit_duration_quantile": `avg(etcd:etcd_disk_backend_commit_duration:histogram_quantile) by (quantile)`,
|
|
|
|
|
|
|
|
|
|
"apiserver_up_sum": `apiserver:up:sum`,
|
|
|
|
|
"apiserver_request_rate": `apiserver:apiserver_request_total:sum_irate`,
|
|
|
|
|
"apiserver_request_by_verb_rate": `apiserver:apiserver_request_total:sum_verb_irate`,
|
|
|
|
|
"apiserver_request_latencies": `apiserver:apiserver_request_duration:avg`,
|
|
|
|
|
"apiserver_request_by_verb_latencies": `apiserver:apiserver_request_duration:avg_by_verb`,
|
|
|
|
|
|
|
|
|
|
"scheduler_up_sum": `scheduler:up:sum`,
|
|
|
|
|
"scheduler_schedule_attempts": `scheduler:scheduler_schedule_attempts:sum`,
|
|
|
|
|
"scheduler_schedule_attempt_rate": `scheduler:scheduler_schedule_attempts:sum_rate`,
|
|
|
|
|
"scheduler_e2e_scheduling_latency": `scheduler:scheduler_e2e_scheduling_duration:avg`,
|
|
|
|
|
"scheduler_e2e_scheduling_latency_quantile": `scheduler:scheduler_e2e_scheduling_duration:histogram_quantile`,
|
|
|
|
|
}

func makeExpr(metric string, opts QueryOptions) string {
@@ -333,11 +224,10 @@ func makeNamespaceMetricExpr(tmpl string, o QueryOptions) string {
}

func makeControllerMetricExpr(tmpl string, o QueryOptions) string {
-	var namespace, podName string
+	var podName string

-	namespace = fmt.Sprintf(`namespace="%s"`, o.Namespace)
	podName = fmt.Sprintf(`pod=~"%s"`, o.PodName)
-	return strings.NewReplacer("$1", namespace, "$2", podName).Replace(tmpl)
+	return strings.NewReplacer("$1", podName, "$2", podName).Replace(tmpl)
}

func makePodMetricExpr(tmpl string, o QueryOptions) string {