feat: support alerts based on bucket diff state (#119)

* feat: support alerts based on bucket diff status

* fix: correct document count calculation

* fix: empty param `event_id`

* fix: removing first and last time bucket

* fix: removing first and last time bucket

* chore: update bucket diff algorithm for improved accuracy

* refactor: optimize bucket diff algorithm

* feat: trigger bucket diff content change alert using expression

* feat: include bucket diff type in alert message API response

* chore: update release notes

* feat: add alert rule template to detect cluster metrics collection anomaly

* chore: update release notes
Author: silenceqi, 2025-02-12 17:32:27 +08:00 (committed by GitHub)
Parent: 80e2a4356a
Commit: f910203599
8 changed files with 429 additions and 61 deletions


@@ -1760,4 +1760,158 @@ POST $[[SETUP_INDEX_PREFIX]]alert-rule/$[[SETUP_DOC_TYPE]]/builtin-cal8n7p7h710d
"name": "$[[SETUP_USERNAME]]",
"id": "$[[SETUP_USER_ID]]"
}
}
POST $[[SETUP_INDEX_PREFIX]]alert-rule/$[[SETUP_DOC_TYPE]]/builtin-cujivv5ath26drn6bcl0
{
"id": "builtin-cujivv5ath26drn6bcl0",
"created": "2025-02-08T18:20:44.273334+08:00",
"updated": "2025-02-12T16:31:05.672771+08:00",
"name": "Cluster Metrics Collection Anomaly",
"enabled": true,
"resource": {
"resource_id": "$[[SETUP_RESOURCE_ID]]",
"resource_name": "$[[SETUP_RESOURCE_NAME]]",
"type": "elasticsearch",
"objects": [
".infini_metrics*"
],
"filter": {},
"raw_filter": {
"bool": {
"must": [
{
"terms": {
"metadata.name": [
"cluster_health",
"cluster_stats",
"index_stats",
"node_stats",
"shard_stats"
]
}
}
]
}
},
"time_field": "timestamp",
"context": {
"fields": null
}
},
"metrics": {
"bucket_size": "1m",
"groups": [
{
"field": "metadata.labels.cluster_id",
"limit": 5
},
{
"field": "metadata.name",
"limit": 5
}
],
"formula": "a",
"items": [
{
"name": "a",
"field": "agent.id",
"statistic": "count"
}
],
"bucket_label": {
"enabled": false
},
"expression": "count(agent.id)"
},
"bucket_conditions": {
"operator": "any",
"items": [
{
"minimum_period_match": 1,
"operator": "lt",
"values": [
"0"
],
"priority": "critical",
"type": "content",
"bucket_count": 10
}
]
},
"notification_config": {
"enabled": true,
"title": "🔥 [{{.rule_name}}] Alerting",
"message": "{{range .results}}\n{{$cn := lookup \"category=metadata, object=cluster, property=name, default=N/A\" (index .group_values 0) }}\n{{$cu := printf \"%s/#/cluster/monitor/elasticsearch/%s\" $.env.INFINI_CONSOLE_ENDPOINT (index .group_values 0)}}\nCluster [[{{$cn}}]({{$cu}}?_g=%7B%22timeRange%22:%7B%22min%22:%22{{$.min}}%22%2C%22max%22:%22{{$.max}}%22%7D%7D)] ({{index .group_values 1}}) metrics has dropped at {{.issue_timestamp | datetime}};\n{{end}}",
"normal": [
{
"id": "cgnb2nt3q95nmusjl65g",
"enabled": true
},
{
"id": "cgiospt3q95q49k3u00g",
"enabled": true
},
{
"id": "cj865st3q95rega919ig",
"enabled": true
},
{
"id": "cgnb2r53q95nmusjl6vg",
"enabled": true
},
{
"id": "ch1os6t3q95lk6lepkq0",
"enabled": true
},
{
"id": "cgnb2kt3q95nmusjl64g",
"enabled": true
}
],
"throttle_period": "6h",
"accept_time_range": {
"start": "00:00",
"end": "23:59"
}
},
"category": "Platform",
"recovery_notification_config": {
"enabled": true,
"title": "🌈 [{{.rule_name}}] Resolved",
"message": "EventID: {{.event_id}} \nTarget: {{.resource_name}}-{{.objects}} \nTriggerAt: {{.trigger_at | datetime}} \nResolveAt: {{.timestamp | datetime}} \nDuration: {{.duration}} ",
"normal": [
{
"id": "cj8bq8d3q95ogankugqg",
"enabled": true
},
{
"id": "cj8ctat3q95l9ebbntlg",
"enabled": true
},
{
"id": "cj8atf53q95lhahebg8g",
"enabled": true
},
{
"id": "cj8e9s53q95gsdbb054g",
"enabled": true
},
{
"id": "cj8e9gt3q95gsdbb0170",
"enabled": true
},
{
"id": "cj86l0l3q95rrpfea6ug",
"enabled": true
}
],
"event_enabled": true
},
"schedule": {
"interval": "1m"
},
"creator": {
"name": "$[[SETUP_USERNAME]]",
"id": "$[[SETUP_USER_ID]]"
}
}
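
The bucket condition above ("lt" on values ["0"], type "content") compiles to the expression result < 0, evaluated once per time bucket against the content-change state the engine computes. A minimal sketch of that evaluation using govaluate, the expression library the engine already depends on (values here are illustrative, not taken from the commit):

package main

import (
	"fmt"

	"github.com/Knetic/govaluate"
)

func main() {
	// "lt" + ["0"] compiles to "result < 0"; a content-change state of -1
	// means a group that existed in the previous bucket is now missing.
	expr, err := govaluate.NewEvaluableExpression("result < 0")
	if err != nil {
		panic(err)
	}
	matched, _ := expr.Evaluate(map[string]interface{}{"result": -1})
	fmt.Println(matched) // true: the "critical" condition fires
}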


@@ -12,6 +12,7 @@ Information about release notes of INFINI Console is provided here.
### Breaking changes
### Features
- Support alerts based on bucket diff state (#119)
### Bug fix


@@ -12,7 +12,7 @@ title: "版本历史"
### Breaking changes
### Features
- Alerting now supports alerts based on document-count deltas and content differences between buckets (#119)
### Bug fix
### Improvements


@@ -44,6 +44,28 @@ func (cond *Condition) GetMinimumPeriodMatch() int {
return minPeriodMatch
}
func (cond *Condition) GetMaxBucketCount() int {
var bucketCount = 0
for _, citem := range cond.Items {
if citem.BucketCount > bucketCount {
bucketCount = citem.BucketCount
}
}
return bucketCount
}
// BucketDiffType represents the type of bucket difference
type BucketDiffType string
// Constants defining possible bucket difference types
const (
// BucketDiffTypeSize indicates the difference in bucket size
BucketDiffTypeSize BucketDiffType = "size"
// BucketDiffTypeContent indicates the difference in bucket content
BucketDiffTypeContent BucketDiffType = "content"
)
type ConditionItem struct {
//MetricName string `json:"metric"`
MinimumPeriodMatch int `json:"minimum_period_match"`
@@ -51,6 +73,10 @@ type ConditionItem struct {
Values []string `json:"values"`
Priority string `json:"priority"`
Expression string `json:"expression,omitempty"`
//bucket condition type, e.g. size or content
Type BucketDiffType `json:"type,omitempty"`
// Number of time buckets evaluated for this bucket condition
BucketCount int `json:"bucket_count,omitempty"`
}
func (cond *ConditionItem) GenerateConditionExpression() (conditionExpression string, err error) {

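For illustration, the two diff types map onto ConditionItem roughly as follows (hypothetical values, not taken from the commit):

// Size diff: fire when the doc count drops by more than 100 between adjacent buckets.
sizeCond := ConditionItem{
	MinimumPeriodMatch: 1,
	Operator:           "lte",
	Values:             []string{"-100"},
	Priority:           "high",
	Type:               BucketDiffTypeSize,
	BucketCount:        10,
}

// Content diff: fire when a previously present group disappears (state < 0).
contentCond := ConditionItem{
	MinimumPeriodMatch: 1,
	Operator:           "lt",
	Values:             []string{"0"},
	Priority:           "critical",
	Type:               BucketDiffTypeContent,
	BucketCount:        10,
}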

@@ -81,10 +81,15 @@ type QueryResult struct {
type MetricData struct {
GroupValues []string `json:"group_values"`
Data map[string][]TimeMetricData `json:"data"`
Data map[string][]MetricDataItem `json:"data"`
}
type TimeMetricData []interface{}
type MetricDataItem struct {
Timestamp interface{} `json:"timestamp,omitempty"`
Value interface{} `json:"value"`
Groups []string `json:"groups,omitempty"`
DocCount int `json:"doc_count,omitempty"`
}
type AlertMetricItem struct {
common.MetricItem

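In short, each time-series point moves from a positional pair to a named struct that also carries the bucket's raw doc_count, which the size-diff check needs. A sketch (field values illustrative):

// Before: []interface{}{timestamp, value} (index 0 the time, index 1 the metric value).
// After:
item := alerting.MetricDataItem{
	Timestamp: int64(1739350000000), // epoch millis
	Value:     42.0,                 // formula result or aggregation value
	DocCount:  42,                   // hits in this time bucket, used by "size" bucket conditions
}
_ = item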

@@ -54,8 +54,9 @@ type Rule struct {
Name string `json:"name" elastic_mapping:"name: { type: keyword }"`
Id string `json:"id" elastic_mapping:"id: { type: keyword }"`
} `json:"creator" elastic_mapping:"creator:{type:object}"`
Category string `json:"category,omitempty" elastic_mapping:"category: { type: keyword,copy_to:search_text }"`
Tags []string `json:"tags,omitempty" elastic_mapping:"tags: { type: keyword,copy_to:search_text }"`
Category string `json:"category,omitempty" elastic_mapping:"category: { type: keyword,copy_to:search_text }"`
Tags []string `json:"tags,omitempty" elastic_mapping:"tags: { type: keyword,copy_to:search_text }"`
BucketConditions *Condition `json:"bucket_conditions" elastic_mapping:"bucket_conditions:{type:object}"`
}
func (rule *Rule) GetOrInitExpression() (string, error) {


@@ -404,13 +404,23 @@ func (h *AlertAPI) getAlertMessage(w http.ResponseWriter, req *http.Request, ps
return
}
metricExpression, _ := rule.Metrics.GenerateExpression()
var hitCondition string
for i, cond := range rule.Conditions.Items {
var (
hitCondition string
bucketDiffType string
)
conditions := rule.Conditions
if rule.BucketConditions != nil {
conditions = *rule.BucketConditions
}
for i, cond := range conditions.Items {
expression, _ := cond.GenerateConditionExpression()
if cond.Priority == message.Priority {
hitCondition = strings.ReplaceAll(expression, "result", "")
if rule.BucketConditions != nil {
bucketDiffType = string(cond.Type)
}
}
rule.Conditions.Items[i].Expression = strings.ReplaceAll(expression, "result", metricExpression)
conditions.Items[i].Expression = strings.ReplaceAll(expression, "result", metricExpression)
}
var duration time.Duration
if message.Status == alerting.MessageStateRecovered {
@@ -419,26 +429,28 @@
duration = time.Now().Sub(message.Created)
}
detailObj := util.MapStr{
"message_id": message.ID,
"rule_id": message.RuleID,
"rule_name": rule.Name,
"rule_enabled": rule.Enabled,
"title": message.Title,
"message": message.Message,
"priority": message.Priority,
"created": message.Created,
"updated": message.Updated,
"resource_name": rule.Resource.Name,
"resource_id": rule.Resource.ID,
"resource_objects": rule.Resource.Objects,
"conditions": rule.Conditions,
"duration": duration.Milliseconds(),
"ignored_time": message.IgnoredTime,
"ignored_reason": message.IgnoredReason,
"ignored_user": message.IgnoredUser,
"status": message.Status,
"expression": rule.Metrics.Expression,
"hit_condition": hitCondition,
"message_id": message.ID,
"rule_id": message.RuleID,
"rule_name": rule.Name,
"rule_enabled": rule.Enabled,
"title": message.Title,
"message": message.Message,
"priority": message.Priority,
"created": message.Created,
"updated": message.Updated,
"resource_name": rule.Resource.Name,
"resource_id": rule.Resource.ID,
"resource_objects": rule.Resource.Objects,
"conditions": rule.Conditions,
"bucket_conditions": rule.BucketConditions,
"bucket_diff_type": bucketDiffType,
"duration": duration.Milliseconds(),
"ignored_time": message.IgnoredTime,
"ignored_reason": message.IgnoredReason,
"ignored_user": message.IgnoredUser,
"status": message.Status,
"expression": rule.Metrics.Expression,
"hit_condition": hitCondition,
}
h.WriteJSON(w, detailObj, http.StatusOK)
}


@@ -329,9 +329,16 @@ func getQueryTimeRange(rule *alerting.Rule, filterParam *alerting.FilterParam) (
} else {
return nil, fmt.Errorf("period interval: %s is too small", rule.Metrics.BucketSize)
}
bucketCount := rule.Conditions.GetMinimumPeriodMatch() + 1
var bucketCount int
if rule.BucketConditions != nil {
bucketCount = rule.BucketConditions.GetMaxBucketCount()
//reserve two extra buckets: the first and last (possibly partial) buckets are removed later
bucketCount += 2
} else {
bucketCount = rule.Conditions.GetMinimumPeriodMatch() + 1
}
if bucketCount <= 0 {
bucketCount = 1
bucketCount = 2
}
duration, err := time.ParseDuration(fmt.Sprintf("%d%s", value*bucketCount, units))
if err != nil {
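For example, with the rule above (bucket_size "1m") and a bucket condition whose bucket_count is 10, the query window becomes (10 + 2) x 1m = 12m; the two extra buckets cover the first and last partial buckets that CheckBucketCondition trims.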
@@ -484,7 +491,7 @@ func (engine *Engine) GetTargetMetricData(rule *alerting.Rule, isFilterNaN bool,
} else {
targetData = alerting.MetricData{
GroupValues: md.GroupValues,
Data: map[string][]alerting.TimeMetricData{},
Data: map[string][]alerting.MetricDataItem{},
}
expression, err := govaluate.NewEvaluableExpression(rule.Metrics.Formula)
if err != nil {
@@ -508,14 +515,14 @@
}
//drop nil value bucket
if v == nil || len(v[i]) < 2 {
if v == nil {
continue DataLoop
}
if _, ok := v[i][1].(float64); !ok {
if _, ok := v[i].Value.(float64); !ok {
continue DataLoop
}
parameters[k] = v[i][1]
timestamp = v[i][0]
parameters[k] = v[i].Value
timestamp = v[i].Timestamp
}
if len(parameters) == 0 {
continue
@@ -528,13 +535,13 @@
if r, ok := result.(float64); ok {
if math.IsNaN(r) || math.IsInf(r, 0) {
if !isFilterNaN {
targetData.Data["result"] = append(targetData.Data["result"], []interface{}{timestamp, math.NaN()})
targetData.Data["result"] = append(targetData.Data["result"], alerting.MetricDataItem{Timestamp: timestamp, Value: math.NaN()})
}
continue
}
}
targetData.Data["result"] = append(targetData.Data["result"], []interface{}{timestamp, result})
targetData.Data["result"] = append(targetData.Data["result"], alerting.MetricDataItem{Timestamp: timestamp, Value: result})
}
}
targetMetricData = append(targetMetricData, targetData)
@@ -554,6 +561,9 @@ func (engine *Engine) CheckCondition(rule *alerting.Rule) (*alerting.ConditionRe
if err != nil {
return conditionResult, err
}
if rule.BucketConditions != nil {
return engine.CheckBucketCondition(rule, targetMetricData, queryResult)
}
for idx, targetData := range targetMetricData {
if idx == 0 {
sort.Slice(rule.Conditions.Items, func(i, j int) bool {
@@ -579,16 +589,16 @@ func (engine *Engine) CheckCondition(rule *alerting.Rule) (*alerting.ConditionRe
triggerCount := 0
for i := 0; i < dataLength; i++ {
//clear nil value
if targetData.Data[dataKey][i][1] == nil {
if targetData.Data[dataKey][i].Value == nil {
continue
}
if r, ok := targetData.Data[dataKey][i][1].(float64); ok {
if r, ok := targetData.Data[dataKey][i].Value.(float64); ok {
if math.IsNaN(r) {
continue
}
}
evaluateResult, err := expression.Evaluate(map[string]interface{}{
"result": targetData.Data[dataKey][i][1],
"result": targetData.Data[dataKey][i].Value,
})
if err != nil {
return conditionResult, fmt.Errorf("evaluate rule [%s] error: %w", rule.ID, err)
@@ -603,12 +613,12 @@ func (engine *Engine) CheckCondition(rule *alerting.Rule) (*alerting.ConditionRe
resultItem := alerting.ConditionResultItem{
GroupValues: targetData.GroupValues,
ConditionItem: &cond,
ResultValue: targetData.Data[dataKey][i][1],
IssueTimestamp: targetData.Data[dataKey][i][0],
ResultValue: targetData.Data[dataKey][i].Value,
IssueTimestamp: targetData.Data[dataKey][i].Timestamp,
RelationValues: map[string]interface{}{},
}
for _, metric := range rule.Metrics.Items {
resultItem.RelationValues[metric.Name] = queryResult.MetricData[idx].Data[metric.Name][i][1]
resultItem.RelationValues[metric.Name] = queryResult.MetricData[idx].Data[metric.Name][i].Value
}
resultItems = append(resultItems, resultItem)
break LoopCondition
@@ -621,6 +631,155 @@ func (engine *Engine) CheckCondition(rule *alerting.Rule) (*alerting.ConditionRe
conditionResult.ResultItems = resultItems
return conditionResult, nil
}
type BucketDiffState struct {
ContentChangeState int
DocCount int
}
func (engine *Engine) CheckBucketCondition(rule *alerting.Rule, targetMetricData []alerting.MetricData, queryResult *alerting.QueryResult) (*alerting.ConditionResult, error) {
var resultItems []alerting.ConditionResultItem
conditionResult := &alerting.ConditionResult{
QueryResult: queryResult,
}
//transform targetMetricData
var (
times = map[int64]struct{}{}
buckets = map[string]map[int64]int{}
maxTime int64
minTime = time.Now().UnixMilli()
)
for _, targetData := range targetMetricData {
for _, v := range targetData.Data {
for _, item := range v {
if tv, ok := item.Timestamp.(float64); ok {
timestamp := int64(tv)
if timestamp < minTime {
minTime = timestamp
}
if timestamp > maxTime {
maxTime = timestamp
}
if _, ok = times[timestamp]; !ok {
times[timestamp] = struct{}{}
}
bucketKey := strings.Join(targetData.GroupValues, "*")
if _, ok = buckets[bucketKey]; !ok {
buckets[bucketKey] = map[int64]int{}
}
buckets[bucketKey][timestamp] = item.DocCount
} else {
log.Warnf("invalid timestamp type: %T", item.Timestamp)
}
}
}
}
var timesArr []int64
for t := range times {
timesArr = append(timesArr, t)
}
sort.Slice(timesArr, func(i, j int) bool {
return timesArr[i] < timesArr[j] // Ascending order
})
// Remove the first bucket if its timestamp equals minTime, and
// the last bucket if its timestamp equals maxTime
if len(timesArr) > 0 && timesArr[0] == minTime {
// Remove first bucket if timestamp matches minTime
timesArr = timesArr[1:]
}
if len(timesArr) > 0 && timesArr[len(timesArr)-1] == maxTime {
// Remove last bucket if timestamp matches maxTime
timesArr = timesArr[:len(timesArr)-1]
}
//check bucket diff
diffResult := map[string]map[int64]BucketDiffState{}
for grps, bk := range buckets {
hasPre := false
if _, ok := diffResult[grps]; !ok {
diffResult[grps] = map[int64]BucketDiffState{}
}
for i, t := range timesArr {
if v, ok := bk[t]; !ok {
if hasPre {
diffResult[grps][t] = BucketDiffState{
ContentChangeState: -1,
}
}
// reset hasPre to false
hasPre = false
} else {
if !hasPre {
if i > 0 {
diffResult[grps][t] = BucketDiffState{
ContentChangeState: 1,
}
}
} else {
diffResult[grps][t] = BucketDiffState{
ContentChangeState: 0,
DocCount: v - bk[timesArr[i-1]],
}
}
hasPre = true
}
}
}
sort.Slice(rule.BucketConditions.Items, func(i, j int) bool {
return alerting.PriorityWeights[rule.BucketConditions.Items[i].Priority] > alerting.PriorityWeights[rule.BucketConditions.Items[j].Priority]
})
for grps, states := range diffResult {
LoopCondition:
for _, cond := range rule.BucketConditions.Items {
conditionExpression, err := cond.GenerateConditionExpression()
if err != nil {
return conditionResult, err
}
expression, err := govaluate.NewEvaluableExpression(conditionExpression)
if err != nil {
return conditionResult, err
}
triggerCount := 0
for t, state := range states {
resultValue := state.DocCount
if cond.Type == alerting.BucketDiffTypeContent {
resultValue = state.ContentChangeState
}
evaluateResult, err := expression.Evaluate(map[string]interface{}{
"result": resultValue,
})
if err != nil {
return conditionResult, fmt.Errorf("evaluate rule [%s] error: %w", rule.ID, err)
}
if evaluateResult == true {
triggerCount += 1
} else {
triggerCount = 0
}
if triggerCount >= cond.MinimumPeriodMatch {
groupValues := strings.Split(grps, "*")
log.Debugf("triggered condition %v, groups: %v\n", cond, groupValues)
resultItem := alerting.ConditionResultItem{
GroupValues: groupValues,
ConditionItem: &cond,
ResultValue: resultValue,
IssueTimestamp: t,
RelationValues: map[string]interface{}{},
}
resultItems = append(resultItems, resultItem)
break LoopCondition
}
}
}
}
conditionResult.QueryResult.MetricData = targetMetricData
conditionResult.ResultItems = resultItems
return conditionResult, nil
}
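To make the ContentChangeState values concrete, consider one group across four kept time buckets with hypothetical doc counts (a sketch of what the loop above produces, not output from the commit):

// timesArr:   t1   t2   t3   t4
// doc count:  10   12   --    7      ("--" = no bucket for this group at t3)
//
// diffResult for the group:
//   t1: no entry (i == 0, nothing to compare against)
//   t2: {ContentChangeState: 0, DocCount: 2}   // present before and now; DocCount is the delta (12 - 10)
//   t3: {ContentChangeState: -1}               // present at t2, missing at t3
//   t4: {ContentChangeState: 1}                // missing at t3, reappeared at t4
//
// A "content" condition "lt 0" therefore fires at t3, while a "size"
// condition evaluates the DocCount delta, e.g. 2 at t2.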
func (engine *Engine) Do(rule *alerting.Rule) error {
var (
@@ -755,15 +914,9 @@
})
alertItem.Priority = priority
title, message := rule.GetNotificationTitleAndMessage()
err = attachTitleMessageToCtx(title, message, paramsCtx)
if err != nil {
return err
}
alertItem.Message = paramsCtx[alerting2.ParamMessage].(string)
alertItem.Title = paramsCtx[alerting2.ParamTitle].(string)
var newAlertMessage *alerting.AlertMessage
if alertMessage == nil || alertMessage.Status == alerting.MessageStateRecovered {
msg := &alerting.AlertMessage{
newAlertMessage = &alerting.AlertMessage{
RuleID: rule.ID,
Created: alertItem.Created,
Updated: time.Now(),
@@ -772,13 +925,25 @@
ResourceName: rule.Resource.Name,
Status: alerting.MessageStateAlerting,
Priority: priority,
Title: alertItem.Title,
Message: alertItem.Message,
Tags: rule.Tags,
Category: rule.Category,
}
alertMessage = msg
err = saveAlertMessage(msg)
paramsCtx[alerting2.ParamEventID] = newAlertMessage.ID
} else {
paramsCtx[alerting2.ParamEventID] = alertMessage.ID
}
title, message := rule.GetNotificationTitleAndMessage()
err = attachTitleMessageToCtx(title, message, paramsCtx)
if err != nil {
return err
}
alertItem.Message = paramsCtx[alerting2.ParamMessage].(string)
alertItem.Title = paramsCtx[alerting2.ParamTitle].(string)
if newAlertMessage != nil {
alertMessage = newAlertMessage
alertMessage.Title = alertItem.Title
alertMessage.Message = alertItem.Message
err = saveAlertMessage(newAlertMessage)
if err != nil {
return fmt.Errorf("save alert message error: %w", err)
}
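Note the reordering above: the alert message (and thus its ID) is now created and written into paramsCtx before GetNotificationTitleAndMessage renders the templates, so the event_id parameter (used as {{.event_id}} in the recovery message template earlier) is populated on the first alerting notification; this appears to be what the `fix: empty param event_id` entry in the commit list refers to.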
@@ -813,10 +978,10 @@
log.Debugf("check condition result of rule %s is %v", conditionResults, rule.ID)
// if alert message status equals ignored , then skip sending message to channel
if alertMessage != nil && alertMessage.Status == alerting.MessageStateIgnored {
if alertMessage.Status == alerting.MessageStateIgnored {
return nil
}
if alertMessage != nil && paramsCtx != nil {
if paramsCtx != nil {
paramsCtx[alerting2.ParamEventID] = alertMessage.ID
}
// if channel is not enabled return
@@ -1135,12 +1300,16 @@ func collectMetricData(agg interface{}, groupValues string, metricData *[]alerti
if timeBks, ok := aggM["time_buckets"].(map[string]interface{}); ok {
if bks, ok := timeBks["buckets"].([]interface{}); ok {
md := alerting.MetricData{
Data: map[string][]alerting.TimeMetricData{},
Data: map[string][]alerting.MetricDataItem{},
GroupValues: strings.Split(groupValues, "*"),
}
for _, bk := range bks {
if bkM, ok := bk.(map[string]interface{}); ok {
var docCount int
if v, ok := bkM["doc_count"]; ok {
docCount = int(v.(float64))
}
for k, v := range bkM {
if k == "key" || k == "key_as_string" || k == "doc_count" {
continue
@@ -1150,20 +1319,20 @@
}
if vm, ok := v.(map[string]interface{}); ok {
if metricVal, ok := vm["value"]; ok {
md.Data[k] = append(md.Data[k], alerting.TimeMetricData{bkM["key"], metricVal})
md.Data[k] = append(md.Data[k], alerting.MetricDataItem{Timestamp: bkM["key"], Value: metricVal, DocCount: docCount})
} else {
//percentiles agg type
switch vm["values"].(type) {
case []interface{}:
for _, val := range vm["values"].([]interface{}) {
if valM, ok := val.(map[string]interface{}); ok {
md.Data[k] = append(md.Data[k], alerting.TimeMetricData{bkM["key"], valM["value"]})
md.Data[k] = append(md.Data[k], alerting.MetricDataItem{Timestamp: bkM["key"], Value: valM["value"], DocCount: docCount})
}
break
}
case map[string]interface{}:
for _, val := range vm["values"].(map[string]interface{}) {
md.Data[k] = append(md.Data[k], alerting.TimeMetricData{bkM["key"], val})
md.Data[k] = append(md.Data[k], alerting.MetricDataItem{Timestamp: bkM["key"], Value: val, DocCount: docCount})
break
}
}
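
For context, a sketch of one time_buckets entry as collectMetricData now consumes it, and the item it appends (shapes illustrative, assuming a single metric aggregation named "a"):

// One decoded bucket from the date histogram:
bkM := map[string]interface{}{
	"key":       float64(1739350000000),                // bucket timestamp (epoch millis)
	"doc_count": float64(42),                           // captured into MetricDataItem.DocCount above
	"a":         map[string]interface{}{"value": 42.0}, // simple metric aggregation
}

// What the loop emits for metric "a":
_ = alerting.MetricDataItem{Timestamp: bkM["key"], Value: 42.0, DocCount: 42}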