diff --git a/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go b/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go index 83c15723..b1fb7d80 100644 --- a/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go +++ b/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go @@ -11,6 +11,7 @@ type Weight struct { } func DistributeReplicas(weights []*Weight, replicas int32) { + var weightSum int32 weightSum = 0 for _, w := range weights { @@ -59,6 +60,10 @@ func DistributeReplicas(weights []*Weight, replicas int32) { weightRatio[maxIdx]-- rest-- } else { + if weights[minIdx].Replica == 0 { + weightRatio[minIdx]++ + continue + } weights[minIdx].Replica-- weightRatio[minIdx]++ rest++ diff --git a/api/internal/scheduler/service/inference/imageInfer.go b/api/internal/scheduler/service/inference/imageInfer.go index 732072b2..2cf1e713 100644 --- a/api/internal/scheduler/service/inference/imageInfer.go +++ b/api/internal/scheduler/service/inference/imageInfer.go @@ -63,18 +63,23 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st } } + var mutex sync.Mutex + errMap := make(map[string]string) for _, cluster := range clusters { wg.Add(1) c := cluster go func() { imageUrls, err := collectorMap[c.ClusterId].GetInferUrl(ctx, opt) - for i, _ := range imageUrls { - imageUrls[i].Url = imageUrls[i].Url + storeLink.FORWARD_SLASH + "image" - } if err != nil { + mutex.Lock() + errMap[c.ClusterId] = err.Error() + mutex.Unlock() wg.Done() return } + for i, _ := range imageUrls { + imageUrls[i].Url = imageUrls[i].Url + storeLink.FORWARD_SLASH + "image" + } clusterName, _ := storage.GetClusterNameById(c.ClusterId) s := struct { @@ -111,6 +116,9 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st for _, t := range aiTaskList { t.Status = constants.Failed t.EndTime = time.Now().Format(time.RFC3339) + if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok { + t.Msg = errMap[strconv.Itoa(int(t.ClusterId))] + } err := storage.UpdateAiTask(t) if err != nil { logx.Errorf(err.Error()) @@ -142,6 +150,9 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) { t.Status = constants.Failed t.EndTime = time.Now().Format(time.RFC3339) + if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok { + t.Msg = errMap[strconv.Itoa(int(t.ClusterId))] + } err := storage.UpdateAiTask(t) if err != nil { logx.Errorf(err.Error())