From ee87491df44eb930e621a21b8e2bb48ccd87538e Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 27 Jun 2024 11:20:05 +0800 Subject: [PATCH 1/2] fix imageinfer api bug Former-commit-id: 4efe95aa95c442f0ff0ae32756ea1e418d6be349 --- api/internal/logic/inference/imageinferencelogic.go | 2 +- api/internal/scheduler/service/aiService.go | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/api/internal/logic/inference/imageinferencelogic.go b/api/internal/logic/inference/imageinferencelogic.go index f10a3c53..e37265ab 100644 --- a/api/internal/logic/inference/imageinferencelogic.go +++ b/api/internal/logic/inference/imageinferencelogic.go @@ -115,7 +115,7 @@ func (l *ImageInferenceLogic) ImageInfer(r *http.Request, req *types.ImageInfere l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "create", "任务创建中") - go l.svcCtx.Scheduler.AiService.ImageInfer(opt, id, adapterName, clusters, ts, l.ctx) + go l.svcCtx.Scheduler.AiService.ImageInfer(opt, id, adapterName, clusters, ts) return resp, nil } diff --git a/api/internal/scheduler/service/aiService.go b/api/internal/scheduler/service/aiService.go index 1e1d9df0..6d07a8e7 100644 --- a/api/internal/scheduler/service/aiService.go +++ b/api/internal/scheduler/service/aiService.go @@ -1,8 +1,7 @@ package service import ( - "context" - "fmt" + "github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/zrpc" hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config" @@ -17,6 +16,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice" "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" + "net/http" "strconv" "sync" ) @@ -91,13 +91,14 @@ func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[st return executorMap, collectorMap } -func (as *AiService) ImageInfer(opt *option.InferOption, id int64, adapterName string, clusters []*strategy.AssignedCluster, ts []*inference.ImageFile, ctx context.Context) { +func (as *AiService) ImageInfer(opt *option.InferOption, id int64, adapterName string, clusters []*strategy.AssignedCluster, ts []*inference.ImageFile) { - res, err := inference.Infer(opt, id, adapterName, clusters, ts, as.AiCollectorAdapterMap, as.Storage, ctx) + r := http.Request{} + _, err := inference.Infer(opt, id, adapterName, clusters, ts, as.AiCollectorAdapterMap, as.Storage, r.Context()) if err != nil { + logx.Errorf(err.Error()) return } - fmt.Println(res) } //func (a *AiService) AddCluster() error { From e5582bbbcb34b709fd4b122bcc0bbb8f5b8a668f Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 27 Jun 2024 15:44:58 +0800 Subject: [PATCH 2/2] fix staticweight bug Former-commit-id: fe3017c8e0ca945d333009e9447c7b07ac8c1f4c --- .../weightDistributing/weightDistributing.go | 5 +++++ .../scheduler/service/inference/imageInfer.go | 17 ++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go b/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go index 83c15723..b1fb7d80 100644 --- a/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go +++ b/api/internal/scheduler/algorithm/weightDistributing/weightDistributing.go @@ -11,6 +11,7 @@ type Weight struct { } func DistributeReplicas(weights []*Weight, replicas int32) { + var weightSum int32 weightSum = 0 for _, w := range weights { @@ -59,6 +60,10 @@ func DistributeReplicas(weights []*Weight, replicas int32) { weightRatio[maxIdx]-- rest-- } else { + if weights[minIdx].Replica == 0 { + weightRatio[minIdx]++ + continue + } weights[minIdx].Replica-- weightRatio[minIdx]++ rest++ diff --git a/api/internal/scheduler/service/inference/imageInfer.go b/api/internal/scheduler/service/inference/imageInfer.go index 732072b2..2cf1e713 100644 --- a/api/internal/scheduler/service/inference/imageInfer.go +++ b/api/internal/scheduler/service/inference/imageInfer.go @@ -63,18 +63,23 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st } } + var mutex sync.Mutex + errMap := make(map[string]string) for _, cluster := range clusters { wg.Add(1) c := cluster go func() { imageUrls, err := collectorMap[c.ClusterId].GetInferUrl(ctx, opt) - for i, _ := range imageUrls { - imageUrls[i].Url = imageUrls[i].Url + storeLink.FORWARD_SLASH + "image" - } if err != nil { + mutex.Lock() + errMap[c.ClusterId] = err.Error() + mutex.Unlock() wg.Done() return } + for i, _ := range imageUrls { + imageUrls[i].Url = imageUrls[i].Url + storeLink.FORWARD_SLASH + "image" + } clusterName, _ := storage.GetClusterNameById(c.ClusterId) s := struct { @@ -111,6 +116,9 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st for _, t := range aiTaskList { t.Status = constants.Failed t.EndTime = time.Now().Format(time.RFC3339) + if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok { + t.Msg = errMap[strconv.Itoa(int(t.ClusterId))] + } err := storage.UpdateAiTask(t) if err != nil { logx.Errorf(err.Error()) @@ -142,6 +150,9 @@ func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*st if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) { t.Status = constants.Failed t.EndTime = time.Now().Format(time.RFC3339) + if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok { + t.Msg = errMap[strconv.Itoa(int(t.ClusterId))] + } err := storage.UpdateAiTask(t) if err != nil { logx.Errorf(err.Error())