From 3825703edea016d967beb8733aef73bb10a71c41 Mon Sep 17 00:00:00 2001 From: zhangwei <894646498@qq.com> Date: Thu, 16 May 2024 16:46:02 +0800 Subject: [PATCH 01/14] modify path Former-commit-id: e65fecbe3d93ddbc8787e23a2fea7f83fd3b2ea5 --- api/desc/pcm.api | 2 +- api/internal/handler/routes.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 7278fae0..2cb1e06c 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -1032,7 +1032,7 @@ service pcm { get /monitoring/alert/rule (AlertRulesReq) returns (AlertRulesResp) @handler DeleteAlertRuleHandler - delete /cloud/alert/rule (DeleteAlertRuleReq) + delete /monitoring/alert/rule (DeleteAlertRuleReq) @doc "cluster resource load" @handler clustersLoadHandler diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 03a8dab5..11006a83 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1304,7 +1304,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodDelete, - Path: "/cloud/alert/rule", + Path: "/monitoring/alert/rule", Handler: monitoring.DeleteAlertRuleHandler(serverCtx), }, { From 62ecd4e2e45167839c92501c164b7aee071d8db2 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 16 May 2024 16:49:50 +0800 Subject: [PATCH 02/14] fix aischeduler bugs Former-commit-id: 887857a40bd3492ac3a412902893cb3990048677 --- .../scheduler/schedulers/aiScheduler.go | 59 ++++++++++--------- api/internal/storeLink/octopus.go | 7 ++- api/internal/storeLink/shuguangai.go | 8 ++- 3 files changed, 45 insertions(+), 29 deletions(-) diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index 50cf27ca..84b2fe35 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -130,39 +130,44 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa var wg sync.WaitGroup var results []*AiResult var errs []interface{} - var ch = make(chan *AiResult, len(clusters)) - var errCh = make(chan interface{}, len(clusters)) + var taskNum int32 + for _, cluster := range clusters { + taskNum += cluster.Replicas + } + var ch = make(chan *AiResult, taskNum) + var errCh = make(chan interface{}, taskNum) executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId] for _, cluster := range clusters { c := cluster - wg.Add(1) - go func() { - opt, _ := cloneAiOption(as.option) - resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt) - - if err != nil { - e := struct { - err error - clusterId string - }{ - err: err, - clusterId: c.ClusterId, + for i := 0; i < int(c.Replicas); i++ { + wg.Add(1) + go func() { + opt, _ := cloneAiOption(as.option) + resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt) + if err != nil { + e := struct { + err error + clusterId string + }{ + err: err, + clusterId: c.ClusterId, + } + errCh <- e + wg.Done() + return } - errCh <- e + + result, _ := convertType(resp) + result.Replica = c.Replicas + result.ClusterId = c.ClusterId + result.Strategy = as.option.StrategyName + result.Card = opt.ComputeCard + + ch <- result wg.Done() - return - } - - result, _ := convertType(resp) - result.Replica = c.Replicas - result.ClusterId = c.ClusterId - result.Strategy = as.option.StrategyName - result.Card = opt.ComputeCard - - ch <- result - wg.Done() - }() + }() + } } wg.Wait() close(ch) diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index ff4c9f32..97f949d6 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -591,7 +591,10 @@ func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiO } if option.ResourceType == CARD { - err = setResourceIdByCard(option, specResp, GCU) + if option.ComputeCard == "" { + option.ComputeCard = GCU + } + err = setResourceIdByCard(option, specResp, option.ComputeCard) if err != nil { return err } @@ -742,6 +745,8 @@ func (o *OctopusLink) generateCmd(option *option.AiOption) error { switch option.ComputeCard { case GCU: option.Cmd = "cd /code; python3 train.py" + case MLU: + option.Cmd = "su root; cd /torch/venv3/pytorch/bin; source activate; cd /code; python train.py" default: option.Cmd = TRAIN_CMD } diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index b2be408c..415f14b1 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -33,7 +33,7 @@ import ( const ( RAM_SIZE_1G = 1024 // 1G WORKER_NUMBER = 1 - DCU = "dcu" + DCU = "DCU" DCU_TOPS = 24.5 PYTORCH = "Pytorch" TASK_PYTORCH_PREFIX = "PytorchTask" @@ -570,7 +570,13 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error { } if option.ResourceType == CARD { + if option.ComputeCard == "" { + option.ComputeCard = DCU + } + if strings.ToUpper(option.ComputeCard) != DCU { + return errors.New("computeCard not found") + } option.ComputeCard = DCU if 0 <= option.Tops && option.Tops <= DCU_TOPS { From 70a8f4805872b5c5d3ea5d3f7dd52a8335da6033 Mon Sep 17 00:00:00 2001 From: jagger Date: Thu, 16 May 2024 17:50:51 +0800 Subject: [PATCH 03/14] fix Signed-off-by: jagger Former-commit-id: 7898052cec2ba197689bca016811a5f98220f3c4 --- api/Dockerfile | 3 +- api/internal/logic/ai/createalgorithmlogic.go | 2 +- api/internal/logic/ai/createdatasetlogic.go | 2 +- .../logic/ai/createexporttasklogic.go | 2 +- api/internal/logic/ai/createnotebooklogic.go | 2 +- .../logic/ai/createprocessortasklogic.go | 2 +- api/internal/logic/ai/createservicelogic.go | 2 +- api/internal/logic/ai/createtasklogic.go | 2 +- .../logic/ai/createtrainingjoblogic.go | 2 +- api/internal/logic/ai/deletealgorithmlogic.go | 2 +- api/internal/logic/ai/deletedatasetlogic.go | 2 +- api/internal/logic/ai/deleteservicelogic.go | 2 +- .../logic/ai/deletetrainingjoblogic.go | 2 +- .../logic/ai/getexporttasksofdatasetlogic.go | 2 +- .../ai/getexporttaskstatusofdatasetlogic.go | 2 +- .../logic/ai/getlisttrainingjobslogic.go | 2 +- api/internal/logic/ai/listalgorithmslogic.go | 2 +- api/internal/logic/ai/listclusterslogic.go | 2 +- api/internal/logic/ai/listdatasetlogic.go | 2 +- api/internal/logic/ai/listimportlogic.go | 2 +- api/internal/logic/ai/listnotebooklogic.go | 2 +- api/internal/logic/ai/listserviceslogic.go | 2 +- .../logic/ai/showalgorithmbyuuidlogic.go | 2 +- api/internal/logic/ai/showservicelogic.go | 2 +- api/internal/scheduler/service/aiService.go | 4 +-- api/internal/storeLink/modelarts.go | 29 ++++++++++++++++--- api/internal/storeLink/storeLink.go | 6 ++-- api/internal/svc/servicecontext.go | 4 +-- go.mod | 2 +- go.sum | 4 +-- 30 files changed, 59 insertions(+), 39 deletions(-) diff --git a/api/Dockerfile b/api/Dockerfile index a4372ef0..97053deb 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -6,8 +6,7 @@ COPY . . RUN go env -w GO111MODULE=on \ && go env -w GOPROXY=https://goproxy.cn,direct \ -&& go env -w CGO_ENABLED=0 \ -&& go mod download +&& go env -w CGO_ENABLED=0 RUN go build -o pcm-coordinator-api /app/api/pcm.go FROM alpine:3.18 diff --git a/api/internal/logic/ai/createalgorithmlogic.go b/api/internal/logic/ai/createalgorithmlogic.go index f7efe89a..bd46ddb7 100644 --- a/api/internal/logic/ai/createalgorithmlogic.go +++ b/api/internal/logic/ai/createalgorithmlogic.go @@ -22,7 +22,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" ) diff --git a/api/internal/logic/ai/createdatasetlogic.go b/api/internal/logic/ai/createdatasetlogic.go index 9d5741eb..bff8de2f 100644 --- a/api/internal/logic/ai/createdatasetlogic.go +++ b/api/internal/logic/ai/createdatasetlogic.go @@ -22,7 +22,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "github.com/zeromicro/go-zero/core/logx" ) diff --git a/api/internal/logic/ai/createexporttasklogic.go b/api/internal/logic/ai/createexporttasklogic.go index 47390243..f2e7e67b 100644 --- a/api/internal/logic/ai/createexporttasklogic.go +++ b/api/internal/logic/ai/createexporttasklogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" error2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/error" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/createnotebooklogic.go b/api/internal/logic/ai/createnotebooklogic.go index 3859e58f..9d7384b1 100644 --- a/api/internal/logic/ai/createnotebooklogic.go +++ b/api/internal/logic/ai/createnotebooklogic.go @@ -22,7 +22,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "github.com/zeromicro/go-zero/core/logx" ) diff --git a/api/internal/logic/ai/createprocessortasklogic.go b/api/internal/logic/ai/createprocessortasklogic.go index b79b4f82..e071fe54 100644 --- a/api/internal/logic/ai/createprocessortasklogic.go +++ b/api/internal/logic/ai/createprocessortasklogic.go @@ -22,7 +22,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "github.com/zeromicro/go-zero/core/logx" ) diff --git a/api/internal/logic/ai/createservicelogic.go b/api/internal/logic/ai/createservicelogic.go index e99acb27..ed02a7b5 100644 --- a/api/internal/logic/ai/createservicelogic.go +++ b/api/internal/logic/ai/createservicelogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/createtasklogic.go b/api/internal/logic/ai/createtasklogic.go index 1e94d4c4..25bacfcf 100644 --- a/api/internal/logic/ai/createtasklogic.go +++ b/api/internal/logic/ai/createtasklogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/createtrainingjoblogic.go b/api/internal/logic/ai/createtrainingjoblogic.go index e4565187..983e71f4 100644 --- a/api/internal/logic/ai/createtrainingjoblogic.go +++ b/api/internal/logic/ai/createtrainingjoblogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" error2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/error" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/deletealgorithmlogic.go b/api/internal/logic/ai/deletealgorithmlogic.go index 1093def0..3297ee84 100644 --- a/api/internal/logic/ai/deletealgorithmlogic.go +++ b/api/internal/logic/ai/deletealgorithmlogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "strings" diff --git a/api/internal/logic/ai/deletedatasetlogic.go b/api/internal/logic/ai/deletedatasetlogic.go index 6c9fe0b9..853b8d86 100644 --- a/api/internal/logic/ai/deletedatasetlogic.go +++ b/api/internal/logic/ai/deletedatasetlogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/deleteservicelogic.go b/api/internal/logic/ai/deleteservicelogic.go index 337d1aeb..c6c6b51b 100644 --- a/api/internal/logic/ai/deleteservicelogic.go +++ b/api/internal/logic/ai/deleteservicelogic.go @@ -19,7 +19,7 @@ import ( "github.com/jinzhu/copier" error2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/error" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/logic/ai/deletetrainingjoblogic.go b/api/internal/logic/ai/deletetrainingjoblogic.go index bf9b62b9..37792f27 100644 --- a/api/internal/logic/ai/deletetrainingjoblogic.go +++ b/api/internal/logic/ai/deletetrainingjoblogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" error2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/error" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/getexporttasksofdatasetlogic.go b/api/internal/logic/ai/getexporttasksofdatasetlogic.go index baacd49f..3d053fa2 100644 --- a/api/internal/logic/ai/getexporttasksofdatasetlogic.go +++ b/api/internal/logic/ai/getexporttasksofdatasetlogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/getexporttaskstatusofdatasetlogic.go b/api/internal/logic/ai/getexporttaskstatusofdatasetlogic.go index a6e74d70..2caae709 100644 --- a/api/internal/logic/ai/getexporttaskstatusofdatasetlogic.go +++ b/api/internal/logic/ai/getexporttaskstatusofdatasetlogic.go @@ -19,7 +19,7 @@ import ( "github.com/jinzhu/copier" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/logic/ai/getlisttrainingjobslogic.go b/api/internal/logic/ai/getlisttrainingjobslogic.go index dbe104c8..12316436 100644 --- a/api/internal/logic/ai/getlisttrainingjobslogic.go +++ b/api/internal/logic/ai/getlisttrainingjobslogic.go @@ -23,7 +23,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/listalgorithmslogic.go b/api/internal/logic/ai/listalgorithmslogic.go index 1807ac06..3e4eaa26 100644 --- a/api/internal/logic/ai/listalgorithmslogic.go +++ b/api/internal/logic/ai/listalgorithmslogic.go @@ -19,7 +19,7 @@ import ( "github.com/jinzhu/copier" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/logic/ai/listclusterslogic.go b/api/internal/logic/ai/listclusterslogic.go index 47352fa8..2ee7057c 100644 --- a/api/internal/logic/ai/listclusterslogic.go +++ b/api/internal/logic/ai/listclusterslogic.go @@ -20,7 +20,7 @@ import ( "github.com/pkg/errors" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" diff --git a/api/internal/logic/ai/listdatasetlogic.go b/api/internal/logic/ai/listdatasetlogic.go index 395dfa7f..6089719a 100644 --- a/api/internal/logic/ai/listdatasetlogic.go +++ b/api/internal/logic/ai/listdatasetlogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" error2 "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/error" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/logic/ai/listimportlogic.go b/api/internal/logic/ai/listimportlogic.go index 5b7ecc0b..e03fabe7 100644 --- a/api/internal/logic/ai/listimportlogic.go +++ b/api/internal/logic/ai/listimportlogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/listnotebooklogic.go b/api/internal/logic/ai/listnotebooklogic.go index 76aeaa23..24808982 100644 --- a/api/internal/logic/ai/listnotebooklogic.go +++ b/api/internal/logic/ai/listnotebooklogic.go @@ -22,7 +22,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "github.com/zeromicro/go-zero/core/logx" ) diff --git a/api/internal/logic/ai/listserviceslogic.go b/api/internal/logic/ai/listserviceslogic.go index 305cc52f..ad3cc184 100644 --- a/api/internal/logic/ai/listserviceslogic.go +++ b/api/internal/logic/ai/listserviceslogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "github.com/zeromicro/go-zero/core/logx" diff --git a/api/internal/logic/ai/showalgorithmbyuuidlogic.go b/api/internal/logic/ai/showalgorithmbyuuidlogic.go index 9e915026..6cf7c127 100644 --- a/api/internal/logic/ai/showalgorithmbyuuidlogic.go +++ b/api/internal/logic/ai/showalgorithmbyuuidlogic.go @@ -21,7 +21,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/xerr" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/logic/ai/showservicelogic.go b/api/internal/logic/ai/showservicelogic.go index cc428154..19136a4c 100644 --- a/api/internal/logic/ai/showservicelogic.go +++ b/api/internal/logic/ai/showservicelogic.go @@ -19,7 +19,7 @@ import ( "github.com/jinzhu/copier" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "k8s.io/apimachinery/pkg/util/json" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" diff --git a/api/internal/scheduler/service/aiService.go b/api/internal/scheduler/service/aiService.go index 93188f9d..45b6da6d 100644 --- a/api/internal/scheduler/service/aiService.go +++ b/api/internal/scheduler/service/aiService.go @@ -9,9 +9,9 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" "strconv" ) diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 5843eeff..2186addb 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -16,12 +16,13 @@ package storeLink import ( "context" + "github.com/pkg/errors" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "strconv" "strings" ) @@ -159,7 +160,27 @@ func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.Data } func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) { - return nil, nil + var algorithms []*collector.Algorithm + + req := &modelarts.ListAlgorithmsReq{ + Platform: m.platform, + Offset: m.pageIndex, + Limit: m.pageSize, + } + resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req) + if err != nil { + return nil, err + } + if resp.ErrorMsg != "" { + return nil, errors.New("failed to get algorithms") + } + + for _, a := range resp.Items { + //TODO The value of taskType is temporarily fixed to "pytorch" + algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"} + algorithms = append(algorithms, algorithm) + } + return algorithms, nil } func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) { diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index 2fbf2c12..e3e86a46 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -25,11 +25,11 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" "gitlink.org.cn/JointCloud/pcm-octopus/octopus" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/modelarts" "gorm.io/gorm" "strings" "sync" diff --git a/api/internal/svc/servicecontext.go b/api/internal/svc/servicecontext.go index 4bbc6a27..5460862d 100644 --- a/api/internal/svc/servicecontext.go +++ b/api/internal/svc/servicecontext.go @@ -30,12 +30,12 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/rpc/client/participantservice" "gitlink.org.cn/JointCloud/pcm-kubernetes/kubernetesclient" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice" + "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" "gitlink.org.cn/JointCloud/pcm-openstack/openstackclient" slurmClient "gitlink.org.cn/JointCloud/pcm-slurm/slurmclient" "gitlink.org.cn/jcce-pcm/pcm-participant-ceph/cephclient" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" - "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" "gorm.io/driver/mysql" "gorm.io/gorm" "gorm.io/gorm/logger" diff --git a/go.mod b/go.mod index bf50e2ee..1030a0fb 100644 --- a/go.mod +++ b/go.mod @@ -26,11 +26,11 @@ require ( github.com/zeromicro/go-zero v1.6.3 gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c + gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240515005224-689bb339a9c9 gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d - gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090 go.opentelemetry.io/otel/trace v1.25.0 gonum.org/v1/gonum v0.11.0 google.golang.org/grpc v1.63.0 diff --git a/go.sum b/go.sum index e1d1ec5b..e05295cb 100644 --- a/go.sum +++ b/go.sum @@ -1082,6 +1082,8 @@ gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece h1:W3yBnvAVV gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240515005224-689bb339a9c9 h1:FRtOtI9vDFHcyPUdc4PL95CFi/DFk+HXT6JNTf/91d8= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240515005224-689bb339a9c9/go.mod h1:2WC5tDApfQNNIBfDNYwdaQiXhfCsG2n03P3ZxX9p9O4= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= @@ -1090,8 +1092,6 @@ gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnz gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5/go.mod h1:97AlUXN13g9UN3+9/DzCHpeoU5sbdyv0IQuTEHNexzQ= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d h1:DHjl/rLuH2gKYtY0MKMGNQDHFT12APg25RlMUQo+tHk= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30= -gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090 h1:jztlHo72bcWM1jUwvG3Hfk2K+AJL0RvlsdIqlktH/MI= -gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090/go.mod h1:pisJKAI8FRFFUcBaH3Gob+ENXWRM97rpuYmv9s1raag= go.etcd.io/etcd/api/v3 v3.5.7/go.mod h1:9qew1gCdDDLu+VwmeG+iFpL+QlpHTo7iubavdVDgCAA= go.etcd.io/etcd/api/v3 v3.5.13 h1:8WXU2/NBge6AUF1K1gOexB6e07NgsN1hXK0rSTtgSp4= go.etcd.io/etcd/api/v3 v3.5.13/go.mod h1:gBqlqkcMMZMVTMm4NDZloEVJzxQOQIls8splbqBDa0c= From f8a4656d83b34d895cbde7644173789dfd2dffd7 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 16 May 2024 18:17:57 +0800 Subject: [PATCH 04/14] fix aischeduler bugs Former-commit-id: 4df953f2a614800510376210d69f82a646cae02f --- api/internal/scheduler/schedulers/aiScheduler.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index 84b2fe35..39b7e427 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -129,6 +129,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa var wg sync.WaitGroup var results []*AiResult + var mu sync.Mutex var errs []interface{} var taskNum int32 for _, cluster := range clusters { @@ -158,7 +159,10 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa return } + mu.Lock() result, _ := convertType(resp) + mu.Unlock() + result.Replica = c.Replicas result.ClusterId = c.ClusterId result.Strategy = as.option.StrategyName From 974fcf0590caa0d90c8d71d4deec1bae9051d21c Mon Sep 17 00:00:00 2001 From: jagger Date: Fri, 17 May 2024 16:53:22 +0800 Subject: [PATCH 05/14] fix bug Signed-off-by: jagger Former-commit-id: a8b52619ad241f5c4069b2805b87c752303289b4 --- api/desc/core/pcm-core.api | 3 ++- api/internal/types/types.go | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 7daa303c..4c42c45b 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -408,7 +408,8 @@ type ( DeletedAt string `json:"deletedAt,omitempty" gorm:"index" db:"deleted_at"` NsID string `json:"nsId,omitempty" db:"ns_id"` TenantId string `json:"tenantId,omitempty" db:"tenant_id"` - CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"` + CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"` + UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time" gorm:"autoUpdateTime"` AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } ) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 317a225d..a90e9057 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -344,7 +344,8 @@ type TaskModel struct { DeletedAt string `json:"deletedAt,omitempty" gorm:"index" db:"deleted_at"` NsID string `json:"nsId,omitempty" db:"ns_id"` TenantId string `json:"tenantId,omitempty" db:"tenant_id"` - CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"` + CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"` + UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time" gorm:"autoUpdateTime"` AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } From 258258b04fa5248b89e9cfe6377fbb345c2e08aa Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 17 May 2024 17:21:31 +0800 Subject: [PATCH 06/14] fix submit bugs Former-commit-id: b3b4da6986f71dfee5fdb682866aabd43b830a37 --- api/internal/logic/core/pagelisttasklogic.go | 98 +++++++++++-------- .../logic/schedule/schedulesubmitlogic.go | 1 + api/internal/scheduler/common/common.go | 9 ++ .../scheduler/schedulers/aiScheduler.go | 19 ++++ api/internal/storeLink/octopus.go | 2 +- 5 files changed, 85 insertions(+), 44 deletions(-) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index ec1b8945..8877a03a 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -165,53 +165,65 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- } func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { + for i := len(tasks) - 1; i >= 0; i-- { + if tasks[i].AdapterTypeDict == 0 || tasks[i].Status == constants.Succeeded || tasks[i].Status == constants.Failed { + tasks = append(tasks[:i], tasks[i+1:]...) + } + } + + if len(tasks) == 0 { + ch <- struct{}{} + return + } + + task := tasks[0] + for i, _ := range tasks { + earliest, _ := time.Parse(constants.Layout, task.UpdatedTime) + latest, _ := time.Parse(constants.Layout, tasks[i].UpdatedTime) + if earliest.Before(latest) { + task = tasks[i] + } + } + + var aiTaskList []*models.TaskAi + tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + return + } + + if len(aiTaskList) == 0 { + ch <- struct{}{} + return + } + var wg sync.WaitGroup - for _, task := range tasks { - if task.AdapterTypeDict != 1 { + for _, aitask := range aiTaskList { + t := aitask + if t.Status == constants.Completed { continue } - if task.Status == constants.Succeeded || task.Status == constants.Failed { - continue - } - - var aiTaskList []*models.TaskAi - tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList) - if tx.Error != nil { - logx.Errorf(tx.Error.Error()) - return - } - - if len(aiTaskList) == 0 { - continue - } - - for _, aitask := range aiTaskList { - t := aitask - if t.Status == constants.Completed { - continue - } - wg.Add(1) - go func() { - trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) - if err != nil { - msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) - logx.Errorf(errors.New(msg).Error()) - wg.Done() - return - } - t.Status = trainingTask.Status - t.StartTime = trainingTask.Start - t.EndTime = trainingTask.End - err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t) - if err != nil { - msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) - logx.Errorf(errors.New(msg).Error()) - wg.Done() - return - } + wg.Add(1) + go func() { + trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) + if err != nil { + msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) + logx.Errorf(errors.New(msg).Error()) wg.Done() - }() - } + return + } + t.Status = trainingTask.Status + t.StartTime = trainingTask.Start + t.EndTime = trainingTask.End + err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t) + if err != nil { + msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error()) + logx.Errorf(errors.New(msg).Error()) + wg.Done() + return + } + wg.Done() + }() } wg.Wait() ch <- struct{}{} diff --git a/api/internal/logic/schedule/schedulesubmitlogic.go b/api/internal/logic/schedule/schedulesubmitlogic.go index 98f5f916..5ce7a6ee 100644 --- a/api/internal/logic/schedule/schedulesubmitlogic.go +++ b/api/internal/logic/schedule/schedulesubmitlogic.go @@ -31,6 +31,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type resp = &types.ScheduleResp{} opt := &option.AiOption{ AdapterId: req.AiOption.AdapterId, + ClusterIds: req.AiOption.AiClusterIds, TaskName: req.AiOption.TaskName, ResourceType: req.AiOption.ResourceType, Replica: req.AiOption.Replica, diff --git a/api/internal/scheduler/common/common.go b/api/internal/scheduler/common/common.go index ce2ee5e7..68bfaa32 100644 --- a/api/internal/scheduler/common/common.go +++ b/api/internal/scheduler/common/common.go @@ -88,3 +88,12 @@ func RoundFloat(val float64, precision uint) float64 { ratio := math.Pow(10, float64(precision)) return math.Round(val*ratio) / ratio } + +func Contains(s []string, e string) bool { + for _, a := range s { + if a == e { + return true + } + } + return false +} diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index 39b7e427..f3bd56e1 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -22,6 +22,7 @@ import ( "github.com/zeromicro/go-zero/core/logx" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" @@ -67,6 +68,24 @@ func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource strin } func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { + if as.option.ComputeCard != "" { + m, ok := as.AiService.AiCollectorAdapterMap[as.option.AdapterId] + if ok { + for _, id := range as.option.ClusterIds { + cm, ok := m[id] + if ok { + cards, err := cm.GetComputeCards(as.ctx) + if err != nil { + return nil, err + } + if common.Contains(cards, as.option.ComputeCard) { + return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: id, Replicas: 1}}, nil + } + } + } + } + } + if len(as.option.ClusterIds) == 1 { return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil } diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 97f949d6..4457ccd2 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -746,7 +746,7 @@ func (o *OctopusLink) generateCmd(option *option.AiOption) error { case GCU: option.Cmd = "cd /code; python3 train.py" case MLU: - option.Cmd = "su root; cd /torch/venv3/pytorch/bin; source activate; cd /code; python train.py" + option.Cmd = ". /torch/venv3/pytorch/bin/activate; cd /code; python train.py" default: option.Cmd = TRAIN_CMD } From f42a48b27f1d744e51d6e7f4cea2b3a9baa5147b Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 17 May 2024 17:38:48 +0800 Subject: [PATCH 07/14] fix submit bugs Former-commit-id: 249398644e9177b761dc6d47c1a2c795effd29af --- api/internal/logic/core/pagelisttasklogic.go | 1 + 1 file changed, 1 insertion(+) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index 8877a03a..abcbde00 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -155,6 +155,7 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- task.EndTime = end.Format(constants.Layout) } + task.UpdatedTime = time.Now().Format(constants.Layout) tx = l.svcCtx.DbEngin.Table("task").Updates(task) if tx.Error != nil { logx.Errorf(tx.Error.Error()) From ce0d5a9b323e2d90ce0719be35ad5e99df530097 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 17 May 2024 19:43:25 +0800 Subject: [PATCH 08/14] fix db bugs Former-commit-id: 55a997bc7669f4b207da153a3fdd1cb3b1b2935d --- api/internal/logic/core/pagelisttasklogic.go | 22 ++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index abcbde00..f2f6d9e8 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -80,8 +80,6 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa for _, ch := range chs { select { case <-ch: - case <-time.After(2 * time.Second): - return } } return @@ -107,6 +105,26 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- continue } + if len(aiTask) == 1 { + task.Status = aiTask[0].Status + task.StartTime = aiTask[0].StartTime + task.EndTime = aiTask[0].EndTime + tx = l.svcCtx.DbEngin.Model(&types.TaskModel{}).Table("task") + tx = tx.Where("deleted_at is null") + tx = tx.Updates(task) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + return + } + continue + } + + for i := len(aiTask) - 1; i >= 0; i-- { + if aiTask[i].StartTime == "" { + aiTask = append(aiTask[:i], aiTask[i+1:]...) + } + } + start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) From a977f63f6374a3ec832a8a35dfef22c07e1ff78f Mon Sep 17 00:00:00 2001 From: devad Date: Fri, 17 May 2024 21:06:32 +0800 Subject: [PATCH 09/14] fix Former-commit-id: 443ebf0046a411542e28d142c4784a4be8a827ae --- api/desc/core/pcm-core.api | 2 +- api/internal/logic/core/pagelisttasklogic.go | 6 ++---- api/internal/types/types.go | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 4c42c45b..9d9a5a20 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -409,7 +409,7 @@ type ( NsID string `json:"nsId,omitempty" db:"ns_id"` TenantId string `json:"tenantId,omitempty" db:"tenant_id"` CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"` - UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time" gorm:"autoUpdateTime"` + UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time"` AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } ) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index f2f6d9e8..09b731df 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -109,9 +109,7 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- task.Status = aiTask[0].Status task.StartTime = aiTask[0].StartTime task.EndTime = aiTask[0].EndTime - tx = l.svcCtx.DbEngin.Model(&types.TaskModel{}).Table("task") - tx = tx.Where("deleted_at is null") - tx = tx.Updates(task) + tx = l.svcCtx.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task) if tx.Error != nil { logx.Errorf(tx.Error.Error()) return @@ -174,7 +172,7 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- } task.UpdatedTime = time.Now().Format(constants.Layout) - tx = l.svcCtx.DbEngin.Table("task").Updates(task) + tx = l.svcCtx.DbEngin.Table("task").Model(task).Updates(task) if tx.Error != nil { logx.Errorf(tx.Error.Error()) return diff --git a/api/internal/types/types.go b/api/internal/types/types.go index a90e9057..54a3c5d1 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -345,7 +345,7 @@ type TaskModel struct { NsID string `json:"nsId,omitempty" db:"ns_id"` TenantId string `json:"tenantId,omitempty" db:"tenant_id"` CreatedTime string `json:"createdTime,omitempty" db:"created_time" gorm:"autoCreateTime"` - UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time" gorm:"autoUpdateTime"` + UpdatedTime string `json:"updatedTime,omitempty" db:"updated_time"` AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } From 6c724b4ab96e6d1e83560e5214b64a40fe7fbf81 Mon Sep 17 00:00:00 2001 From: tzwang Date: Sun, 19 May 2024 22:52:41 +0800 Subject: [PATCH 10/14] fix task updatedtime bugs Former-commit-id: 226201747e9a5aabd47cb93a57d66448df3248b5 --- .../logic/ai/getcenteroverviewlogic.go | 2 +- .../logic/ai/getcentertasklistlogic.go | 2 +- api/internal/logic/core/pagelisttasklogic.go | 214 ++++++++++-------- 3 files changed, 117 insertions(+), 101 deletions(-) diff --git a/api/internal/logic/ai/getcenteroverviewlogic.go b/api/internal/logic/ai/getcenteroverviewlogic.go index d2669709..a4ecf111 100644 --- a/api/internal/logic/ai/getcenteroverviewlogic.go +++ b/api/internal/logic/ai/getcenteroverviewlogic.go @@ -76,7 +76,7 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview select { case _ = <-ch: return resp, nil - case <-time.After(2 * time.Second): + case <-time.After(1 * time.Second): return resp, nil } diff --git a/api/internal/logic/ai/getcentertasklistlogic.go b/api/internal/logic/ai/getcentertasklistlogic.go index 28393c71..bbae384b 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -101,7 +101,7 @@ func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- } for _, task := range taskList { t := task - if t.Status == constants.Completed || task.Status == constants.Failed { + if t.Status == constants.Completed || task.Status == constants.Failed || task.Status == constants.Stopped { continue } wg.Add(1) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index 09b731df..ba0c4382 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -86,104 +86,8 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa } func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { - for _, task := range tasks { - if task.AdapterTypeDict != 1 { - continue - } - if task.Status == constants.Succeeded || task.Status == constants.Failed { - continue - } - - var aiTask []*models.TaskAi - tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTask) - if tx.Error != nil { - logx.Errorf(tx.Error.Error()) - return - } - - if len(aiTask) == 0 { - continue - } - - if len(aiTask) == 1 { - task.Status = aiTask[0].Status - task.StartTime = aiTask[0].StartTime - task.EndTime = aiTask[0].EndTime - tx = l.svcCtx.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task) - if tx.Error != nil { - logx.Errorf(tx.Error.Error()) - return - } - continue - } - - for i := len(aiTask) - 1; i >= 0; i-- { - if aiTask[i].StartTime == "" { - aiTask = append(aiTask[:i], aiTask[i+1:]...) - } - } - - start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) - end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) - - var status string - var count int - for _, a := range aiTask { - s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local) - e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local) - - if s.Before(start) { - start = s - } - - if e.After(end) { - end = e - } - - if a.Status == constants.Failed { - status = a.Status - break - } - - if a.Status == constants.Pending { - status = a.Status - continue - } - - if a.Status == constants.Running { - status = a.Status - continue - } - - if a.Status == constants.Completed { - count++ - continue - } - } - - if count == len(aiTask) { - status = constants.Succeeded - } - - if status != "" { - task.Status = status - task.StartTime = start.Format(constants.Layout) - task.EndTime = end.Format(constants.Layout) - } - - task.UpdatedTime = time.Now().Format(constants.Layout) - tx = l.svcCtx.DbEngin.Table("task").Model(task).Updates(task) - if tx.Error != nil { - logx.Errorf(tx.Error.Error()) - return - } - } - ch <- struct{}{} -} - -func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { for i := len(tasks) - 1; i >= 0; i-- { - if tasks[i].AdapterTypeDict == 0 || tasks[i].Status == constants.Succeeded || tasks[i].Status == constants.Failed { + if tasks[i].AdapterTypeDict != 1 || tasks[i].Status == constants.Succeeded || tasks[i].Status == constants.Failed { tasks = append(tasks[:i], tasks[i+1:]...) } } @@ -197,7 +101,118 @@ func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan for i, _ := range tasks { earliest, _ := time.Parse(constants.Layout, task.UpdatedTime) latest, _ := time.Parse(constants.Layout, tasks[i].UpdatedTime) - if earliest.Before(latest) { + if latest.Before(earliest) { + task = tasks[i] + } + } + + var aiTask []*models.TaskAi + tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTask) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + ch <- struct{}{} + return + } + + if len(aiTask) == 0 { + ch <- struct{}{} + return + } + + if len(aiTask) == 1 { + task.Status = aiTask[0].Status + task.StartTime = aiTask[0].StartTime + task.EndTime = aiTask[0].EndTime + tx = l.svcCtx.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + return + } + ch <- struct{}{} + return + } + + for i := len(aiTask) - 1; i >= 0; i-- { + if aiTask[i].StartTime == "" { + aiTask = append(aiTask[:i], aiTask[i+1:]...) + } + } + + start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) + end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) + + var status string + var count int + for _, a := range aiTask { + s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local) + e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local) + + if s.Before(start) { + start = s + } + + if e.After(end) { + end = e + } + + if a.Status == constants.Failed { + status = a.Status + break + } + + if a.Status == constants.Pending { + status = a.Status + continue + } + + if a.Status == constants.Running { + status = a.Status + continue + } + + if a.Status == constants.Completed { + count++ + continue + } + } + + if count == len(aiTask) { + status = constants.Succeeded + } + + if status != "" { + task.Status = status + task.StartTime = start.Format(constants.Layout) + task.EndTime = end.Format(constants.Layout) + } + + task.UpdatedTime = time.Now().Format(constants.Layout) + tx = l.svcCtx.DbEngin.Table("task").Model(task).Updates(task) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + ch <- struct{}{} + return + } + ch <- struct{}{} +} + +func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) { + for i := len(tasks) - 1; i >= 0; i-- { + if tasks[i].AdapterTypeDict != 1 || tasks[i].Status == constants.Succeeded || tasks[i].Status == constants.Failed { + tasks = append(tasks[:i], tasks[i+1:]...) + } + } + + if len(tasks) == 0 { + ch <- struct{}{} + return + } + + task := tasks[0] + for i, _ := range tasks { + earliest, _ := time.Parse(constants.Layout, task.UpdatedTime) + latest, _ := time.Parse(constants.Layout, tasks[i].UpdatedTime) + if latest.Before(earliest) { task = tasks[i] } } @@ -206,6 +221,7 @@ func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList) if tx.Error != nil { logx.Errorf(tx.Error.Error()) + ch <- struct{}{} return } @@ -217,7 +233,7 @@ func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan var wg sync.WaitGroup for _, aitask := range aiTaskList { t := aitask - if t.Status == constants.Completed { + if t.Status == constants.Completed || t.Status == constants.Failed { continue } wg.Add(1) From 79db3cd44413a787635961978f76eb6490197d84 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 20 May 2024 11:06:19 +0800 Subject: [PATCH 11/14] fix task time bugs Former-commit-id: ea14c8fc26cce49b1151d9d2cec9500c54f3242e --- api/internal/logic/core/pagelisttasklogic.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index ba0c4382..32cc2240 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -120,9 +120,14 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- } if len(aiTask) == 1 { - task.Status = aiTask[0].Status + if aiTask[0].Status == constants.Completed { + task.Status = constants.Succeeded + } else { + task.Status = aiTask[0].Status + } task.StartTime = aiTask[0].StartTime task.EndTime = aiTask[0].EndTime + task.UpdatedTime = time.Now().Format(constants.Layout) tx = l.svcCtx.DbEngin.Model(task).Table("task").Where("deleted_at is null").Updates(task) if tx.Error != nil { logx.Errorf(tx.Error.Error()) From 130b116b5216ead96005035e8dc4f9f4c796e978 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 20 May 2024 15:46:49 +0800 Subject: [PATCH 12/14] added get balance api Former-commit-id: 2c8df408652a3e2bede8652ff11e9460ba8fc7e0 --- api/desc/pcm.api | 3 +++ api/desc/schedule/pcm-schedule.api | 9 +++++++++ api/internal/logic/core/pagelisttasklogic.go | 5 +++++ 3 files changed, 17 insertions(+) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 2cb1e06c..f7290a96 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -978,6 +978,9 @@ service pcm { @handler GetComputeCardsByClusterHandler get /schedule/getComputeCardsByCluster/:adapterId/:clusterId (GetComputeCardsByClusterReq) returns (GetComputeCardsByClusterResp) + + @handler GetClusterBalanceByIdHandler + get /schedule/getClusterBalanceById/:adapterId/:clusterId (GetClusterBalanceByIdReq) returns (GetClusterBalanceByIdResp) } @server( diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index 9be2c829..e612f9fb 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -141,4 +141,13 @@ type ( GetComputeCardsByClusterResp { Cards []string `json:"cards"` } + + GetClusterBalanceByIdReq{ + AdapterId string `path:"adapterId"` + ClusterId string `path:"clusterId"` + } + + GetClusterBalanceByIdResp{ + Balance float64 `json:"balance"` + } ) \ No newline at end of file diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index 32cc2240..f1858a4a 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -143,6 +143,11 @@ func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- } } + if len(aiTask) == 0 { + ch <- struct{}{} + return + } + start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) From ff117751b05ca33b15a0a80616264b8f0e187961 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 20 May 2024 16:19:06 +0800 Subject: [PATCH 13/14] updated types Former-commit-id: 898388856c31253f46aca54df8b6d72b5d15333a --- api/internal/handler/routes.go | 5 ++++ .../schedule/getclusterbalancebyidhandler.go | 28 +++++++++++++++++ .../schedule/getclusterbalancebyidlogic.go | 30 +++++++++++++++++++ api/internal/types/types.go | 9 ++++++ 4 files changed, 72 insertions(+) create mode 100644 api/internal/handler/schedule/getclusterbalancebyidhandler.go create mode 100644 api/internal/logic/schedule/getclusterbalancebyidlogic.go diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 11006a83..05583d79 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1225,6 +1225,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/schedule/getComputeCardsByCluster/:adapterId/:clusterId", Handler: schedule.GetComputeCardsByClusterHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/schedule/getClusterBalanceById/:adapterId/:clusterId", + Handler: schedule.GetClusterBalanceByIdHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/handler/schedule/getclusterbalancebyidhandler.go b/api/internal/handler/schedule/getclusterbalancebyidhandler.go new file mode 100644 index 00000000..ace98e2f --- /dev/null +++ b/api/internal/handler/schedule/getclusterbalancebyidhandler.go @@ -0,0 +1,28 @@ +package schedule + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func GetClusterBalanceByIdHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.GetClusterBalanceByIdReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := schedule.NewGetClusterBalanceByIdLogic(r.Context(), svcCtx) + resp, err := l.GetClusterBalanceById(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/logic/schedule/getclusterbalancebyidlogic.go b/api/internal/logic/schedule/getclusterbalancebyidlogic.go new file mode 100644 index 00000000..1f38f351 --- /dev/null +++ b/api/internal/logic/schedule/getclusterbalancebyidlogic.go @@ -0,0 +1,30 @@ +package schedule + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetClusterBalanceByIdLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetClusterBalanceByIdLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetClusterBalanceByIdLogic { + return &GetClusterBalanceByIdLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *GetClusterBalanceByIdLogic) GetClusterBalanceById(req *types.GetClusterBalanceByIdReq) (resp *types.GetClusterBalanceByIdResp, err error) { + // todo: add your logic here and delete this line + + return +} diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 54a3c5d1..fed98b44 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5746,6 +5746,15 @@ type GetComputeCardsByClusterResp struct { Cards []string `json:"cards"` } +type GetClusterBalanceByIdReq struct { + AdapterId string `path:"adapterId"` + ClusterId string `path:"clusterId"` +} + +type GetClusterBalanceByIdResp struct { + Balance float64 `json:"balance"` +} + type CreateAlertRuleReq struct { CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` From 766c862af7ab90cfb71cd28b9c93619ec915dca1 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 20 May 2024 16:48:09 +0800 Subject: [PATCH 14/14] added getclusterbalance api Former-commit-id: 856cdfb1b2c380bf848782bbf85275f34e925cfd --- .../schedule/getclusterbalancebyidhandler.go | 10 ++++------ .../schedule/getclusterbalancebyidlogic.go | 9 +++++++-- .../scheduler/service/collector/collector.go | 1 + api/internal/storeLink/modelarts.go | 4 ++++ api/internal/storeLink/octopus.go | 19 +++++++++++++++++++ api/internal/storeLink/shuguangai.go | 10 ++++++++++ 6 files changed, 45 insertions(+), 8 deletions(-) diff --git a/api/internal/handler/schedule/getclusterbalancebyidhandler.go b/api/internal/handler/schedule/getclusterbalancebyidhandler.go index ace98e2f..27e273f4 100644 --- a/api/internal/handler/schedule/getclusterbalancebyidhandler.go +++ b/api/internal/handler/schedule/getclusterbalancebyidhandler.go @@ -1,6 +1,7 @@ package schedule import ( + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "net/http" "github.com/zeromicro/go-zero/rest/httpx" @@ -13,16 +14,13 @@ func GetClusterBalanceByIdHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { var req types.GetClusterBalanceByIdReq if err := httpx.Parse(r, &req); err != nil { - httpx.ErrorCtx(r.Context(), w, err) + result.ParamErrorResult(r, w, err) return } l := schedule.NewGetClusterBalanceByIdLogic(r.Context(), svcCtx) resp, err := l.GetClusterBalanceById(&req) - if err != nil { - httpx.ErrorCtx(r.Context(), w, err) - } else { - httpx.OkJsonCtx(r.Context(), w, resp) - } + result.HttpResult(r, w, resp, err) + } } diff --git a/api/internal/logic/schedule/getclusterbalancebyidlogic.go b/api/internal/logic/schedule/getclusterbalancebyidlogic.go index 1f38f351..09f70b2c 100644 --- a/api/internal/logic/schedule/getclusterbalancebyidlogic.go +++ b/api/internal/logic/schedule/getclusterbalancebyidlogic.go @@ -24,7 +24,12 @@ func NewGetClusterBalanceByIdLogic(ctx context.Context, svcCtx *svc.ServiceConte } func (l *GetClusterBalanceByIdLogic) GetClusterBalanceById(req *types.GetClusterBalanceByIdReq) (resp *types.GetClusterBalanceByIdResp, err error) { - // todo: add your logic here and delete this line + resp = &types.GetClusterBalanceByIdResp{} + balance, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetUserBalance(l.ctx) + if err != nil { + return nil, err + } + resp.Balance = balance - return + return resp, nil } diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 5e6a7940..2c8d51a8 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -11,6 +11,7 @@ type AiCollector interface { DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error GetComputeCards(ctx context.Context) ([]string, error) + GetUserBalance(ctx context.Context) (float64, error) } type ResourceStats struct { diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 2186addb..ee8df706 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -187,6 +187,10 @@ func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) { return nil, nil } +func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) { + return 0, nil +} + func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { return "", nil } diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 4457ccd2..77d957d9 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -359,6 +359,25 @@ func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) { return cards, nil } +func (o *OctopusLink) GetUserBalance(ctx context.Context) (float64, error) { + balanceReq := &octopus.GetUserBalanceReq{ + Platform: o.platform, + } + balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq) + if err != nil { + return 0, err + } + if !balanceResp.Success { + if balanceResp.Error != nil { + return 0, errors.New(balanceResp.Error.Message) + } else { + return 0, errors.New("failed to get user balance") + } + } + balance := float64(balanceResp.Payload.BillingUser.Amount) + return balance, nil +} + func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { var name string if resourceType == CARD { diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 415f14b1..cd17229a 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -453,6 +453,16 @@ func (s *ShuguangAi) GetComputeCards(ctx context.Context) ([]string, error) { return cards, nil } +func (s *ShuguangAi) GetUserBalance(ctx context.Context) (float64, error) { + userReq := &hpcAC.GetUserInfoReq{} + userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq) + if err != nil { + return 0, err + } + balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64) + return balance, nil +} + func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { algoName := dataset + DASH + algorithm req := &hpcAC.GetFileReq{