From 31d0096029ecfc6ef8e3d2b0d6397028b46e4aeb Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 13 Aug 2024 15:39:29 +0800 Subject: [PATCH] updated startall apis Former-commit-id: fb95b3126f6f99e0a5de9ca0f12d410b97f1757b --- .../inference/startallbydeploytaskidlogic.go | 42 +++++++++++++++++-- .../inference/stopallbydeploytaskidlogic.go | 42 +++++++++++++++++-- .../scheduler/service/inference/inference.go | 1 + internal/storeLink/modelarts.go | 2 + internal/storeLink/octopus.go | 3 ++ internal/storeLink/shuguangai.go | 4 ++ 6 files changed, 88 insertions(+), 6 deletions(-) diff --git a/internal/logic/inference/startallbydeploytaskidlogic.go b/internal/logic/inference/startallbydeploytaskidlogic.go index 6ce13065..db87a515 100644 --- a/internal/logic/inference/startallbydeploytaskidlogic.go +++ b/internal/logic/inference/startallbydeploytaskidlogic.go @@ -4,6 +4,8 @@ import ( "context" "errors" "github.com/zeromicro/go-zero/core/logx" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "strconv" @@ -34,9 +36,15 @@ func (l *StartAllByDeployTaskIdLogic) StartAllByDeployTaskId(req *types.StartAll } for _, ins := range list { - success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StartInferDeployInstance(l.ctx, ins.InstanceId) - if !success { - return nil, errors.New(ins.InstanceName + " start failed") + in, err := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].GetInferDeployInstance(l.ctx, ins.InstanceId) + if err != nil { + return nil, err + } + if checkStopStatus(in) { + success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StartInferDeployInstance(l.ctx, ins.InstanceId) + if !success { + return nil, errors.New(ins.InstanceName + " start failed") + } } } @@ -47,3 +55,31 @@ func (l *StartAllByDeployTaskIdLogic) StartAllByDeployTaskId(req *types.StartAll return resp, nil } + +func checkStopStatus(in *inference.DeployInstance) bool { + switch in.ClusterType { + case storeLink.TYPE_OCTOPUS: + switch in.Status { + case "stopped": + return true + default: + return false + } + case storeLink.TYPE_MODELARTS: + switch in.Status { + case "stopped": + return true + default: + return false + } + case storeLink.TYPE_SHUGUANGAI: + switch in.Status { + case "Terminated": + return true + default: + return false + } + default: + return false + } +} diff --git a/internal/logic/inference/stopallbydeploytaskidlogic.go b/internal/logic/inference/stopallbydeploytaskidlogic.go index 544795ca..d0ebc23a 100644 --- a/internal/logic/inference/stopallbydeploytaskidlogic.go +++ b/internal/logic/inference/stopallbydeploytaskidlogic.go @@ -3,6 +3,8 @@ package inference import ( "context" "errors" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" + "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "strconv" @@ -35,9 +37,15 @@ func (l *StopAllByDeployTaskIdLogic) StopAllByDeployTaskId(req *types.StopAllByD } for _, ins := range list { - success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StopInferDeployInstance(l.ctx, ins.InstanceId) - if !success { - return nil, errors.New(ins.InstanceName + " stop failed") + in, err := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].GetInferDeployInstance(l.ctx, ins.InstanceId) + if err != nil { + return nil, err + } + if checkStatus(in) { + success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StopInferDeployInstance(l.ctx, ins.InstanceId) + if !success { + return nil, errors.New(ins.InstanceName + " stop failed") + } } } @@ -48,3 +56,31 @@ func (l *StopAllByDeployTaskIdLogic) StopAllByDeployTaskId(req *types.StopAllByD return resp, nil } + +func checkStatus(in *inference.DeployInstance) bool { + switch in.ClusterType { + case storeLink.TYPE_OCTOPUS: + switch in.Status { + case "running": + return true + default: + return false + } + case storeLink.TYPE_MODELARTS: + switch in.Status { + case "running": + return true + default: + return false + } + case storeLink.TYPE_SHUGUANGAI: + switch in.Status { + case "Running": + return true + default: + return false + } + default: + return false + } +} diff --git a/internal/scheduler/service/inference/inference.go b/internal/scheduler/service/inference/inference.go index 10cce695..e052230c 100644 --- a/internal/scheduler/service/inference/inference.go +++ b/internal/scheduler/service/inference/inference.go @@ -38,6 +38,7 @@ type DeployInstance struct { ModelType string InferCard string ClusterName string + ClusterType string Status string CreatedTime string } diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index f950b884..aae89de0 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -471,6 +471,7 @@ func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*infe ins.InferCard = "NPU" ins.ClusterName = m.platform ins.CreatedTime = string(services.StartTime) + ins.ClusterType = TYPE_MODELARTS insList = append(insList, ins) } @@ -525,5 +526,6 @@ func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) ( ins.InferCard = "NPU" ins.ClusterName = m.platform ins.CreatedTime = string(resp.StartTime) + ins.ClusterType = TYPE_MODELARTS return ins, nil } diff --git a/internal/storeLink/octopus.go b/internal/storeLink/octopus.go index b847613e..82ae7efd 100644 --- a/internal/storeLink/octopus.go +++ b/internal/storeLink/octopus.go @@ -926,6 +926,7 @@ func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*infere ins.InstanceId = notebook.Id ins.ClusterName = o.platform ins.Status = notebook.Status + ins.ClusterType = TYPE_OCTOPUS insList = append(insList, ins) } return insList, nil @@ -974,5 +975,7 @@ func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*i ins.InstanceId = resp.Payload.Notebook.Id ins.ClusterName = o.platform ins.Status = resp.Payload.Notebook.Status + ins.ClusterType = TYPE_OCTOPUS + return ins, nil } diff --git a/internal/storeLink/shuguangai.go b/internal/storeLink/shuguangai.go index 6ad0f23c..49d9ec1b 100644 --- a/internal/storeLink/shuguangai.go +++ b/internal/storeLink/shuguangai.go @@ -815,6 +815,8 @@ func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inferen ins.Status = datum.Status ins.InferCard = DCU ins.CreatedTime = datum.CreateTime + ins.ClusterType = TYPE_SHUGUANGAI + insList = append(insList, ins) } @@ -865,5 +867,7 @@ func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*in ins.Status = resp.Data.Status ins.InferCard = DCU ins.CreatedTime = resp.Data.CreateTime + ins.ClusterType = TYPE_SHUGUANGAI + return ins, nil }