From dc52eaee49b11fe01644f49d45ee7f02d3f844a0 Mon Sep 17 00:00:00 2001 From: tzwang Date: Wed, 21 Aug 2024 17:27:20 +0800 Subject: [PATCH] added CreateInferDeployInstance logic Former-commit-id: c4c2ab8fdda0cabdea7d97d5fef136c3dcc56701 --- internal/scheduler/database/aiStorage.go | 19 ++ .../schedulers/option/inferOption.go | 4 + .../scheduler/service/inference/inference.go | 1 + internal/storeLink/modelarts.go | 4 + internal/storeLink/octopus.go | 235 ++++++++++++++++-- internal/storeLink/shuguangai.go | 4 + 6 files changed, 247 insertions(+), 20 deletions(-) diff --git a/internal/scheduler/database/aiStorage.go b/internal/scheduler/database/aiStorage.go index 169e5040..70ac99c2 100644 --- a/internal/scheduler/database/aiStorage.go +++ b/internal/scheduler/database/aiStorage.go @@ -544,3 +544,22 @@ func (s *AiStorage) GetTrainingTaskRunningNum() (int32, error) { } return total, nil } + +func (s *AiStorage) SaveInferDeployTask(taskName string, modelName string, modelType string, desc string) (int64, error) { + startTime := time.Now().Format(time.RFC3339) + // 构建主任务结构体 + taskModel := models.AiDeployInstanceTask{ + Name: taskName, + ModelName: modelName, + ModelType: modelType, + Desc: desc, + CreateTime: startTime, + UpdateTime: startTime, + } + // 保存任务数据到数据库 + tx := s.DbEngin.Table("ai_deploy_instance_task").Create(&taskModel) + if tx.Error != nil { + return 0, tx.Error + } + return taskModel.Id, nil +} diff --git a/internal/scheduler/schedulers/option/inferOption.go b/internal/scheduler/schedulers/option/inferOption.go index 65249955..b5f45969 100644 --- a/internal/scheduler/schedulers/option/inferOption.go +++ b/internal/scheduler/schedulers/option/inferOption.go @@ -15,6 +15,10 @@ type InferOption struct { Envs []string `json:"envs,optional"` Cmd string `json:"cmd,optional"` Replica int32 `json:"replicas,optional"` + + ResourceId string + AlgorithmId string + ImageId string } func (a InferOption) GetOptionType() string { diff --git a/internal/scheduler/service/inference/inference.go b/internal/scheduler/service/inference/inference.go index 59d8273a..1e8442bc 100644 --- a/internal/scheduler/service/inference/inference.go +++ b/internal/scheduler/service/inference/inference.go @@ -17,6 +17,7 @@ type ICluster interface { StartInferDeployInstance(ctx context.Context, id string) bool StopInferDeployInstance(ctx context.Context, id string) bool GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error) + CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) } type IInference interface { diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index 21266dc5..50f96398 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -539,3 +539,7 @@ func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) ( func (m *ModelArtsLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { return "", nil } + +func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) { + return "", nil +} diff --git a/internal/storeLink/octopus.go b/internal/storeLink/octopus.go index e74a6301..2651930a 100644 --- a/internal/storeLink/octopus.go +++ b/internal/storeLink/octopus.go @@ -561,7 +561,7 @@ func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (int } func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error { - err := o.generateResourceId(ctx, option) + err := o.generateResourceId(ctx, option, nil) if err != nil { return err } @@ -569,15 +569,15 @@ func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.A if err != nil { return err } - err = o.generateImageId(ctx, option) + err = o.generateImageId(ctx, option, nil) if err != nil { return err } - err = o.generateAlgorithmId(ctx, option) + err = o.generateAlgorithmId(ctx, option, nil) if err != nil { return err } - err = o.generateCmd(option) + err = o.generateCmd(option, nil) if err != nil { return err } @@ -592,10 +592,7 @@ func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.A return nil } -func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error { - if option.ResourceType == "" { - return errors.New("ResourceType not set") - } +func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { req := &octopus.GetResourceSpecsReq{ Platform: o.platform, ResourcePool: RESOURCE_POOL, @@ -608,6 +605,30 @@ func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiO return errors.New(specResp.Error.Message) } + if option != nil { + err = generateResourceIdForTraining(option, specResp) + if err != nil { + return err + } + return nil + } + + if ifoption != nil { + err = generateResourceIdForInferDeployInstance(ifoption, specResp) + if err != nil { + return err + } + return nil + } + + return errors.New("failed to set ResourceId") +} + +func generateResourceIdForTraining(option *option.AiOption, specResp *octopus.GetResourceSpecsResp) error { + if option.ResourceType == "" { + return errors.New("ResourceType not set") + } + if option.ResourceType == CPU { for _, spec := range specResp.TrainResourceSpecs { if spec.Price == 0 { @@ -621,14 +642,38 @@ func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiO if option.ComputeCard == "" { option.ComputeCard = GCU } - err = setResourceIdByCard(option, specResp, option.ComputeCard) + err := setResourceIdByCard(option, specResp, option.ComputeCard) if err != nil { return err } return nil } - return errors.New("failed to get ResourceId") + return errors.New("ResourceType not set") +} + +func generateResourceIdForInferDeployInstance(option *option.InferOption, specResp *octopus.GetResourceSpecsResp) error { + // temporarily use bi-v100 + cardName := cardCnMap[BIV100] + + for _, spec := range specResp.TrainResourceSpecs { + names := strings.Split(spec.Name, COMMA) + if len(names) != 4 { + continue + } + + ns := strings.Split(names[0], STAR) + if len(ns) != 2 { + continue + } + + if ns[0] == "1" && ns[1] == cardName { + option.ResourceId = spec.Id + return nil + } + } + + return errors.New("failed to set ResourceId") } func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error { @@ -656,7 +701,7 @@ func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiO return errors.New("failed to get DatasetsId") } -func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error { +func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { if option.TaskType == "" { return errors.New("TaskType not set") } @@ -696,6 +741,26 @@ func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOpti return errors.New("failed to get PresetImages") } + if option != nil { + err = generateImageIdForTraining(option, preImgResp) + if err != nil { + return err + } + return nil + } + + if ifoption != nil { + err = generateImageIdForInferDeployInstance(ifoption, preImgResp) + if err != nil { + return err + } + return nil + } + + return errors.New("failed to get ImageId") +} + +func generateImageIdForTraining(option *option.AiOption, preImgResp *octopus.GetPresetImageListResp) error { if option.ResourceType == CARD { for _, image := range preImgResp.Payload.Images { if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) { @@ -717,11 +782,35 @@ func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOpti } } } - - return errors.New("failed to get ImageId") + return errors.New("failed to set ImageId") } -func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error { +func generateImageIdForInferDeployInstance(option *option.InferOption, preImgResp *octopus.GetPresetImageListResp) error { + for _, image := range preImgResp.Payload.Images { + // temporarily use bi-v100 + if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(BIV100)]) { + switch strings.ToUpper(BIV100) { + case GCU: + if strings.HasPrefix(image.ImageVersion, "t20_") { + option.ImageId = image.Id + return nil + } + case BIV100: + if strings.HasPrefix(image.ImageVersion, "bi_") { + option.ImageId = image.Id + return nil + } + case MLU: + option.ImageId = image.Id + return nil + } + } + } + + return errors.New("failed to set ImageId") +} + +func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error { req := &octopus.GetMyAlgorithmListReq{ Platform: o.platform, PageIndex: o.pageIndex, @@ -735,6 +824,26 @@ func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.Ai return errors.New("failed to get algorithmId") } + if option != nil { + err = generateAlgorithmIdForTraining(option, resp) + if err != nil { + return err + } + return nil + } + + if ifoption != nil { + err = generateAlgorithmIdForInferDeployInstance(ifoption, resp) + if err != nil { + return err + } + return nil + } + + return errors.New("failed to set AlgorithmId") +} + +func generateAlgorithmIdForTraining(option *option.AiOption, resp *octopus.GetMyAlgorithmListResp) error { for _, algorithm := range resp.Payload.Algorithms { if algorithm.FrameworkName == strings.Title(option.TaskType) { ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE) @@ -760,14 +869,40 @@ func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.Ai } } - if option.AlgorithmId == "" { - return errors.New("Algorithm does not exist") - } - - return errors.New("failed to get AlgorithmId") + return errors.New("Algorithm does not exist") } -func (o *OctopusLink) generateCmd(option *option.AiOption) error { +func generateAlgorithmIdForInferDeployInstance(option *option.InferOption, resp *octopus.GetMyAlgorithmListResp) error { + for _, algorithm := range resp.Payload.Algorithms { + if strings.Contains(algorithm.AlgorithmName, option.ModelName) { + option.AlgorithmId = algorithm.AlgorithmId + return nil + } + } + return errors.New("Algorithm does not exist") +} + +func (o *OctopusLink) generateCmd(option *option.AiOption, ifoption *option.InferOption) error { + if option != nil { + err := generateCmdForTraining(option) + if err != nil { + return err + } + return nil + } + + if ifoption != nil { + err := generateCmdForInferDeployInstance(ifoption) + if err != nil { + return err + } + return nil + } + + return errors.New("failed to set cmd") +} + +func generateCmdForTraining(option *option.AiOption) error { if option.Cmd == "" { switch option.ComputeCard { case GCU: @@ -782,6 +917,14 @@ func (o *OctopusLink) generateCmd(option *option.AiOption) error { return nil } +func generateCmdForInferDeployInstance(option *option.InferOption) error { + if option.Cmd == "" { + option.Cmd = "su root; pip install fastapi uvicorn[standard]; cd /code/infer; python infer_biv100.py" + } + + return nil +} + func (o *OctopusLink) generateEnv(option *option.AiOption) error { return nil @@ -1020,3 +1163,55 @@ func (o *OctopusLink) GetInferResult(ctx context.Context, url string, file multi return recv.Result, nil } + +func (o *OctopusLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) { + err := o.generateResourceId(ctx, nil, option) + if err != nil { + return "", err + } + + err = o.generateAlgorithmId(ctx, nil, option) + if err != nil { + return "", err + } + + err = o.generateImageId(ctx, nil, option) + if err != nil { + return "", err + } + + err = o.generateCmd(nil, option) + if err != nil { + return "", err + } + + desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + BIV100 + param := &octopus.CreateNotebookParam{ + Name: option.TaskName, + ResourcePool: RESOURCE_POOL, + ResourceSpecId: option.ResourceId, + AlgorithmId: option.AlgorithmId, + AlgorithmVersion: VERSION, + ImageId: option.ImageId, + DatasetId: "", + DatasetVersion: "", + Command: option.Cmd, + Desc: desc, + TaskNumber: 1, + } + req := &octopus.CreateNotebookReq{ + Platform: o.platform, + Params: param, + } + + resp, err := o.octopusRpc.CreateNotebook(ctx, req) + if err != nil { + return "", err + } + + if !resp.Success { + return "", errors.New(resp.Error.Message) + } + + return resp.Payload.Id, nil +} diff --git a/internal/storeLink/shuguangai.go b/internal/storeLink/shuguangai.go index d5c41d87..a756de3f 100644 --- a/internal/storeLink/shuguangai.go +++ b/internal/storeLink/shuguangai.go @@ -881,3 +881,7 @@ func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*in func (s *ShuguangAi) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { return "", nil } + +func (s *ShuguangAi) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) { + return "", nil +}