From bfe75976445a1be604021bc261b64ce0de6c5487 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 29 Feb 2024 17:30:40 +0800 Subject: [PATCH 1/2] modified shuguangai submit parameters Former-commit-id: 5d86dc8ff44ea73e6a403bb4e9b0ea0b7d38be12 --- api/internal/storeLink/shuguangai.go | 96 +++++++++++++++++++++------- api/internal/storeLink/storeLink.go | 40 +----------- 2 files changed, 76 insertions(+), 60 deletions(-) diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 4315a3d1..3e6af2d3 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -28,21 +28,19 @@ import ( ) const ( - RAM_SIZE_1G = 1024 // 1G - WORKER_NUMBER = 1 - WORKER_CPU_NUMBER = 5 - WORKER_GPU_NUMBER = 1 - SHUGUANGAI_CUSTOM_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" - SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB" - DCU = "dcu" - PYTORCH = "Pytorch" - TASK_PYTORCH_PREFIX = "PytorchTask" - TENSORFLOW = "Tensorflow" - RESOURCE_GROUP = "wzhdtest" - WorkPath = "/work/home/acgnnmfbwo/pcmv1/" - TimeoutLimit = "10:00:00" - PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" - DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" + RAM_SIZE_1G = 1024 // 1G + WORKER_NUMBER = 1 + DCU = "dcu" + PYTORCH = "Pytorch" + TASK_PYTORCH_PREFIX = "PytorchTask" + TENSORFLOW = "Tensorflow" + RESOURCE_GROUP = "wzhdtest" + WorkPath = "/work/home/acgnnmfbwo/pcmv1/" + TimeoutLimit = "10:00:00" + PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" + DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" + ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" + TRAIN_FILE = "train.py" ) var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ @@ -120,7 +118,7 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) { return resp, nil } -func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { +func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) { //判断是否resourceId匹配自定义资源Id _, isMapContainsKey := RESOURCESPECSAI[resourceId] if !isMapContainsKey { @@ -148,10 +146,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string env += s[0] + "=" + s[1] + SPACE } + //set paths + paths := strings.Split(algorithmId, DASH) + workPath := ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2] + codePath := workPath + FORWARD_SLASH + TRAIN_FILE + req := &hpcAC.SubmitPytorchTaskReq{ Params: &hpcAC.SubmitPytorchTaskParams{ TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10), - WorkPath: WorkPath, + WorkPath: workPath, IsDistributed: false, IsHvd: false, Env: env, @@ -161,7 +164,7 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string WorkerNumber: WORKER_NUMBER, ResourceGroup: RESOURCE_GROUP, TimeoutLimit: TimeoutLimit, - PythonCodePath: PythonCodePath, + PythonCodePath: codePath, PythonArg: pythonArg, }, } @@ -183,7 +186,7 @@ func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTa req.Params.WorkerRamSize = spec.RAM } -func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { +func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) { //req := &hpcAC.SubmitTensorflowTaskReq{ // Params: &hpcAC.SubmitTensorflowTaskParams{ // @@ -193,16 +196,21 @@ func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []str } func (s *ShuguangAi) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) { + // set algorithmId temporarily + if algorithmId == "" { + algorithmId = "pytorch-mnist-fully_connected_network" + } + // shuguangAi提交任务 switch aiType { case PYTORCH_TASK: - task, err := s.SubmitPytorchTask(imageId, cmd, envs, params, resourceId) + task, err := s.SubmitPytorchTask(imageId, cmd, envs, params, resourceId, datasetsId, algorithmId) if err != nil { return nil, err } return task, nil case TENSORFLOW_TASK: - task, err := s.SubmitTensorflowTask(imageId, cmd, envs, params, resourceId) + task, err := s.SubmitTensorflowTask(imageId, cmd, envs, params, resourceId, datasetsId, algorithmId) if err != nil { return nil, err } @@ -317,6 +325,10 @@ func (s *ShuguangAi) GenerateSubmitParams(option *option.AiOption) error { if err != nil { return err } + err = s.generateAlgorithmId(option) + if err != nil { + return err + } err = s.generateCmd(option) if err != nil { return err @@ -338,10 +350,50 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error { } func (s *ShuguangAi) generateImageId(option *option.AiOption) error { - + if option.TaskType == "" { + return errors.New("TaskType not set") + } + taskType := strings.Title(option.TaskType) + req := &hpcAC.GetImageListAiReq{ + AcceleratorType: DCU, + TaskType: taskType, + } + resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req) + if err != nil { + return err + } + if resp.Code != "0" { + return errors.New("failed to get imageId") + } return nil } +func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error { + if option.DatasetsName == "" { + return errors.New("DatasetsName not set") + } + req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + option.TaskType, Start: 0} + list, err := s.svcCtx.ACRpc.GetFileList(s.ctx, req) + if err != nil { + return err + } + if list.Code != "0" { + return errors.New(list.Msg) + } + + var algorithmId string + for _, file := range list.Data.FileList { + ns := strings.Split(file.Name, DASH) + if ns[0] == option.DatasetsName { + algorithmId = option.TaskType + DASH + file.Name + option.AlgorithmId = algorithmId + return nil + } + } + + return errors.New("failed to get AlgorithmId") +} + func (s *ShuguangAi) generateCmd(option *option.AiOption) error { return nil diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index 824e196a..54809550 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -44,6 +44,8 @@ const ( SPACE = " " UNDERSCORE = "_" EQUAL = "=" + DASH = "-" + FORWARD_SLASH = "/" COMMA = "," STAR = "*" TYPE_OCTOPUS = "1" @@ -526,27 +528,6 @@ func ConvertType(in interface{}, out interface{}, participant *models.StorelinkC } return nil, nil - case *hpcAC.GetResourceSpecResp: - inresp := (interface{})(in).(*hpcAC.GetResourceSpecResp) - switch (interface{})(out).(type) { - case *types.GetResourceSpecsResp: - resp := (interface{})(out).(*types.GetResourceSpecsResp) - if inresp.Code != "0" { - resp.Success = false - resp.ResourceSpecs = nil - } else { - var spec types.ResourceSpecSl - resp.Success = true - spec.ParticipantName = participant.Name - spec.ParticipantId = participant.Id - spec.SpecName = SHUGUANGAI_CUSTOM_RESOURCE_NAME - spec.SpecId = SHUGUANGAI_CUSTOM_RESOURCE_ID - resp.ResourceSpecs = append(resp.ResourceSpecs, &spec) - } - return resp, nil - } - return nil, nil - case *modelarts.TrainingJobFlavorsResp: inresp := (interface{})(in).(*modelarts.TrainingJobFlavorsResp) switch (interface{})(out).(type) { @@ -862,23 +843,6 @@ func ConvertTypeOld[T any](in *T, participant *models.StorelinkCenter) (interfac resp.ResourceSpecs = append(resp.ResourceSpecs, &respec) } - return resp, nil - case *hpcAC.GetResourceSpecResp: - var resp types.GetResourceSpecsResp - inresp := (interface{})(in).(*hpcAC.GetResourceSpecResp) - - if inresp.Code != "0" { - resp.Success = false - resp.ResourceSpecs = nil - } else { - var spec types.ResourceSpecSl - resp.Success = true - spec.ParticipantName = participant.Name - spec.ParticipantId = participant.Id - spec.SpecName = SHUGUANGAI_CUSTOM_RESOURCE_NAME - spec.SpecId = SHUGUANGAI_CUSTOM_RESOURCE_ID - resp.ResourceSpecs = append(resp.ResourceSpecs, &spec) - } return resp, nil case *modelarts.TrainingJobFlavorsResp: var resp types.GetResourceSpecsResp From 7487707f3d63e7baa973f1f430f7e1e7a809e6d1 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 1 Mar 2024 17:07:41 +0800 Subject: [PATCH 2/2] modified ai option Former-commit-id: dc33df489a557efeecabb26e01f422b97311212a --- .../scheduler/schedulers/option/aiOption.go | 9 +- api/internal/storeLink/octopus.go | 170 +++++++++--------- api/internal/storeLink/shuguangai.go | 29 ++- 3 files changed, 118 insertions(+), 90 deletions(-) diff --git a/api/internal/scheduler/schedulers/option/aiOption.go b/api/internal/scheduler/schedulers/option/aiOption.go index 72bf9b3c..9024d907 100644 --- a/api/internal/scheduler/schedulers/option/aiOption.go +++ b/api/internal/scheduler/schedulers/option/aiOption.go @@ -2,6 +2,7 @@ package option type AiOption struct { AiClusterId string // shuguangAi /octopus ClusterId + TaskName string ResourceType string // cpu/gpu/compute card TaskType string // pytorch/tensorflow/mindspore DatasetsName string // mnist/imageNet/iris @@ -10,6 +11,7 @@ type AiOption struct { Tops float64 ComputeCard string CodeType string + AlgorithmName string ImageId string SpecId string @@ -22,7 +24,8 @@ type AiOption struct { Envs []string Params []string - Datasets string - Code string - Model interface{} + Datasets string + AlgorithmCode string + Image string + Model interface{} } diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index afdc0037..20d5a138 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -351,91 +351,7 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error { } } - return nil -} - -func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error { - if option.Tops == 0 { - for _, spec := range specs.TrainResourceSpecs { - if spec.Price == 1 { - ns := strings.Split(spec.Name, COMMA) - cardSpecs := strings.Split(ns[0], STAR) - if cardSpecs[1] == computeCard { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - } else { - continue - } - } - } else { - cardNum := math.Ceil(option.Tops / float64(BASE_TOPS)) - for _, spec := range specs.TrainResourceSpecs { - if option.Tops < BASE_TOPS { - if spec.Price == 1 { - ns := strings.Split(spec.Name, COMMA) - cardSpecs := strings.Split(ns[0], STAR) - if cardSpecs[1] == computeCard { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - } else { - continue - } - } else { - ns := strings.Split(spec.Name, COMMA) - if len(ns) != 4 { - continue - } - cardSpecs := strings.Split(ns[0], STAR) - if cardSpecs[1] != computeCard { - continue - } - s, err := strconv.ParseFloat(cardSpecs[0], 64) - if err != nil { - return err - } - switch computeCard { - case GCU: - if cardNum == s { // 1, 4, 8 - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - if 1 < cardNum && cardNum <= 4 && s == 4 { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - if 4 < cardNum && s == 8 { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - - case MLU: // 1, 2, 4 - if cardNum/2 == s { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - if 2 < cardNum/2 && s == 4 { - option.ResourceId = spec.Id - option.ComputeCard = computeCard - return nil - } - } - } - } - } - return errors.New("set ResourceId error") + return errors.New("failed to get ResourceId") } func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error { @@ -552,3 +468,87 @@ func (o *OctopusLink) generateParams(option *option.AiOption) error { return nil } + +func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error { + if option.Tops == 0 { + for _, spec := range specs.TrainResourceSpecs { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } + } else { + cardNum := math.Ceil(option.Tops / float64(BASE_TOPS)) + for _, spec := range specs.TrainResourceSpecs { + if option.Tops < BASE_TOPS { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } else { + ns := strings.Split(spec.Name, COMMA) + if len(ns) != 4 { + continue + } + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] != computeCard { + continue + } + s, err := strconv.ParseFloat(cardSpecs[0], 64) + if err != nil { + return err + } + switch computeCard { + case GCU: + if cardNum == s { // 1, 4, 8 + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 1 < cardNum && cardNum <= 4 && s == 4 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 4 < cardNum && s == 8 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + + case MLU: // 1, 2, 4 + if cardNum/2 == s { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 2 < cardNum/2 && s == 4 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } + } + } + } + return errors.New("set ResourceId error") +} diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 3e6af2d3..d97fd96c 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -31,6 +31,7 @@ const ( RAM_SIZE_1G = 1024 // 1G WORKER_NUMBER = 1 DCU = "dcu" + DCU_TOPS = 24.5 PYTORCH = "Pytorch" TASK_PYTORCH_PREFIX = "PytorchTask" TENSORFLOW = "Tensorflow" @@ -345,8 +346,27 @@ func (s *ShuguangAi) GenerateSubmitParams(option *option.AiOption) error { } func (s *ShuguangAi) generateResourceId(option *option.AiOption) error { + if option.ResourceType == "" { + return errors.New("ResourceType not set") + } - return nil + if option.ResourceType == CPU { + option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" + } + + if option.ResourceType == CARD { + if option.Tops == 0 { + option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" + } + + if option.Tops > DCU_TOPS { + option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2" + } + + //Todo add more dcu specs + } + + return errors.New("failed to get ResourceId") } func (s *ShuguangAi) generateImageId(option *option.AiOption) error { @@ -365,7 +385,12 @@ func (s *ShuguangAi) generateImageId(option *option.AiOption) error { if resp.Code != "0" { return errors.New("failed to get imageId") } - return nil + + if option.ResourceType == CPU { + + } + + return errors.New("failed to get ImageId") } func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {