modified shuguangai submit parameters

Former-commit-id: 5d86dc8ff44ea73e6a403bb4e9b0ea0b7d38be12
This commit is contained in:
tzwang 2024-02-29 17:30:40 +08:00
parent 88b3c355d8
commit bfe7597644
2 changed files with 76 additions and 60 deletions

View File

@ -28,21 +28,19 @@ import (
) )
const ( const (
RAM_SIZE_1G = 1024 // 1G RAM_SIZE_1G = 1024 // 1G
WORKER_NUMBER = 1 WORKER_NUMBER = 1
WORKER_CPU_NUMBER = 5 DCU = "dcu"
WORKER_GPU_NUMBER = 1 PYTORCH = "Pytorch"
SHUGUANGAI_CUSTOM_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" TASK_PYTORCH_PREFIX = "PytorchTask"
SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB" TENSORFLOW = "Tensorflow"
DCU = "dcu" RESOURCE_GROUP = "wzhdtest"
PYTORCH = "Pytorch" WorkPath = "/work/home/acgnnmfbwo/pcmv1/"
TASK_PYTORCH_PREFIX = "PytorchTask" TimeoutLimit = "10:00:00"
TENSORFLOW = "Tensorflow" PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
RESOURCE_GROUP = "wzhdtest" DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
WorkPath = "/work/home/acgnnmfbwo/pcmv1/" ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm"
TimeoutLimit = "10:00:00" TRAIN_FILE = "train.py"
PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
) )
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
@ -120,7 +118,7 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
return resp, nil return resp, nil
} }
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) {
//判断是否resourceId匹配自定义资源Id //判断是否resourceId匹配自定义资源Id
_, isMapContainsKey := RESOURCESPECSAI[resourceId] _, isMapContainsKey := RESOURCESPECSAI[resourceId]
if !isMapContainsKey { if !isMapContainsKey {
@ -148,10 +146,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
env += s[0] + "=" + s[1] + SPACE env += s[0] + "=" + s[1] + SPACE
} }
//set paths
paths := strings.Split(algorithmId, DASH)
workPath := ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
codePath := workPath + FORWARD_SLASH + TRAIN_FILE
req := &hpcAC.SubmitPytorchTaskReq{ req := &hpcAC.SubmitPytorchTaskReq{
Params: &hpcAC.SubmitPytorchTaskParams{ Params: &hpcAC.SubmitPytorchTaskParams{
TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10), TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10),
WorkPath: WorkPath, WorkPath: workPath,
IsDistributed: false, IsDistributed: false,
IsHvd: false, IsHvd: false,
Env: env, Env: env,
@ -161,7 +164,7 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
WorkerNumber: WORKER_NUMBER, WorkerNumber: WORKER_NUMBER,
ResourceGroup: RESOURCE_GROUP, ResourceGroup: RESOURCE_GROUP,
TimeoutLimit: TimeoutLimit, TimeoutLimit: TimeoutLimit,
PythonCodePath: PythonCodePath, PythonCodePath: codePath,
PythonArg: pythonArg, PythonArg: pythonArg,
}, },
} }
@ -183,7 +186,7 @@ func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTa
req.Params.WorkerRamSize = spec.RAM req.Params.WorkerRamSize = spec.RAM
} }
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) {
//req := &hpcAC.SubmitTensorflowTaskReq{ //req := &hpcAC.SubmitTensorflowTaskReq{
// Params: &hpcAC.SubmitTensorflowTaskParams{ // Params: &hpcAC.SubmitTensorflowTaskParams{
// //
@ -193,16 +196,21 @@ func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []str
} }
func (s *ShuguangAi) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) { func (s *ShuguangAi) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
// set algorithmId temporarily
if algorithmId == "" {
algorithmId = "pytorch-mnist-fully_connected_network"
}
// shuguangAi提交任务 // shuguangAi提交任务
switch aiType { switch aiType {
case PYTORCH_TASK: case PYTORCH_TASK:
task, err := s.SubmitPytorchTask(imageId, cmd, envs, params, resourceId) task, err := s.SubmitPytorchTask(imageId, cmd, envs, params, resourceId, datasetsId, algorithmId)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return task, nil return task, nil
case TENSORFLOW_TASK: case TENSORFLOW_TASK:
task, err := s.SubmitTensorflowTask(imageId, cmd, envs, params, resourceId) task, err := s.SubmitTensorflowTask(imageId, cmd, envs, params, resourceId, datasetsId, algorithmId)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -317,6 +325,10 @@ func (s *ShuguangAi) GenerateSubmitParams(option *option.AiOption) error {
if err != nil { if err != nil {
return err return err
} }
err = s.generateAlgorithmId(option)
if err != nil {
return err
}
err = s.generateCmd(option) err = s.generateCmd(option)
if err != nil { if err != nil {
return err return err
@ -338,10 +350,50 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
} }
// generateImageId checks that the image list for the requested task type can
// be fetched from the AC service.
//
// It returns an error when option.TaskType is empty, when the GetImageListAi
// call fails, or when the response code is not "0".
//
// NOTE(review): nothing from the response is written back to option here, so
// despite its name this function does not yet select an image ID — confirm
// whether that is intentional.
// NOTE(review): this listing looks like a split-view diff; lines whose text
// repeats appear to show the unchanged line from both sides.
func (s *ShuguangAi) generateImageId(option *option.AiOption) error { func (s *ShuguangAi) generateImageId(option *option.AiOption) error {
// The task type is needed to build the image list request below.
if option.TaskType == "" {
return errors.New("TaskType not set")
}
// NOTE(review): strings.Title is deprecated since Go 1.18; here it is used to
// capitalise the task type before sending it (presumably "pytorch" ->
// "Pytorch" to match the PYTORCH constant — confirm).
taskType := strings.Title(option.TaskType)
// Ask for the images available for the DCU accelerator and this task type.
req := &hpcAC.GetImageListAiReq{
AcceleratorType: DCU,
TaskType: taskType,
}
resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
if err != nil {
return err
}
// Any code other than "0" is treated as a failed lookup.
if resp.Code != "0" {
return errors.New("failed to get imageId")
}
return nil return nil
} }
// generateAlgorithmId resolves the algorithm directory matching the requested
// dataset and stores the resulting ID in option.AlgorithmId.
//
// It lists up to 100 entries under ALGORITHM_DIR/<TaskType> and selects the
// first entry whose name, split on DASH, starts with option.DatasetsName and
// has at least one further segment. The ID is built as <TaskType>-<entry name>.
//
// An error is returned when DatasetsName or TaskType is empty, when the file
// listing call fails or reports a code other than "0", or when no entry
// matches.
func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {
	if option.DatasetsName == "" {
		return errors.New("DatasetsName not set")
	}
	// TaskType is part of both the listing path and the generated ID; without
	// it the lookup would scan ALGORITHM_DIR itself and yield an ID starting
	// with a bare DASH.
	if option.TaskType == "" {
		return errors.New("TaskType not set")
	}

	req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + option.TaskType, Start: 0}
	list, err := s.svcCtx.ACRpc.GetFileList(s.ctx, req)
	if err != nil {
		return err
	}
	if list.Code != "0" {
		return errors.New(list.Msg)
	}

	for _, file := range list.Data.FileList {
		ns := strings.Split(file.Name, DASH)
		// SubmitPytorchTask splits the ID on DASH and indexes three segments
		// (task type, dataset, algorithm). An entry name without a DASH would
		// produce a two-segment ID and an index-out-of-range panic there, so
		// such entries are skipped.
		if len(ns) < 2 || ns[0] != option.DatasetsName {
			continue
		}
		option.AlgorithmId = option.TaskType + DASH + file.Name
		return nil
	}
	return errors.New("failed to get AlgorithmId")
}
func (s *ShuguangAi) generateCmd(option *option.AiOption) error { func (s *ShuguangAi) generateCmd(option *option.AiOption) error {
return nil return nil

View File

@ -44,6 +44,8 @@ const (
SPACE = " " SPACE = " "
UNDERSCORE = "_" UNDERSCORE = "_"
EQUAL = "=" EQUAL = "="
DASH = "-"
FORWARD_SLASH = "/"
COMMA = "," COMMA = ","
STAR = "*" STAR = "*"
TYPE_OCTOPUS = "1" TYPE_OCTOPUS = "1"
@ -526,27 +528,6 @@ func ConvertType(in interface{}, out interface{}, participant *models.StorelinkC
} }
return nil, nil return nil, nil
case *hpcAC.GetResourceSpecResp:
inresp := (interface{})(in).(*hpcAC.GetResourceSpecResp)
switch (interface{})(out).(type) {
case *types.GetResourceSpecsResp:
resp := (interface{})(out).(*types.GetResourceSpecsResp)
if inresp.Code != "0" {
resp.Success = false
resp.ResourceSpecs = nil
} else {
var spec types.ResourceSpecSl
resp.Success = true
spec.ParticipantName = participant.Name
spec.ParticipantId = participant.Id
spec.SpecName = SHUGUANGAI_CUSTOM_RESOURCE_NAME
spec.SpecId = SHUGUANGAI_CUSTOM_RESOURCE_ID
resp.ResourceSpecs = append(resp.ResourceSpecs, &spec)
}
return resp, nil
}
return nil, nil
case *modelarts.TrainingJobFlavorsResp: case *modelarts.TrainingJobFlavorsResp:
inresp := (interface{})(in).(*modelarts.TrainingJobFlavorsResp) inresp := (interface{})(in).(*modelarts.TrainingJobFlavorsResp)
switch (interface{})(out).(type) { switch (interface{})(out).(type) {
@ -862,23 +843,6 @@ func ConvertTypeOld[T any](in *T, participant *models.StorelinkCenter) (interfac
resp.ResourceSpecs = append(resp.ResourceSpecs, &respec) resp.ResourceSpecs = append(resp.ResourceSpecs, &respec)
} }
return resp, nil
case *hpcAC.GetResourceSpecResp:
var resp types.GetResourceSpecsResp
inresp := (interface{})(in).(*hpcAC.GetResourceSpecResp)
if inresp.Code != "0" {
resp.Success = false
resp.ResourceSpecs = nil
} else {
var spec types.ResourceSpecSl
resp.Success = true
spec.ParticipantName = participant.Name
spec.ParticipantId = participant.Id
spec.SpecName = SHUGUANGAI_CUSTOM_RESOURCE_NAME
spec.SpecId = SHUGUANGAI_CUSTOM_RESOURCE_ID
resp.ResourceSpecs = append(resp.ResourceSpecs, &spec)
}
return resp, nil return resp, nil
case *modelarts.TrainingJobFlavorsResp: case *modelarts.TrainingJobFlavorsResp:
var resp types.GetResourceSpecsResp var resp types.GetResourceSpecsResp