From 1356ad593249a9f97a0a87858bc61dad651e752d Mon Sep 17 00:00:00 2001 From: tzwang Date: Wed, 28 Feb 2024 16:38:03 +0800 Subject: [PATCH] modified octopus, shuguangai implications Former-commit-id: 62cac010a042f1c1fa932fad479eccfcb6c58156 --- api/internal/storeLink/octopus.go | 9 +++++++++ api/internal/storeLink/shuguangHpc.go | 4 ++-- api/internal/storeLink/shuguangai.go | 23 +++++++++++++++-------- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index d5f4c5aa..afdc0037 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -50,6 +50,8 @@ const ( EnflameT20 = 128 BASE_TOPS = 128 CAMBRICON = "cambricon" + TRAIN_CMD = "cd /code; python train.py" + VERSION = "V1" ) var ( @@ -163,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para Envs: envMap, }, }, + DataSetId: datasetsId, + DataSetVersion: VERSION, + AlgorithmId: algorithmId, + AlgorithmVersion: VERSION, }, } resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req) @@ -530,6 +536,9 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error { } func (o *OctopusLink) generateCmd(option *option.AiOption) error { + if option.Cmd == "" { + option.Cmd = TRAIN_CMD + } return nil } diff --git a/api/internal/storeLink/shuguangHpc.go b/api/internal/storeLink/shuguangHpc.go index ed7197c4..1f2b466e 100644 --- a/api/internal/storeLink/shuguangHpc.go +++ b/api/internal/storeLink/shuguangHpc.go @@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param }, } - updateRequestByResourceId(resourceId, req) + updateSGHpcRequestByResourceId(resourceId, req) resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req) if err != nil { @@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) { return resp, nil } -func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) { +func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) { spec := RESOURCEMAP[resourceId] req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 0d908469..4315a3d1 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -28,8 +28,7 @@ import ( ) const ( - RAM_SIZE_1G = 1024 // 1G - WORKER_RAM_SIZE = 10240 // 10G + RAM_SIZE_1G = 1024 // 1G WORKER_NUMBER = 1 WORKER_CPU_NUMBER = 5 WORKER_GPU_NUMBER = 1 @@ -46,7 +45,7 @@ const ( DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" ) -var RESOURCESGMAP = map[string]ResourceSpecSG{ +var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { CPU: 1, GPU: 1, @@ -82,7 +81,7 @@ var RESOURCESPECSAI = map[string]string{ "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G", } -type ResourceSpecSG struct { +type ResourceSpecSGAI struct { CPU int64 GPU int64 RAM int64 @@ -123,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) { func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { //判断是否resourceId匹配自定义资源Id - if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID { + _, isMapContainsKey := RESOURCESPECSAI[resourceId] + if !isMapContainsKey { return nil, errors.New("shuguangAi资源Id不存在") } @@ -159,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string Version: imageResp.Image.Version, ImagePath: imageResp.Image.Path, WorkerNumber: WORKER_NUMBER, - WorkerCpuNumber: WORKER_CPU_NUMBER, - WorkerGpuNumber: WORKER_GPU_NUMBER, - WorkerRamSize: WORKER_RAM_SIZE, ResourceGroup: RESOURCE_GROUP, TimeoutLimit: TimeoutLimit, PythonCodePath: PythonCodePath, PythonArg: pythonArg, }, } + + updateSGAIRequestByResourceId(resourceId, req) + resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req) if err != nil { return nil, err @@ -176,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string return resp, nil } +func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) { + spec := RESOURCESGAIMAP[resourceId] + req.Params.WorkerCpuNumber = spec.CPU + req.Params.WorkerGpuNumber = spec.GPU + req.Params.WorkerRamSize = spec.RAM +} + func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { //req := &hpcAC.SubmitTensorflowTaskReq{ // Params: &hpcAC.SubmitTensorflowTaskParams{