Modified the Octopus and ShuguangAi implementations

Former-commit-id: 62cac010a042f1c1fa932fad479eccfcb6c58156
This commit is contained in:
tzwang 2024-02-28 16:38:03 +08:00
parent e2ffaeee37
commit 1356ad5932
3 changed files with 26 additions and 10 deletions

View File

@ -50,6 +50,8 @@ const (
EnflameT20 = 128 EnflameT20 = 128
BASE_TOPS = 128 BASE_TOPS = 128
CAMBRICON = "cambricon" CAMBRICON = "cambricon"
TRAIN_CMD = "cd /code; python train.py"
VERSION = "V1"
) )
var ( var (
@ -163,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para
Envs: envMap, Envs: envMap,
}, },
}, },
DataSetId: datasetsId,
DataSetVersion: VERSION,
AlgorithmId: algorithmId,
AlgorithmVersion: VERSION,
}, },
} }
resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req) resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req)
@ -530,6 +536,9 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
} }
func (o *OctopusLink) generateCmd(option *option.AiOption) error { func (o *OctopusLink) generateCmd(option *option.AiOption) error {
if option.Cmd == "" {
option.Cmd = TRAIN_CMD
}
return nil return nil
} }

View File

@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
}, },
} }
updateRequestByResourceId(resourceId, req) updateSGHpcRequestByResourceId(resourceId, req)
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req) resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
if err != nil { if err != nil {
@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) {
return resp, nil return resp, nil
} }
func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) { func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
spec := RESOURCEMAP[resourceId] spec := RESOURCEMAP[resourceId]
req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC

View File

@ -28,8 +28,7 @@ import (
) )
const ( const (
RAM_SIZE_1G = 1024 // 1G RAM_SIZE_1G = 1024 // 1G
WORKER_RAM_SIZE = 10240 // 10G
WORKER_NUMBER = 1 WORKER_NUMBER = 1
WORKER_CPU_NUMBER = 5 WORKER_CPU_NUMBER = 5
WORKER_GPU_NUMBER = 1 WORKER_GPU_NUMBER = 1
@ -46,7 +45,7 @@ const (
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
) )
var RESOURCESGMAP = map[string]ResourceSpecSG{ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
CPU: 1, CPU: 1,
GPU: 1, GPU: 1,
@ -82,7 +81,7 @@ var RESOURCESPECSAI = map[string]string{
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G", "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
} }
type ResourceSpecSG struct { type ResourceSpecSGAI struct {
CPU int64 CPU int64
GPU int64 GPU int64
RAM int64 RAM int64
@ -123,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
//判断是否resourceId匹配自定义资源Id //判断是否resourceId匹配自定义资源Id
if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID { _, isMapContainsKey := RESOURCESPECSAI[resourceId]
if !isMapContainsKey {
return nil, errors.New("shuguangAi资源Id不存在") return nil, errors.New("shuguangAi资源Id不存在")
} }
@ -159,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
Version: imageResp.Image.Version, Version: imageResp.Image.Version,
ImagePath: imageResp.Image.Path, ImagePath: imageResp.Image.Path,
WorkerNumber: WORKER_NUMBER, WorkerNumber: WORKER_NUMBER,
WorkerCpuNumber: WORKER_CPU_NUMBER,
WorkerGpuNumber: WORKER_GPU_NUMBER,
WorkerRamSize: WORKER_RAM_SIZE,
ResourceGroup: RESOURCE_GROUP, ResourceGroup: RESOURCE_GROUP,
TimeoutLimit: TimeoutLimit, TimeoutLimit: TimeoutLimit,
PythonCodePath: PythonCodePath, PythonCodePath: PythonCodePath,
PythonArg: pythonArg, PythonArg: pythonArg,
}, },
} }
updateSGAIRequestByResourceId(resourceId, req)
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req) resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
if err != nil { if err != nil {
return nil, err return nil, err
@ -176,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
return resp, nil return resp, nil
} }
// updateSGAIRequestByResourceId copies the CPU, GPU and RAM figures registered
// for resourceId in RESOURCESGAIMAP into the worker fields of req.Params.
//
// The lookup does not check for presence: a resourceId that is missing from
// RESOURCESGAIMAP yields the zero-value spec, so all three worker fields are
// then overwritten with 0.
// NOTE(review): the caller appears to validate resourceId against
// RESOURCESPECSAI rather than RESOURCESGAIMAP — confirm both maps share the
// same key set.
func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) {
	resource := RESOURCESGAIMAP[resourceId]

	req.Params.WorkerRamSize = resource.RAM
	req.Params.WorkerGpuNumber = resource.GPU
	req.Params.WorkerCpuNumber = resource.CPU
}
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
//req := &hpcAC.SubmitTensorflowTaskReq{ //req := &hpcAC.SubmitTensorflowTaskReq{
// Params: &hpcAC.SubmitTensorflowTaskParams{ // Params: &hpcAC.SubmitTensorflowTaskParams{