modified octopus, shuguangai implications
Former-commit-id: 62cac010a042f1c1fa932fad479eccfcb6c58156
This commit is contained in:
parent
e2ffaeee37
commit
1356ad5932
|
@ -50,6 +50,8 @@ const (
|
||||||
EnflameT20 = 128
|
EnflameT20 = 128
|
||||||
BASE_TOPS = 128
|
BASE_TOPS = 128
|
||||||
CAMBRICON = "cambricon"
|
CAMBRICON = "cambricon"
|
||||||
|
TRAIN_CMD = "cd /code; python train.py"
|
||||||
|
VERSION = "V1"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -163,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para
|
||||||
Envs: envMap,
|
Envs: envMap,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
DataSetId: datasetsId,
|
||||||
|
DataSetVersion: VERSION,
|
||||||
|
AlgorithmId: algorithmId,
|
||||||
|
AlgorithmVersion: VERSION,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req)
|
resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req)
|
||||||
|
@ -530,6 +536,9 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||||
|
if option.Cmd == "" {
|
||||||
|
option.Cmd = TRAIN_CMD
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
updateRequestByResourceId(resourceId, req)
|
updateSGHpcRequestByResourceId(resourceId, req)
|
||||||
|
|
||||||
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
|
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) {
|
||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
||||||
spec := RESOURCEMAP[resourceId]
|
spec := RESOURCEMAP[resourceId]
|
||||||
req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
|
req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
|
||||||
req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
|
req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
|
||||||
|
|
|
@ -28,8 +28,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
RAM_SIZE_1G = 1024 // 1G
|
RAM_SIZE_1G = 1024 // 1G
|
||||||
WORKER_RAM_SIZE = 10240 // 10G
|
|
||||||
WORKER_NUMBER = 1
|
WORKER_NUMBER = 1
|
||||||
WORKER_CPU_NUMBER = 5
|
WORKER_CPU_NUMBER = 5
|
||||||
WORKER_GPU_NUMBER = 1
|
WORKER_GPU_NUMBER = 1
|
||||||
|
@ -46,7 +45,7 @@ const (
|
||||||
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
|
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
|
||||||
)
|
)
|
||||||
|
|
||||||
var RESOURCESGMAP = map[string]ResourceSpecSG{
|
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||||
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
|
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
|
||||||
CPU: 1,
|
CPU: 1,
|
||||||
GPU: 1,
|
GPU: 1,
|
||||||
|
@ -82,7 +81,7 @@ var RESOURCESPECSAI = map[string]string{
|
||||||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
|
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
|
||||||
}
|
}
|
||||||
|
|
||||||
type ResourceSpecSG struct {
|
type ResourceSpecSGAI struct {
|
||||||
CPU int64
|
CPU int64
|
||||||
GPU int64
|
GPU int64
|
||||||
RAM int64
|
RAM int64
|
||||||
|
@ -123,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
|
||||||
|
|
||||||
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||||
//判断是否resourceId匹配自定义资源Id
|
//判断是否resourceId匹配自定义资源Id
|
||||||
if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
|
_, isMapContainsKey := RESOURCESPECSAI[resourceId]
|
||||||
|
if !isMapContainsKey {
|
||||||
return nil, errors.New("shuguangAi资源Id不存在")
|
return nil, errors.New("shuguangAi资源Id不存在")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
||||||
Version: imageResp.Image.Version,
|
Version: imageResp.Image.Version,
|
||||||
ImagePath: imageResp.Image.Path,
|
ImagePath: imageResp.Image.Path,
|
||||||
WorkerNumber: WORKER_NUMBER,
|
WorkerNumber: WORKER_NUMBER,
|
||||||
WorkerCpuNumber: WORKER_CPU_NUMBER,
|
|
||||||
WorkerGpuNumber: WORKER_GPU_NUMBER,
|
|
||||||
WorkerRamSize: WORKER_RAM_SIZE,
|
|
||||||
ResourceGroup: RESOURCE_GROUP,
|
ResourceGroup: RESOURCE_GROUP,
|
||||||
TimeoutLimit: TimeoutLimit,
|
TimeoutLimit: TimeoutLimit,
|
||||||
PythonCodePath: PythonCodePath,
|
PythonCodePath: PythonCodePath,
|
||||||
PythonArg: pythonArg,
|
PythonArg: pythonArg,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
updateSGAIRequestByResourceId(resourceId, req)
|
||||||
|
|
||||||
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
|
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -176,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) {
|
||||||
|
spec := RESOURCESGAIMAP[resourceId]
|
||||||
|
req.Params.WorkerCpuNumber = spec.CPU
|
||||||
|
req.Params.WorkerGpuNumber = spec.GPU
|
||||||
|
req.Params.WorkerRamSize = spec.RAM
|
||||||
|
}
|
||||||
|
|
||||||
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||||
//req := &hpcAC.SubmitTensorflowTaskReq{
|
//req := &hpcAC.SubmitTensorflowTaskReq{
|
||||||
// Params: &hpcAC.SubmitTensorflowTaskParams{
|
// Params: &hpcAC.SubmitTensorflowTaskParams{
|
||||||
|
|
Loading…
Reference in New Issue