modified octopus, shuguangai implementations
Former-commit-id: 62cac010a042f1c1fa932fad479eccfcb6c58156
This commit is contained in:
parent
e2ffaeee37
commit
1356ad5932
|
@ -50,6 +50,8 @@ const (
|
|||
EnflameT20 = 128
|
||||
BASE_TOPS = 128
|
||||
CAMBRICON = "cambricon"
|
||||
TRAIN_CMD = "cd /code; python train.py"
|
||||
VERSION = "V1"
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -163,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para
|
|||
Envs: envMap,
|
||||
},
|
||||
},
|
||||
DataSetId: datasetsId,
|
||||
DataSetVersion: VERSION,
|
||||
AlgorithmId: algorithmId,
|
||||
AlgorithmVersion: VERSION,
|
||||
},
|
||||
}
|
||||
resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req)
|
||||
|
@ -530,6 +536,9 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
|||
}
|
||||
|
||||
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||
if option.Cmd == "" {
|
||||
option.Cmd = TRAIN_CMD
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
|
|||
},
|
||||
}
|
||||
|
||||
updateRequestByResourceId(resourceId, req)
|
||||
updateSGHpcRequestByResourceId(resourceId, req)
|
||||
|
||||
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
|
||||
if err != nil {
|
||||
|
@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) {
|
|||
return resp, nil
|
||||
}
|
||||
|
||||
func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
||||
func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
||||
spec := RESOURCEMAP[resourceId]
|
||||
req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
|
||||
req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
|
||||
|
|
|
@ -28,8 +28,7 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
RAM_SIZE_1G = 1024 // 1G
|
||||
WORKER_RAM_SIZE = 10240 // 10G
|
||||
RAM_SIZE_1G = 1024 // 1G
|
||||
WORKER_NUMBER = 1
|
||||
WORKER_CPU_NUMBER = 5
|
||||
WORKER_GPU_NUMBER = 1
|
||||
|
@ -46,7 +45,7 @@ const (
|
|||
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
|
||||
)
|
||||
|
||||
var RESOURCESGMAP = map[string]ResourceSpecSG{
|
||||
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
|
||||
CPU: 1,
|
||||
GPU: 1,
|
||||
|
@ -82,7 +81,7 @@ var RESOURCESPECSAI = map[string]string{
|
|||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
|
||||
}
|
||||
|
||||
type ResourceSpecSG struct {
|
||||
type ResourceSpecSGAI struct {
|
||||
CPU int64
|
||||
GPU int64
|
||||
RAM int64
|
||||
|
@ -123,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
|
|||
|
||||
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||
// Unless resourceId is the custom resource ID, it must be a known key of RESOURCESPECSAI.
|
||||
if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
|
||||
_, isMapContainsKey := RESOURCESPECSAI[resourceId]
|
||||
if !isMapContainsKey {
|
||||
return nil, errors.New("shuguangAi资源Id不存在")
|
||||
}
|
||||
|
||||
|
@ -159,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
|||
Version: imageResp.Image.Version,
|
||||
ImagePath: imageResp.Image.Path,
|
||||
WorkerNumber: WORKER_NUMBER,
|
||||
WorkerCpuNumber: WORKER_CPU_NUMBER,
|
||||
WorkerGpuNumber: WORKER_GPU_NUMBER,
|
||||
WorkerRamSize: WORKER_RAM_SIZE,
|
||||
ResourceGroup: RESOURCE_GROUP,
|
||||
TimeoutLimit: TimeoutLimit,
|
||||
PythonCodePath: PythonCodePath,
|
||||
PythonArg: pythonArg,
|
||||
},
|
||||
}
|
||||
|
||||
updateSGAIRequestByResourceId(resourceId, req)
|
||||
|
||||
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -176,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
|||
return resp, nil
|
||||
}
|
||||
|
||||
func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) {
|
||||
spec := RESOURCESGAIMAP[resourceId]
|
||||
req.Params.WorkerCpuNumber = spec.CPU
|
||||
req.Params.WorkerGpuNumber = spec.GPU
|
||||
req.Params.WorkerRamSize = spec.RAM
|
||||
}
|
||||
|
||||
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||
//req := &hpcAC.SubmitTensorflowTaskReq{
|
||||
// Params: &hpcAC.SubmitTensorflowTaskParams{
|
||||
|
|
Loading…
Reference in New Issue