Merge pull request 'ai platforms implications modified' (#28) from tzwang/pcm-coordinator:master into master
Former-commit-id: fe276e98a1242c332e476c160f2e0435a3bd5fc0
This commit is contained in:
commit
88b3c355d8
|
@ -637,5 +637,5 @@ service pcm {
|
|||
get /schedule/ai/getStrategies returns (AiStrategyResp)
|
||||
|
||||
@handler ScheduleSubmitHandler
|
||||
post /schedule/submit (ScheduleResp) returns (ScheduleResp)
|
||||
post /schedule/submit (ScheduleReq) returns (ScheduleResp)
|
||||
}
|
|
@ -11,7 +11,7 @@ import (
|
|||
|
||||
func ScheduleSubmitHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
var req types.ScheduleResp
|
||||
var req types.ScheduleReq
|
||||
if err := httpx.Parse(r, &req); err != nil {
|
||||
result.ParamErrorResult(r, w, err)
|
||||
return
|
||||
|
|
|
@ -23,7 +23,7 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc
|
|||
}
|
||||
}
|
||||
|
||||
func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleResp) (resp *types.ScheduleResp, err error) {
|
||||
func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) {
|
||||
// todo: add your logic here and delete this line
|
||||
|
||||
return
|
||||
|
|
|
@ -7,6 +7,8 @@ type AiOption struct {
|
|||
DatasetsName string // mnist/imageNet/iris
|
||||
StrategyName string
|
||||
ClusterToStaticWeight map[string]int32
|
||||
Tops float64
|
||||
ComputeCard string
|
||||
CodeType string
|
||||
|
||||
ImageId string
|
||||
|
|
|
@ -12,7 +12,8 @@ type ResourceStats struct {
|
|||
MemAvail float64
|
||||
DiskAvail float64
|
||||
GpuAvail float64
|
||||
CardAvail []Card
|
||||
CardToHours map[Card]float64
|
||||
CpuToHours map[int]float64
|
||||
Balance float64
|
||||
}
|
||||
|
||||
|
@ -20,6 +21,7 @@ type Card struct {
|
|||
Type string
|
||||
Name string
|
||||
TOpsAtFp16 float64
|
||||
Price int32
|
||||
}
|
||||
|
||||
type DatasetsSpecs struct {
|
||||
|
|
|
@ -22,6 +22,8 @@ import (
|
|||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
|
@ -39,6 +41,24 @@ const (
|
|||
IMG_VERSION_PREFIX = "version_"
|
||||
TASK_NAME_PREFIX = "trainJob"
|
||||
RESOURCE_POOL = "common-pool"
|
||||
HANWUJI = "hanwuji"
|
||||
SUIYUAN = "suiyuan"
|
||||
SAILINGSI = "sailingsi"
|
||||
MLU = "MLU"
|
||||
CAMBRICONMLU290 = 256
|
||||
GCU = "enflame"
|
||||
EnflameT20 = 128
|
||||
BASE_TOPS = 128
|
||||
CAMBRICON = "cambricon"
|
||||
TRAIN_CMD = "cd /code; python train.py"
|
||||
VERSION = "V1"
|
||||
)
|
||||
|
||||
var (
|
||||
cardAliasMap = map[string]string{
|
||||
MLU: CAMBRICON,
|
||||
GCU: GCU,
|
||||
}
|
||||
)
|
||||
|
||||
func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink {
|
||||
|
@ -145,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para
|
|||
Envs: envMap,
|
||||
},
|
||||
},
|
||||
DataSetId: datasetsId,
|
||||
DataSetVersion: VERSION,
|
||||
AlgorithmId: algorithmId,
|
||||
AlgorithmVersion: VERSION,
|
||||
},
|
||||
}
|
||||
resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req)
|
||||
|
@ -187,7 +211,7 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) {
|
|||
// octopus查询资源规格
|
||||
req := &octopus.GetResourceSpecsReq{
|
||||
Platform: o.platform,
|
||||
ResourcePool: "common-pool",
|
||||
ResourcePool: RESOURCE_POOL,
|
||||
}
|
||||
resp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req)
|
||||
if err != nil {
|
||||
|
@ -198,6 +222,34 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) {
|
|||
}
|
||||
|
||||
func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) {
|
||||
req := &octopus.GetResourceSpecsReq{
|
||||
Platform: o.platform,
|
||||
ResourcePool: RESOURCE_POOL,
|
||||
}
|
||||
specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !specResp.Success {
|
||||
return nil, errors.New(specResp.Error.Message)
|
||||
}
|
||||
balanceReq := &octopus.GetUserBalanceReq{
|
||||
Platform: o.platform,
|
||||
}
|
||||
balanceResp, err := o.svcCtx.OctopusRpc.GetUserBalance(o.ctx, balanceReq)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !balanceResp.Success {
|
||||
return nil, errors.New(balanceResp.Error.Message)
|
||||
}
|
||||
|
||||
//resourceStat := collector.ResourceStats{}
|
||||
//
|
||||
//for _, spec := range specResp.TrainResourceSpecs {
|
||||
//
|
||||
//}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
|
@ -247,6 +299,10 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = o.generateAlgorithmId(option)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = o.generateCmd(option)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -263,9 +319,125 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error {
|
|||
}
|
||||
|
||||
func (o *OctopusLink) generateResourceId(option *option.AiOption) error {
|
||||
if option.ResourceType == "" {
|
||||
return errors.New("ResourceType not set")
|
||||
}
|
||||
req := &octopus.GetResourceSpecsReq{
|
||||
Platform: o.platform,
|
||||
ResourcePool: RESOURCE_POOL,
|
||||
}
|
||||
specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !specResp.Success {
|
||||
return errors.New(specResp.Error.Message)
|
||||
}
|
||||
|
||||
if option.ResourceType == CPU {
|
||||
for _, spec := range specResp.TrainResourceSpecs {
|
||||
if spec.Price == 0 {
|
||||
option.ResourceId = spec.Id
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if option.ResourceType == CARD {
|
||||
err = setResourceIdByCard(option, specResp, MLU)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
|
||||
if option.Tops == 0 {
|
||||
for _, spec := range specs.TrainResourceSpecs {
|
||||
if spec.Price == 1 {
|
||||
ns := strings.Split(spec.Name, COMMA)
|
||||
cardSpecs := strings.Split(ns[0], STAR)
|
||||
if cardSpecs[1] == computeCard {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
} else {
|
||||
cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
|
||||
for _, spec := range specs.TrainResourceSpecs {
|
||||
if option.Tops < BASE_TOPS {
|
||||
if spec.Price == 1 {
|
||||
ns := strings.Split(spec.Name, COMMA)
|
||||
cardSpecs := strings.Split(ns[0], STAR)
|
||||
if cardSpecs[1] == computeCard {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
ns := strings.Split(spec.Name, COMMA)
|
||||
if len(ns) != 4 {
|
||||
continue
|
||||
}
|
||||
cardSpecs := strings.Split(ns[0], STAR)
|
||||
if cardSpecs[1] != computeCard {
|
||||
continue
|
||||
}
|
||||
s, err := strconv.ParseFloat(cardSpecs[0], 64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
switch computeCard {
|
||||
case GCU:
|
||||
if cardNum == s { // 1, 4, 8
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
if 1 < cardNum && cardNum <= 4 && s == 4 {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
if 4 < cardNum && s == 8 {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
|
||||
case MLU: // 1, 2, 4
|
||||
if cardNum/2 == s {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
if 2 < cardNum/2 && s == 4 {
|
||||
option.ResourceId = spec.Id
|
||||
option.ComputeCard = computeCard
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return errors.New("set ResourceId error")
|
||||
}
|
||||
|
||||
func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error {
|
||||
if option.DatasetsName == "" {
|
||||
return errors.New("DatasetsName not set")
|
||||
|
@ -292,11 +464,81 @@ func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error {
|
|||
}
|
||||
|
||||
func (o *OctopusLink) generateImageId(option *option.AiOption) error {
|
||||
if option.TaskType == "" {
|
||||
return errors.New("TaskType not set")
|
||||
}
|
||||
|
||||
req := &octopus.GetUserImageListReq{
|
||||
Platform: o.platform,
|
||||
PageIndex: o.pageIndex,
|
||||
PageSize: o.pageSize,
|
||||
}
|
||||
resp, err := o.svcCtx.OctopusRpc.GetUserImageList(o.ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !resp.Success {
|
||||
return errors.New("failed to get imageId")
|
||||
}
|
||||
|
||||
if option.ResourceType == CPU {
|
||||
for _, img := range resp.Payload.Images {
|
||||
if img.Image.ImageName == "test-image" {
|
||||
option.ImageId = img.Image.Id
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
preImgReq := &octopus.GetPresetImageListReq{
|
||||
Platform: o.platform,
|
||||
PageIndex: o.pageIndex,
|
||||
PageSize: o.pageSize,
|
||||
}
|
||||
preImgResp, err := o.svcCtx.OctopusRpc.GetPresetImageList(o.ctx, preImgReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !preImgResp.Success {
|
||||
return errors.New("failed to get PresetImages")
|
||||
}
|
||||
for _, image := range preImgResp.Payload.Images {
|
||||
if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
|
||||
option.ImageId = image.Id
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return errors.New("failed to get ImageId")
|
||||
}
|
||||
|
||||
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
||||
req := &octopus.GetMyAlgorithmListReq{
|
||||
Platform: o.platform,
|
||||
PageIndex: o.pageIndex,
|
||||
PageSize: o.pageSize,
|
||||
}
|
||||
resp, err := o.svcCtx.OctopusRpc.GetMyAlgorithmList(o.ctx, req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !resp.Success {
|
||||
return errors.New("failed to get algorithmId")
|
||||
}
|
||||
|
||||
for _, algorithm := range resp.Payload.Algorithms {
|
||||
if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) {
|
||||
option.AlgorithmId = algorithm.AlgorithmId
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||
if option.Cmd == "" {
|
||||
option.Cmd = TRAIN_CMD
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -45,7 +45,7 @@ const (
|
|||
Username = "acgnnmfbwo"
|
||||
)
|
||||
|
||||
var RESOURCEMAP = map[string]ResourceSpec{
|
||||
var RESOURCEMAP = map[string]ResourceSpecHpc{
|
||||
"FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": {
|
||||
GAP_NNODE: "1",
|
||||
GAP_NPROC: "1",
|
||||
|
@ -98,7 +98,7 @@ var RESOURCEMAP = map[string]ResourceSpec{
|
|||
},
|
||||
}
|
||||
|
||||
var RESOURCESPECS = map[string]string{
|
||||
var RESOURCESPECSHPC = map[string]string{
|
||||
"FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": "1*NODE, CPU:1, 1*DCU",
|
||||
"Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": "1*NODE, CPU:2, 1*DCU",
|
||||
"uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": "1*NODE, CPU:4, 2*DCU",
|
||||
|
@ -122,7 +122,7 @@ var AcStatus = map[string]string{
|
|||
"statX": "Other",
|
||||
}
|
||||
|
||||
type ResourceSpec struct {
|
||||
type ResourceSpecHpc struct {
|
||||
GAP_NNODE string
|
||||
GAP_NPROC string
|
||||
GAP_NDCU string
|
||||
|
@ -148,7 +148,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
|
|||
// shuguangHpc提交任务
|
||||
|
||||
//判断是否resourceId匹配自定义资源Id
|
||||
_, isMapContainsKey := RESOURCESPECS[resourceId]
|
||||
_, isMapContainsKey := RESOURCESPECSHPC[resourceId]
|
||||
if !isMapContainsKey {
|
||||
return nil, errors.New("shuguangHpc资源Id不存在")
|
||||
}
|
||||
|
@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
|
|||
},
|
||||
}
|
||||
|
||||
updateRequestByResourceId(resourceId, req)
|
||||
updateSGHpcRequestByResourceId(resourceId, req)
|
||||
|
||||
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
|
||||
if err != nil {
|
||||
|
@ -233,9 +233,9 @@ func (s ShuguangHpc) QueryTask(taskId string) (interface{}, error) {
|
|||
}
|
||||
|
||||
func (s ShuguangHpc) QuerySpecs() (interface{}, error) {
|
||||
var resp types.GetResourceSpecsResp
|
||||
resp := &types.GetResourceSpecsResp{}
|
||||
|
||||
for k, v := range RESOURCESPECS {
|
||||
for k, v := range RESOURCESPECSHPC {
|
||||
var respec types.ResourceSpecSl
|
||||
respec.SpecId = k
|
||||
respec.SpecName = v
|
||||
|
@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) {
|
|||
return resp, nil
|
||||
}
|
||||
|
||||
func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
||||
func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
|
||||
spec := RESOURCEMAP[resourceId]
|
||||
req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
|
||||
req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
|
||||
|
|
|
@ -21,20 +21,14 @@ import (
|
|||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/schedulers/option"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/types"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type ShuguangAi struct {
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
platform string
|
||||
participantId int64
|
||||
}
|
||||
|
||||
const (
|
||||
WORKER_RAM_SIZE = 10240 // 10G
|
||||
RAM_SIZE_1G = 1024 // 1G
|
||||
WORKER_NUMBER = 1
|
||||
WORKER_CPU_NUMBER = 5
|
||||
WORKER_GPU_NUMBER = 1
|
||||
|
@ -45,12 +39,61 @@ const (
|
|||
TASK_PYTORCH_PREFIX = "PytorchTask"
|
||||
TENSORFLOW = "Tensorflow"
|
||||
RESOURCE_GROUP = "wzhdtest"
|
||||
WorkPath = "/work/home/acgnnmfbwo/111111/py/"
|
||||
WorkPath = "/work/home/acgnnmfbwo/pcmv1/"
|
||||
TimeoutLimit = "10:00:00"
|
||||
PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
|
||||
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
|
||||
)
|
||||
|
||||
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
|
||||
CPU: 1,
|
||||
GPU: 1,
|
||||
RAM: 2 * RAM_SIZE_1G,
|
||||
},
|
||||
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": {
|
||||
CPU: 1,
|
||||
GPU: 2,
|
||||
RAM: 2 * RAM_SIZE_1G,
|
||||
},
|
||||
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
|
||||
CPU: 2,
|
||||
GPU: 1,
|
||||
RAM: 4 * RAM_SIZE_1G,
|
||||
},
|
||||
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
|
||||
CPU: 5,
|
||||
GPU: 1,
|
||||
RAM: 10 * RAM_SIZE_1G,
|
||||
},
|
||||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
|
||||
CPU: 5,
|
||||
GPU: 2,
|
||||
RAM: 10 * RAM_SIZE_1G,
|
||||
},
|
||||
}
|
||||
|
||||
// RESOURCESPECSAI maps a ShuguangAi resource spec id to its human-readable
// description. SubmitPytorchTask uses it to validate incoming resource ids and
// QuerySpecs reports each value as the spec name.
var RESOURCESPECSAI = map[string]string{
	"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
	"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
	"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:1, RAM:4G",
	"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:5, DCU:1, RAM:10G",
	"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
}
|
||||
|
||||
// ResourceSpecSGAI describes the worker sizing of one ShuguangAi resource
// spec, as stored in RESOURCESGAIMAP.
type ResourceSpecSGAI struct {
	CPU int64 // worker CPU count
	GPU int64 // worker GPU (DCU) count
	RAM int64 // worker RAM size, in the same unit as RAM_SIZE_1G
}
|
||||
|
||||
type ShuguangAi struct {
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
platform string
|
||||
participantId int64
|
||||
}
|
||||
|
||||
func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *ShuguangAi {
|
||||
return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, platform: name, participantId: id}
|
||||
}
|
||||
|
@ -79,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
|
|||
|
||||
func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||
//判断是否resourceId匹配自定义资源Id
|
||||
if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
|
||||
_, isMapContainsKey := RESOURCESPECSAI[resourceId]
|
||||
if !isMapContainsKey {
|
||||
return nil, errors.New("shuguangAi资源Id不存在")
|
||||
}
|
||||
|
||||
|
@ -115,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
|||
Version: imageResp.Image.Version,
|
||||
ImagePath: imageResp.Image.Path,
|
||||
WorkerNumber: WORKER_NUMBER,
|
||||
WorkerCpuNumber: WORKER_CPU_NUMBER,
|
||||
WorkerGpuNumber: WORKER_GPU_NUMBER,
|
||||
WorkerRamSize: WORKER_RAM_SIZE,
|
||||
ResourceGroup: RESOURCE_GROUP,
|
||||
TimeoutLimit: TimeoutLimit,
|
||||
PythonCodePath: PythonCodePath,
|
||||
PythonArg: pythonArg,
|
||||
},
|
||||
}
|
||||
|
||||
updateSGAIRequestByResourceId(resourceId, req)
|
||||
|
||||
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -132,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string
|
|||
return resp, nil
|
||||
}
|
||||
|
||||
func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) {
|
||||
spec := RESOURCESGAIMAP[resourceId]
|
||||
req.Params.WorkerCpuNumber = spec.CPU
|
||||
req.Params.WorkerGpuNumber = spec.GPU
|
||||
req.Params.WorkerRamSize = spec.RAM
|
||||
}
|
||||
|
||||
func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
|
||||
//req := &hpcAC.SubmitTensorflowTaskReq{
|
||||
// Params: &hpcAC.SubmitTensorflowTaskParams{
|
||||
|
@ -187,17 +238,19 @@ func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
|
|||
}
|
||||
|
||||
func (s *ShuguangAi) QuerySpecs() (interface{}, error) {
|
||||
// ShuguangAi查询资源规格
|
||||
req := &hpcAC.GetResourceSpecReq{
|
||||
AcceleratorType: DCU,
|
||||
ResourceGroup: RESOURCE_GROUP,
|
||||
}
|
||||
specs, err := s.svcCtx.ACRpc.GetResourceSpec(s.ctx, req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
resp := &types.GetResourceSpecsResp{}
|
||||
|
||||
for k, v := range RESOURCESPECSAI {
|
||||
var respec types.ResourceSpecSl
|
||||
respec.SpecId = k
|
||||
respec.SpecName = v
|
||||
respec.ParticipantId = s.participantId
|
||||
respec.ParticipantName = s.platform
|
||||
resp.ResourceSpecs = append(resp.ResourceSpecs, &respec)
|
||||
}
|
||||
|
||||
return specs, nil
|
||||
resp.Success = true
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (s *ShuguangAi) GetResourceStats() (*collector.ResourceStats, error) {
|
||||
|
|
|
@ -45,6 +45,7 @@ const (
|
|||
UNDERSCORE = "_"
|
||||
EQUAL = "="
|
||||
COMMA = ","
|
||||
STAR = "*"
|
||||
TYPE_OCTOPUS = "1"
|
||||
TYPE_MODELARTS = "2"
|
||||
TYPE_SHUGUANGAI = "3"
|
||||
|
@ -72,7 +73,7 @@ var (
|
|||
"3": SHUGUANGAI,
|
||||
"4": SHUGUANGHPC,
|
||||
}
|
||||
resourceTypes = []string{CPU, GPU, CARD}
|
||||
resourceTypes = []string{CPU, CARD}
|
||||
taskTypes = []string{PYTORCH_TASK, TENSORFLOW_TASK}
|
||||
|
||||
ERROR_RESP_EMPTY = errors.New("resp empty error")
|
||||
|
@ -155,6 +156,8 @@ func GetTaskTypes() []string {
|
|||
func ConvertType(in interface{}, out interface{}, participant *models.StorelinkCenter) (interface{}, error) {
|
||||
|
||||
switch (interface{})(in).(type) {
|
||||
case *types.GetResourceSpecsResp:
|
||||
return in, nil
|
||||
case *octopus.UploadImageResp:
|
||||
inresp := (interface{})(in).(*octopus.UploadImageResp)
|
||||
switch (interface{})(out).(type) {
|
||||
|
|
2
go.mod
2
go.mod
|
@ -28,7 +28,7 @@ require (
|
|||
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5db7246
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d
|
||||
gitlink.org.cn/jcce-pcm/pcm-slurm v0.0.0-20231107115628-f74106c47dfa
|
||||
go.opentelemetry.io/otel/trace v1.21.0
|
||||
|
|
4
go.sum
4
go.sum
|
@ -1005,8 +1005,8 @@ gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5d
|
|||
gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5db7246/go.mod h1:LM+XeDayimN6b1AY7AhNbbhq9HJyS0u7tszMCNsNmAo=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090 h1:jztlHo72bcWM1jUwvG3Hfk2K+AJL0RvlsdIqlktH/MI=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090/go.mod h1:pisJKAI8FRFFUcBaH3Gob+ENXWRM97rpuYmv9s1raag=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd h1:9GIKpN6nel4U5jD91HL/vjzwo+EHTpE13SkT7WKyXtQ=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd/go.mod h1:uyvpVqG1jHDXX+ubXI0RBwnWXzVykD/mliqGQIDvRoo=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4 h1:NrxKAZ5uAzshB9EHcPw+XTOTzpxb5HslNRMYBrFC1Qo=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4/go.mod h1:uyvpVqG1jHDXX+ubXI0RBwnWXzVykD/mliqGQIDvRoo=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d h1:hdSxVD+AN7W6j847/GsnNgOAX5IdRQRV1KLz+d4FlS0=
|
||||
gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d/go.mod h1:m75SVNfNa1TUBlQtBfR0CeETQ0ez2RIUqlSCn1Mb/js=
|
||||
gitlink.org.cn/jcce-pcm/pcm-slurm v0.0.0-20231107115628-f74106c47dfa h1:U0YV9ju5OPpUe8iUk4OEUtYJlINgpI0vgLC1IfZ2JUY=
|
||||
|
|
Loading…
Reference in New Issue