From 05aa692150e0cd748b1afc3788c879d92a1af8dd Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 22 Feb 2024 17:20:58 +0800 Subject: [PATCH 1/7] modified pcm.api Former-commit-id: 4ad39580a9edd828447227c920639ae40b477650 --- api/desc/pcm.api | 2 +- .../scheduler/service/collector/collector.go | 18 +++++++----- api/internal/storeLink/octopus.go | 29 ++++++++++++++++++- api/internal/storeLink/storeLink.go | 2 +- 4 files changed, 40 insertions(+), 11 deletions(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 2b741d6a..4058a122 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -634,5 +634,5 @@ service pcm { get /schedule/ai/getStrategies returns (AiStrategyResp) @handler ScheduleSubmitHandler - post /schedule/submit (ScheduleResp) returns (ScheduleResp) + post /schedule/submit (ScheduleReq) returns (ScheduleResp) } \ No newline at end of file diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 31a144a6..60a3a355 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -6,20 +6,22 @@ type AiCollector interface { } type ResourceStats struct { - ParticipantId int64 - Name string - CpuAvail float64 - MemAvail float64 - DiskAvail float64 - GpuAvail float64 - CardAvail []Card - Balance float64 + ParticipantId int64 + Name string + CpuAvail float64 + MemAvail float64 + DiskAvail float64 + GpuAvail float64 + TopsToHoursAvail map[float64]float64 + Cards []Card + Balance float64 } type Card struct { Type string Name string TOpsAtFp16 float64 + Price int32 } type DatasetsSpecs struct { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 13fe2b07..7ddeabfd 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -39,6 +39,11 @@ const ( IMG_VERSION_PREFIX = "version_" TASK_NAME_PREFIX = "trainJob" RESOURCE_POOL = "common-pool" + HANWUJI = "hanwuji" + SUIYUAN = "suiyuan" + SAILINGSI = "sailingsi" + MLU = "mlu" + CAMBRICONMLU290 = 256 ) func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink { @@ -187,7 +192,7 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) { // octopus查询资源规格 req := &octopus.GetResourceSpecsReq{ Platform: o.platform, - ResourcePool: "common-pool", + ResourcePool: RESOURCE_POOL, } resp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) if err != nil { @@ -198,6 +203,27 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) { } func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) { + req := &octopus.GetResourceSpecsReq{ + Platform: o.platform, + ResourcePool: RESOURCE_POOL, + } + specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) + if err != nil { + return nil, err + } + if !specResp.Success { + return nil, errors.New(specResp.Error.Message) + } + //_, err = o.svcCtx.OctopusRpc.GetUserBalance(o.ctx, req) + //if err != nil { + // return nil, err + //} + + //resourceStat := collector.ResourceStats{} + + //for _, spec := range specResp.TrainResourceSpecs { + //} + return nil, nil } @@ -263,6 +289,7 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error { } func (o *OctopusLink) generateResourceId(option *option.AiOption) error { + return nil } diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index b1815758..84c7b2fe 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -72,7 +72,7 @@ var ( "3": SHUGUANGAI, "4": SHUGUANGHPC, } - resourceTypes = []string{CPU, GPU, CARD} + resourceTypes = []string{CPU, CARD} taskTypes = []string{PYTORCH_TASK, TENSORFLOW_TASK} ERROR_RESP_EMPTY = errors.New("resp empty error") From e62f734614c22e1bb64add5ea25066135c2b77d4 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 22 Feb 2024 21:13:27 +0800 Subject: [PATCH 2/7] updated schedule submit Former-commit-id: d80ce1e7044fae099a899c8641d8fbb40903e31f --- .../handler/schedule/schedulesubmithandler.go | 2 +- api/internal/logic/schedule/schedulesubmitlogic.go | 2 +- api/internal/storeLink/octopus.go | 14 ++++++++++---- go.mod | 2 +- go.sum | 4 ++-- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/api/internal/handler/schedule/schedulesubmithandler.go b/api/internal/handler/schedule/schedulesubmithandler.go index f344df2d..c8725f64 100644 --- a/api/internal/handler/schedule/schedulesubmithandler.go +++ b/api/internal/handler/schedule/schedulesubmithandler.go @@ -11,7 +11,7 @@ import ( func ScheduleSubmitHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { - var req types.ScheduleResp + var req types.ScheduleReq if err := httpx.Parse(r, &req); err != nil { result.ParamErrorResult(r, w, err) return diff --git a/api/internal/logic/schedule/schedulesubmitlogic.go b/api/internal/logic/schedule/schedulesubmitlogic.go index 924aa37b..e2056bc6 100644 --- a/api/internal/logic/schedule/schedulesubmitlogic.go +++ b/api/internal/logic/schedule/schedulesubmitlogic.go @@ -23,7 +23,7 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc } } -func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleResp) (resp *types.ScheduleResp, err error) { +func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) { // todo: add your logic here and delete this line return diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 7ddeabfd..e7c3dbb5 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -214,10 +214,16 @@ func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) { if !specResp.Success { return nil, errors.New(specResp.Error.Message) } - //_, err = o.svcCtx.OctopusRpc.GetUserBalance(o.ctx, req) - //if err != nil { - // return nil, err - //} + balanceReq := &octopus.GetUserBalanceReq{ + Platform: o.platform, + } + balanceResp, err := o.svcCtx.OctopusRpc.GetUserBalance(o.ctx, balanceReq) + if err != nil { + return nil, err + } + if !balanceResp.Success { + return nil, errors.New(balanceResp.Error.Message) + } //resourceStat := collector.ResourceStats{} diff --git a/go.mod b/go.mod index 9726e06a..21e18c0a 100644 --- a/go.mod +++ b/go.mod @@ -30,7 +30,7 @@ require ( gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5db7246 gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090 - gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd + gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4 gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d gitlink.org.cn/jcce-pcm/pcm-slurm v0.0.0-20231107115628-f74106c47dfa go.opentelemetry.io/otel/trace v1.21.0 diff --git a/go.sum b/go.sum index 804759ed..89e9474f 100644 --- a/go.sum +++ b/go.sum @@ -1005,8 +1005,8 @@ gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5d gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20231214084401-de9ac5db7246/go.mod h1:LM+XeDayimN6b1AY7AhNbbhq9HJyS0u7tszMCNsNmAo= gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090 h1:jztlHo72bcWM1jUwvG3Hfk2K+AJL0RvlsdIqlktH/MI= gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231101085149-724c7c4cc090/go.mod h1:pisJKAI8FRFFUcBaH3Gob+ENXWRM97rpuYmv9s1raag= -gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd h1:9GIKpN6nel4U5jD91HL/vjzwo+EHTpE13SkT7WKyXtQ= -gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20231101091522-38307e241dfd/go.mod h1:uyvpVqG1jHDXX+ubXI0RBwnWXzVykD/mliqGQIDvRoo= +gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4 h1:NrxKAZ5uAzshB9EHcPw+XTOTzpxb5HslNRMYBrFC1Qo= +gitlink.org.cn/jcce-pcm/pcm-participant-octopus v0.0.0-20240222124813-e275cfa342f4/go.mod h1:uyvpVqG1jHDXX+ubXI0RBwnWXzVykD/mliqGQIDvRoo= gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d h1:hdSxVD+AN7W6j847/GsnNgOAX5IdRQRV1KLz+d4FlS0= gitlink.org.cn/jcce-pcm/pcm-participant-openstack v0.0.0-20231102023739-81a3d353c10d/go.mod h1:m75SVNfNa1TUBlQtBfR0CeETQ0ez2RIUqlSCn1Mb/js= gitlink.org.cn/jcce-pcm/pcm-slurm v0.0.0-20231107115628-f74106c47dfa h1:U0YV9ju5OPpUe8iUk4OEUtYJlINgpI0vgLC1IfZ2JUY= From c7597592f9edf530f53a3f45f89644a9c3198f7a Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 23 Feb 2024 17:52:02 +0800 Subject: [PATCH 3/7] modified octopus implementations Former-commit-id: 163c0a8fd054e1dcf77ad3262d48e70e314f1337 --- .../scheduler/schedulers/option/aiOption.go | 2 + .../scheduler/service/collector/collector.go | 18 +-- api/internal/storeLink/octopus.go | 113 +++++++++++++++++- api/internal/storeLink/storeLink.go | 1 + 4 files changed, 123 insertions(+), 11 deletions(-) diff --git a/api/internal/scheduler/schedulers/option/aiOption.go b/api/internal/scheduler/schedulers/option/aiOption.go index acff16de..72bf9b3c 100644 --- a/api/internal/scheduler/schedulers/option/aiOption.go +++ b/api/internal/scheduler/schedulers/option/aiOption.go @@ -7,6 +7,8 @@ type AiOption struct { DatasetsName string // mnist/imageNet/iris StrategyName string ClusterToStaticWeight map[string]int32 + Tops float64 + ComputeCard string CodeType string ImageId string diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 60a3a355..e0bfe24c 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -6,15 +6,15 @@ type AiCollector interface { } type ResourceStats struct { - ParticipantId int64 - Name string - CpuAvail float64 - MemAvail float64 - DiskAvail float64 - GpuAvail float64 - TopsToHoursAvail map[float64]float64 - Cards []Card - Balance float64 + ParticipantId int64 + Name string + CpuAvail float64 + MemAvail float64 + DiskAvail float64 + GpuAvail float64 + CardToHours map[Card]float64 + CpuToHours map[int]float64 + Balance float64 } type Card struct { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index e7c3dbb5..489b7c66 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -22,6 +22,8 @@ import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus" + "math" + "strconv" "strings" ) @@ -42,8 +44,11 @@ const ( HANWUJI = "hanwuji" SUIYUAN = "suiyuan" SAILINGSI = "sailingsi" - MLU = "mlu" + MLU = "MLU" CAMBRICONMLU290 = 256 + GCU = "GCU" + EnflameT20 = 128 + BASE_TOPS = 128 ) func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink { @@ -226,8 +231,9 @@ func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) { } //resourceStat := collector.ResourceStats{} - + // //for _, spec := range specResp.TrainResourceSpecs { + // //} return nil, nil @@ -295,10 +301,113 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error { } func (o *OctopusLink) generateResourceId(option *option.AiOption) error { + if option.ResourceType == "" { + return errors.New("ResourceType not set") + } + req := &octopus.GetResourceSpecsReq{ + Platform: o.platform, + ResourcePool: RESOURCE_POOL, + } + specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) + if err != nil { + return err + } + if !specResp.Success { + return errors.New(specResp.Error.Message) + } + + if option.ResourceType == CPU { + for _, spec := range specResp.TrainResourceSpecs { + if spec.Price == 0 { + option.ResourceId = spec.Id + return nil + } + } + + } + + if option.ResourceType == CARD { + err = setResourceIdByCard(option, specResp, GCU) + if err != nil { + return err + } + } return nil } +func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error { + if option.Tops == 0 { + for _, spec := range specs.TrainResourceSpecs { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } + } else { + cardNum := math.Ceil(option.Tops / float64(BASE_TOPS)) + for _, spec := range specs.TrainResourceSpecs { + if option.Tops < BASE_TOPS { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } else { + ns := strings.Split(spec.Name, COMMA) + if len(ns) != 4 { + continue + } + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] != computeCard { + continue + } + s, err := strconv.ParseFloat(cardSpecs[0], 64) + if err != nil { + return err + } + switch computeCard { + case GCU: + if cardNum == s { // 1, 4, 8 + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 1 < cardNum && cardNum <= 4 && s == 4 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 4 < cardNum && s == 8 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + + case MLU: // 1, 2, 4 + if cardNum*2 == s { + + } + } + } + } + } + return errors.New("set ResourceId error") +} + func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error { if option.DatasetsName == "" { return errors.New("DatasetsName not set") diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index 84c7b2fe..cdaab2ad 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -45,6 +45,7 @@ const ( UNDERSCORE = "_" EQUAL = "=" COMMA = "," + STAR = "*" TYPE_OCTOPUS = "1" TYPE_MODELARTS = "2" TYPE_SHUGUANGAI = "3" From efdf09649d9bc787030cf88b3a6c271404aebfcc Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 26 Feb 2024 00:36:35 +0800 Subject: [PATCH 4/7] updated ai octopus Former-commit-id: e1760e20de1bab6e6d9f4e5366672750be8fcc94 --- api/internal/storeLink/octopus.go | 50 +++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 489b7c66..d05dd5ce 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -49,6 +49,14 @@ const ( GCU = "GCU" EnflameT20 = 128 BASE_TOPS = 128 + CAMBRICON = "cambricon" +) + +var ( + cardAliasMap = map[string]string{ + MLU: CAMBRICON, + GCU: GCU, + } ) func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink { @@ -327,7 +335,7 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error { } if option.ResourceType == CARD { - err = setResourceIdByCard(option, specResp, GCU) + err = setResourceIdByCard(option, specResp, MLU) if err != nil { return err } @@ -398,8 +406,20 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec } case MLU: // 1, 2, 4 - if cardNum*2 == s { - + if cardNum/2 == s { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 2 < cardNum/2 && s == 4 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil } } } @@ -434,8 +454,28 @@ func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error { } func (o *OctopusLink) generateImageId(option *option.AiOption) error { - - return nil + if option.TaskType == "" { + return errors.New("TaskType not set") + } + req := &octopus.GetPresetImageListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + } + resp, err := o.svcCtx.OctopusRpc.GetPresetImageList(o.ctx, req) + if err != nil { + return err + } + if !resp.Success { + return errors.New("failed to get PresetImages") + } + for _, image := range resp.Payload.Images { + if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) { + option.ImageId = image.Id + return nil + } + } + return errors.New("failed to get ImageId") } func (o *OctopusLink) generateCmd(option *option.AiOption) error { From 40bb2d8f04a46b840c8c58b4960c4425a3d5226b Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 26 Feb 2024 17:33:08 +0800 Subject: [PATCH 5/7] modified octopus submit options Former-commit-id: 96d68b07d16acad26c8d853e54abe6d2e6e3f1bc --- api/internal/storeLink/octopus.go | 59 ++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index d05dd5ce..d5f4c5aa 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -46,7 +46,7 @@ const ( SAILINGSI = "sailingsi" MLU = "MLU" CAMBRICONMLU290 = 256 - GCU = "GCU" + GCU = "enflame" EnflameT20 = 128 BASE_TOPS = 128 CAMBRICON = "cambricon" @@ -293,6 +293,10 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error { if err != nil { return err } + err = o.generateAlgorithmId(option) + if err != nil { + return err + } err = o.generateCmd(option) if err != nil { return err @@ -457,19 +461,42 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error { if option.TaskType == "" { return errors.New("TaskType not set") } - req := &octopus.GetPresetImageListReq{ + + req := &octopus.GetUserImageListReq{ Platform: o.platform, PageIndex: o.pageIndex, PageSize: o.pageSize, } - resp, err := o.svcCtx.OctopusRpc.GetPresetImageList(o.ctx, req) + resp, err := o.svcCtx.OctopusRpc.GetUserImageList(o.ctx, req) if err != nil { return err } if !resp.Success { + return errors.New("failed to get imageId") + } + + if option.ResourceType == CPU { + for _, img := range resp.Payload.Images { + if img.Image.ImageName == "test-image" { + option.ImageId = img.Image.Id + return nil + } + } + } + + preImgReq := &octopus.GetPresetImageListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + } + preImgResp, err := o.svcCtx.OctopusRpc.GetPresetImageList(o.ctx, preImgReq) + if err != nil { + return err + } + if !preImgResp.Success { return errors.New("failed to get PresetImages") } - for _, image := range resp.Payload.Images { + for _, image := range preImgResp.Payload.Images { if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) { option.ImageId = image.Id return nil @@ -478,6 +505,30 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error { return errors.New("failed to get ImageId") } +func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error { + req := &octopus.GetMyAlgorithmListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + } + resp, err := o.svcCtx.OctopusRpc.GetMyAlgorithmList(o.ctx, req) + if err != nil { + return err + } + if !resp.Success { + return errors.New("failed to get algorithmId") + } + + for _, algorithm := range resp.Payload.Algorithms { + if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) { + option.AlgorithmId = algorithm.AlgorithmId + return nil + } + } + + return nil +} + func (o *OctopusLink) generateCmd(option *option.AiOption) error { return nil From e2ffaeee3754b1cbf018ae171e240fffd03b903d Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 27 Feb 2024 17:46:10 +0800 Subject: [PATCH 6/7] modified storelink queryspec implications Former-commit-id: 41c7236f5274563cfd4f251805a9a366a6814f58 --- api/internal/storeLink/shuguangHpc.go | 12 ++-- api/internal/storeLink/shuguangai.go | 80 +++++++++++++++++++++------ api/internal/storeLink/storeLink.go | 2 + 3 files changed, 71 insertions(+), 23 deletions(-) diff --git a/api/internal/storeLink/shuguangHpc.go b/api/internal/storeLink/shuguangHpc.go index 85b4c32e..ed7197c4 100644 --- a/api/internal/storeLink/shuguangHpc.go +++ b/api/internal/storeLink/shuguangHpc.go @@ -45,7 +45,7 @@ const ( Username = "acgnnmfbwo" ) -var RESOURCEMAP = map[string]ResourceSpec{ +var RESOURCEMAP = map[string]ResourceSpecHpc{ "FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": { GAP_NNODE: "1", GAP_NPROC: "1", @@ -98,7 +98,7 @@ var RESOURCEMAP = map[string]ResourceSpec{ }, } -var RESOURCESPECS = map[string]string{ +var RESOURCESPECSHPC = map[string]string{ "FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": "1*NODE, CPU:1, 1*DCU", "Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": "1*NODE, CPU:2, 1*DCU", "uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": "1*NODE, CPU:4, 2*DCU", @@ -122,7 +122,7 @@ var AcStatus = map[string]string{ "statX": "Other", } -type ResourceSpec struct { +type ResourceSpecHpc struct { GAP_NNODE string GAP_NPROC string GAP_NDCU string @@ -148,7 +148,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param // shuguangHpc提交任务 //判断是否resourceId匹配自定义资源Id - _, isMapContainsKey := RESOURCESPECS[resourceId] + _, isMapContainsKey := RESOURCESPECSHPC[resourceId] if !isMapContainsKey { return nil, errors.New("shuguangHpc资源Id不存在") } @@ -233,9 +233,9 @@ func (s ShuguangHpc) QueryTask(taskId string) (interface{}, error) { } func (s ShuguangHpc) QuerySpecs() (interface{}, error) { - var resp types.GetResourceSpecsResp + resp := &types.GetResourceSpecsResp{} - for k, v := range RESOURCESPECS { + for k, v := range RESOURCESPECSHPC { var respec types.ResourceSpecSl respec.SpecId = k respec.SpecName = v diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index c2898513..0d908469 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -21,19 +21,14 @@ import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/types" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" "strconv" "strings" ) -type ShuguangAi struct { - ctx context.Context - svcCtx *svc.ServiceContext - platform string - participantId int64 -} - const ( + RAM_SIZE_1G = 1024 // 1G WORKER_RAM_SIZE = 10240 // 10G WORKER_NUMBER = 1 WORKER_CPU_NUMBER = 5 @@ -45,12 +40,61 @@ const ( TASK_PYTORCH_PREFIX = "PytorchTask" TENSORFLOW = "Tensorflow" RESOURCE_GROUP = "wzhdtest" - WorkPath = "/work/home/acgnnmfbwo/111111/py/" + WorkPath = "/work/home/acgnnmfbwo/pcmv1/" TimeoutLimit = "10:00:00" PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" ) +var RESOURCESGMAP = map[string]ResourceSpecSG{ + "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { + CPU: 1, + GPU: 1, + RAM: 2 * RAM_SIZE_1G, + }, + "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": { + CPU: 1, + GPU: 2, + RAM: 2 * RAM_SIZE_1G, + }, + "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": { + CPU: 2, + GPU: 1, + RAM: 4 * RAM_SIZE_1G, + }, + "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": { + CPU: 5, + GPU: 1, + RAM: 10 * RAM_SIZE_1G, + }, + "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": { + CPU: 5, + GPU: 2, + RAM: 10 * RAM_SIZE_1G, + }, +} + +var RESOURCESPECSAI = map[string]string{ + "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G", + "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G", + "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:1, RAM:4G", + "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:5, DCU:1, RAM:10G", + "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G", +} + +type ResourceSpecSG struct { + CPU int64 + GPU int64 + RAM int64 +} + +type ShuguangAi struct { + ctx context.Context + svcCtx *svc.ServiceContext + platform string + participantId int64 +} + func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *ShuguangAi { return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, platform: name, participantId: id} } @@ -187,17 +231,19 @@ func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) { } func (s *ShuguangAi) QuerySpecs() (interface{}, error) { - // ShuguangAi查询资源规格 - req := &hpcAC.GetResourceSpecReq{ - AcceleratorType: DCU, - ResourceGroup: RESOURCE_GROUP, - } - specs, err := s.svcCtx.ACRpc.GetResourceSpec(s.ctx, req) - if err != nil { - return nil, err + resp := &types.GetResourceSpecsResp{} + + for k, v := range RESOURCESPECSAI { + var respec types.ResourceSpecSl + respec.SpecId = k + respec.SpecName = v + respec.ParticipantId = s.participantId + respec.ParticipantName = s.platform + resp.ResourceSpecs = append(resp.ResourceSpecs, &respec) } - return specs, nil + resp.Success = true + return resp, nil } func (s *ShuguangAi) GetResourceStats() (*collector.ResourceStats, error) { diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index cdaab2ad..824e196a 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -156,6 +156,8 @@ func GetTaskTypes() []string { func ConvertType(in interface{}, out interface{}, participant *models.StorelinkCenter) (interface{}, error) { switch (interface{})(in).(type) { + case *types.GetResourceSpecsResp: + return in, nil case *octopus.UploadImageResp: inresp := (interface{})(in).(*octopus.UploadImageResp) switch (interface{})(out).(type) { From 1356ad593249a9f97a0a87858bc61dad651e752d Mon Sep 17 00:00:00 2001 From: tzwang Date: Wed, 28 Feb 2024 16:38:03 +0800 Subject: [PATCH 7/7] modified octopus, shuguangai implications Former-commit-id: 62cac010a042f1c1fa932fad479eccfcb6c58156 --- api/internal/storeLink/octopus.go | 9 +++++++++ api/internal/storeLink/shuguangHpc.go | 4 ++-- api/internal/storeLink/shuguangai.go | 23 +++++++++++++++-------- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index d5f4c5aa..afdc0037 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -50,6 +50,8 @@ const ( EnflameT20 = 128 BASE_TOPS = 128 CAMBRICON = "cambricon" + TRAIN_CMD = "cd /code; python train.py" + VERSION = "V1" ) var ( @@ -163,6 +165,10 @@ func (o *OctopusLink) SubmitTask(imageId string, cmd string, envs []string, para Envs: envMap, }, }, + DataSetId: datasetsId, + DataSetVersion: VERSION, + AlgorithmId: algorithmId, + AlgorithmVersion: VERSION, }, } resp, err := o.svcCtx.OctopusRpc.CreateTrainJob(o.ctx, req) @@ -530,6 +536,9 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error { } func (o *OctopusLink) generateCmd(option *option.AiOption) error { + if option.Cmd == "" { + option.Cmd = TRAIN_CMD + } return nil } diff --git a/api/internal/storeLink/shuguangHpc.go b/api/internal/storeLink/shuguangHpc.go index ed7197c4..1f2b466e 100644 --- a/api/internal/storeLink/shuguangHpc.go +++ b/api/internal/storeLink/shuguangHpc.go @@ -192,7 +192,7 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param }, } - updateRequestByResourceId(resourceId, req) + updateSGHpcRequestByResourceId(resourceId, req) resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req) if err != nil { @@ -261,7 +261,7 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) { return resp, nil } -func updateRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) { +func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) { spec := RESOURCEMAP[resourceId] req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 0d908469..4315a3d1 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -28,8 +28,7 @@ import ( ) const ( - RAM_SIZE_1G = 1024 // 1G - WORKER_RAM_SIZE = 10240 // 10G + RAM_SIZE_1G = 1024 // 1G WORKER_NUMBER = 1 WORKER_CPU_NUMBER = 5 WORKER_GPU_NUMBER = 1 @@ -46,7 +45,7 @@ const ( DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" ) -var RESOURCESGMAP = map[string]ResourceSpecSG{ +var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { CPU: 1, GPU: 1, @@ -82,7 +81,7 @@ var RESOURCESPECSAI = map[string]string{ "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G", } -type ResourceSpecSG struct { +type ResourceSpecSGAI struct { CPU int64 GPU int64 RAM int64 @@ -123,7 +122,8 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) { func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { //判断是否resourceId匹配自定义资源Id - if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID { + _, isMapContainsKey := RESOURCESPECSAI[resourceId] + if !isMapContainsKey { return nil, errors.New("shuguangAi资源Id不存在") } @@ -159,15 +159,15 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string Version: imageResp.Image.Version, ImagePath: imageResp.Image.Path, WorkerNumber: WORKER_NUMBER, - WorkerCpuNumber: WORKER_CPU_NUMBER, - WorkerGpuNumber: WORKER_GPU_NUMBER, - WorkerRamSize: WORKER_RAM_SIZE, ResourceGroup: RESOURCE_GROUP, TimeoutLimit: TimeoutLimit, PythonCodePath: PythonCodePath, PythonArg: pythonArg, }, } + + updateSGAIRequestByResourceId(resourceId, req) + resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req) if err != nil { return nil, err @@ -176,6 +176,13 @@ func (s *ShuguangAi) SubmitPytorchTask(imageId string, cmd string, envs []string return resp, nil } +func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) { + spec := RESOURCESGAIMAP[resourceId] + req.Params.WorkerCpuNumber = spec.CPU + req.Params.WorkerGpuNumber = spec.GPU + req.Params.WorkerRamSize = spec.RAM +} + func (s *ShuguangAi) SubmitTensorflowTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) { //req := &hpcAC.SubmitTensorflowTaskReq{ // Params: &hpcAC.SubmitTensorflowTaskParams{