diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 2b741d6a..4058a122 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -634,5 +634,5 @@ service pcm { get /schedule/ai/getStrategies returns (AiStrategyResp) @handler ScheduleSubmitHandler - post /schedule/submit (ScheduleResp) returns (ScheduleResp) + post /schedule/submit (ScheduleReq) returns (ScheduleResp) } \ No newline at end of file diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 31a144a6..60a3a355 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -6,20 +6,22 @@ type AiCollector interface { } type ResourceStats struct { - ParticipantId int64 - Name string - CpuAvail float64 - MemAvail float64 - DiskAvail float64 - GpuAvail float64 - CardAvail []Card - Balance float64 + ParticipantId int64 + Name string + CpuAvail float64 + MemAvail float64 + DiskAvail float64 + GpuAvail float64 + TopsToHoursAvail map[float64]float64 + Cards []Card + Balance float64 } type Card struct { Type string Name string TOpsAtFp16 float64 + Price int32 } type DatasetsSpecs struct { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 13fe2b07..7ddeabfd 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -39,6 +39,11 @@ const ( IMG_VERSION_PREFIX = "version_" TASK_NAME_PREFIX = "trainJob" RESOURCE_POOL = "common-pool" + HANWUJI = "hanwuji" + SUIYUAN = "suiyuan" + SAILINGSI = "sailingsi" + MLU = "mlu" + CAMBRICONMLU290 = 256 ) func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink { @@ -187,7 +192,7 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) { // octopus查询资源规格 req := &octopus.GetResourceSpecsReq{ Platform: o.platform, - ResourcePool: "common-pool", + ResourcePool: RESOURCE_POOL, } resp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) if err != nil { @@ -198,6 +203,27 @@ func (o *OctopusLink) QuerySpecs() (interface{}, error) { } func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) { + req := &octopus.GetResourceSpecsReq{ + Platform: o.platform, + ResourcePool: RESOURCE_POOL, + } + specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) + if err != nil { + return nil, err + } + if !specResp.Success { + return nil, errors.New(specResp.Error.Message) + } + //_, err = o.svcCtx.OctopusRpc.GetUserBalance(o.ctx, req) + //if err != nil { + // return nil, err + //} + + //resourceStat := collector.ResourceStats{} + + //for _, spec := range specResp.TrainResourceSpecs { + //} + return nil, nil } @@ -263,6 +289,7 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error { } func (o *OctopusLink) generateResourceId(option *option.AiOption) error { + return nil } diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index b1815758..84c7b2fe 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -72,7 +72,7 @@ var ( "3": SHUGUANGAI, "4": SHUGUANGHPC, } - resourceTypes = []string{CPU, GPU, CARD} + resourceTypes = []string{CPU, CARD} taskTypes = []string{PYTORCH_TASK, TENSORFLOW_TASK} ERROR_RESP_EMPTY = errors.New("resp empty error")