diff --git a/api/internal/scheduler/schedulers/option/aiOption.go b/api/internal/scheduler/schedulers/option/aiOption.go index acff16de..72bf9b3c 100644 --- a/api/internal/scheduler/schedulers/option/aiOption.go +++ b/api/internal/scheduler/schedulers/option/aiOption.go @@ -7,6 +7,8 @@ type AiOption struct { DatasetsName string // mnist/imageNet/iris StrategyName string ClusterToStaticWeight map[string]int32 + Tops float64 + ComputeCard string CodeType string ImageId string diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 60a3a355..e0bfe24c 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -6,15 +6,15 @@ type AiCollector interface { } type ResourceStats struct { - ParticipantId int64 - Name string - CpuAvail float64 - MemAvail float64 - DiskAvail float64 - GpuAvail float64 - TopsToHoursAvail map[float64]float64 - Cards []Card - Balance float64 + ParticipantId int64 + Name string + CpuAvail float64 + MemAvail float64 + DiskAvail float64 + GpuAvail float64 + CardToHours map[Card]float64 + CpuToHours map[int]float64 + Balance float64 } type Card struct { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index e7c3dbb5..489b7c66 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -22,6 +22,8 @@ import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus" + "math" + "strconv" "strings" ) @@ -42,8 +44,11 @@ const ( HANWUJI = "hanwuji" SUIYUAN = "suiyuan" SAILINGSI = "sailingsi" - MLU = "mlu" + MLU = "MLU" CAMBRICONMLU290 = 256 + GCU = "GCU" + EnflameT20 = 128 + BASE_TOPS = 128 ) func NewOctopusLink(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *OctopusLink { @@ -226,8 +231,9 @@ func (o *OctopusLink) GetResourceStats() (*collector.ResourceStats, error) { } //resourceStat := collector.ResourceStats{} - + // //for _, spec := range specResp.TrainResourceSpecs { + // //} return nil, nil @@ -295,10 +301,113 @@ func (o *OctopusLink) GenerateSubmitParams(option *option.AiOption) error { } func (o *OctopusLink) generateResourceId(option *option.AiOption) error { + if option.ResourceType == "" { + return errors.New("ResourceType not set") + } + req := &octopus.GetResourceSpecsReq{ + Platform: o.platform, + ResourcePool: RESOURCE_POOL, + } + specResp, err := o.svcCtx.OctopusRpc.GetResourceSpecs(o.ctx, req) + if err != nil { + return err + } + if !specResp.Success { + return errors.New(specResp.Error.Message) + } + + if option.ResourceType == CPU { + for _, spec := range specResp.TrainResourceSpecs { + if spec.Price == 0 { + option.ResourceId = spec.Id + return nil + } + } + + } + + if option.ResourceType == CARD { + err = setResourceIdByCard(option, specResp, GCU) + if err != nil { + return err + } + } return nil } +func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error { + if option.Tops == 0 { + for _, spec := range specs.TrainResourceSpecs { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } + } else { + cardNum := math.Ceil(option.Tops / float64(BASE_TOPS)) + for _, spec := range specs.TrainResourceSpecs { + if option.Tops < BASE_TOPS { + if spec.Price == 1 { + ns := strings.Split(spec.Name, COMMA) + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] == computeCard { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + } else { + continue + } + } else { + ns := strings.Split(spec.Name, COMMA) + if len(ns) != 4 { + continue + } + cardSpecs := strings.Split(ns[0], STAR) + if cardSpecs[1] != computeCard { + continue + } + s, err := strconv.ParseFloat(cardSpecs[0], 64) + if err != nil { + return err + } + switch computeCard { + case GCU: + if cardNum == s { // 1, 4, 8 + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 1 < cardNum && cardNum <= 4 && s == 4 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + if 4 < cardNum && s == 8 { + option.ResourceId = spec.Id + option.ComputeCard = computeCard + return nil + } + + case MLU: // 1, 2, 4 + if cardNum*2 == s { + + } + } + } + } + } + return errors.New("set ResourceId error") +} + func (o *OctopusLink) generateDatasetsId(option *option.AiOption) error { if option.DatasetsName == "" { return errors.New("DatasetsName not set") diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index 84c7b2fe..cdaab2ad 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -45,6 +45,7 @@ const ( UNDERSCORE = "_" EQUAL = "=" COMMA = "," + STAR = "*" TYPE_OCTOPUS = "1" TYPE_MODELARTS = "2" TYPE_SHUGUANGAI = "3"