diff --git a/api/internal/mqs/ScheduleAi.go b/api/internal/mqs/ScheduleAi.go index ec5c942e..6cc7dc2f 100644 --- a/api/internal/mqs/ScheduleAi.go +++ b/api/internal/mqs/ScheduleAi.go @@ -17,7 +17,9 @@ package mqs import ( "context" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" - scheduler2 "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/schedulers" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service" ) /* @@ -25,36 +27,29 @@ import ( Listening to the payment flow status change notification message queue */ type AiQueue struct { - ctx context.Context - svcCtx *svc.ServiceContext + ctx context.Context + svcCtx *svc.ServiceContext + scheduler *scheduler.Scheduler } func NewAiMq(ctx context.Context, svcCtx *svc.ServiceContext) *AiQueue { + aiExecutorMap, aiCollectorMap := service.InitAiClusterMap(svcCtx.ACRpc, svcCtx.ModelArtsRpc, svcCtx.ModelArtsImgRpc, svcCtx.OctopusRpc) return &AiQueue{ - ctx: ctx, - svcCtx: svcCtx, + ctx: ctx, + svcCtx: svcCtx, + scheduler: scheduler.NewScheduler2(aiCollectorMap, nil, aiExecutorMap), } } func (l *AiQueue) Consume(val string) error { // 接受消息, 根据标签筛选过滤 - aiSchdl := scheduler2.NewAiScheduler(val) - schdl, err := scheduler2.NewScheduler(aiSchdl, val, l.svcCtx.DbEngin, nil) - if err != nil { - return err - } - schdl.MatchLabels() + aiSchdl, _ := schedulers.NewAiScheduler(val, l.scheduler) // 调度算法 - err = schdl.AssignAndSchedule() + err := l.scheduler.AssignAndSchedule(aiSchdl) if err != nil { return err } - // 存储数据 - err = schdl.SaveToDb() - if err != nil { - return err - } return nil } diff --git a/api/internal/mqs/ScheduleCloud.go b/api/internal/mqs/ScheduleCloud.go index c40a5e9b..cb7c9d7b 100644 --- a/api/internal/mqs/ScheduleCloud.go +++ b/api/internal/mqs/ScheduleCloud.go @@ -18,6 +18,7 @@ import ( "context" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/schedulers" ) /* @@ -37,7 +38,7 @@ func NewCloudMq(ctx context.Context, svcCtx *svc.ServiceContext) *CloudMq { func (l *CloudMq) Consume(val string) error { // 接受消息, 根据标签筛选过滤 - cloudScheduler := scheduler.NewCloudScheduler() + cloudScheduler := schedulers.NewCloudScheduler() schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc) if err != nil { return err diff --git a/api/internal/mqs/ScheduleHpc.go b/api/internal/mqs/ScheduleHpc.go index 1e188e92..f0b56aee 100644 --- a/api/internal/mqs/ScheduleHpc.go +++ b/api/internal/mqs/ScheduleHpc.go @@ -17,7 +17,6 @@ package mqs import ( "context" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" - scheduler2 "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler" ) /* @@ -37,24 +36,5 @@ func NewHpcMq(ctx context.Context, svcCtx *svc.ServiceContext) *HpcMq { } func (l *HpcMq) Consume(val string) error { - // 接受消息, 根据标签筛选过滤 - hpcSchdl := scheduler2.NewHpcScheduler(val) - schdl, err := scheduler2.NewScheduler(hpcSchdl, val, l.svcCtx.DbEngin, nil) - if err != nil { - return err - } - schdl.MatchLabels() - - // 调度算法 - err = schdl.AssignAndSchedule() - if err != nil { - return err - } - - // 存储数据 - err = schdl.SaveToDb() - if err != nil { - return err - } return nil } diff --git a/pkg/scheduler/aiScheduler.go b/pkg/scheduler/aiScheduler.go deleted file mode 100644 index 52708f30..00000000 --- a/pkg/scheduler/aiScheduler.go +++ /dev/null @@ -1,58 +0,0 @@ -/* - - Copyright (c) [2023] [pcm] - [pcm-coordinator] is licensed under Mulan PSL v2. - You can use this software according to the terms and conditions of the Mulan PSL v2. - You may obtain a copy of Mulan PSL v2 at: - http://license.coscl.org.cn/MulanPSL2 - THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, - EITHER EXPaRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, - MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. - See the Mulan PSL v2 for more details. - -*/ - -package scheduler - -import ( - "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/collector" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" -) - -type AiScheduler struct { - yamlString string - collector collector.ResourceCollector -} - -func NewAiScheduler(val string) *AiScheduler { - return &AiScheduler{yamlString: val} -} - -func (as *AiScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { - ai := models.Ai{ - ParticipantId: participantId, - TaskId: task.TaskId, - Status: "Saved", - YamlString: as.yamlString, - } - utils.Convert(task.Metadata, &ai) - return ai, nil -} - -func (as *AiScheduler) pickOptimalStrategy() (strategies.Strategy, error) { - //a, b := as.genTaskAndProviders() - - return nil, nil -} - -func (as *AiScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider) { - return nil, nil -} - -func (as *AiScheduler) assignTask(clusters []*strategies.AssignedCluster) error { - return nil -} diff --git a/pkg/scheduler/collector/acCollector.go b/pkg/scheduler/collector/acCollector.go deleted file mode 100644 index fdeee694..00000000 --- a/pkg/scheduler/collector/acCollector.go +++ /dev/null @@ -1,13 +0,0 @@ -package collector - -//单条作业费=作业运行秒数×(CPU核心数*CPU单价+GPU卡数×GPU单价+DCU卡数×DCU单价)/3600 -//CPU单价=队列CPU费率×计算中心CPU单价 -//GPU单价=队列GPU费率×计算中心GPU单价 -//DCU单价=队列DCU费率×计算中心DCU单价 - -type ShuguangAiCollector struct { -} - -func (a *ShuguangAiCollector) getResourceSpecs() { - -} diff --git a/pkg/scheduler/collector/collector.go b/pkg/scheduler/collector/collector.go deleted file mode 100644 index 73bcffd0..00000000 --- a/pkg/scheduler/collector/collector.go +++ /dev/null @@ -1,19 +0,0 @@ -package collector - -type ResourceCollector interface { - getResourceSpecs() ([]ResourceSpecs, error) -} - -type ResourceSpecs struct { - CpuAvail float64 - MemAvail float64 - DiskAvail float64 - GpuAvail float64 - CardAvail []Card -} - -type Card struct { - Type string - Name string - TOpsAtFp16 float64 -} diff --git a/pkg/scheduler/common.go b/pkg/scheduler/common/common.go similarity index 76% rename from pkg/scheduler/common.go rename to pkg/scheduler/common/common.go index 7b2cab8e..fb71cc8c 100644 --- a/pkg/scheduler/common.go +++ b/pkg/scheduler/common/common.go @@ -12,23 +12,23 @@ */ -package scheduler +package common import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy" "math/rand" "time" ) -type scheduleService interface { - getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) - pickOptimalStrategy() (strategies.Strategy, error) - assignTask(clusters []*strategies.AssignedCluster) error +type SubSchedule interface { + GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) + PickOptimalStrategy() (strategy.Strategy, error) + AssignTask(clusters []*strategy.AssignedCluster) error } // 求交集 -func intersect(slice1, slice2 []int64) []int64 { +func Intersect(slice1, slice2 []int64) []int64 { m := make(map[int64]int) nn := make([]int64, 0) for _, v := range slice1 { @@ -44,7 +44,7 @@ func intersect(slice1, slice2 []int64) []int64 { return nn } -func micsSlice(origin []int64, count int) []int64 { +func MicsSlice(origin []int64, count int) []int64 { tmpOrigin := make([]int64, len(origin)) copy(tmpOrigin, origin) //一定要seed diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index e04952a9..49d3a150 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -19,31 +19,42 @@ import ( "github.com/pkg/errors" "github.com/zeromicro/go-zero/core/logx" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/common" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/database" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor" "gitlink.org.cn/jcce-pcm/pcm-coordinator/rpc/client/participantservice" "gorm.io/gorm" "sigs.k8s.io/yaml" "strings" ) -type scheduler struct { - task *response.TaskInfo - participantIds []int64 - scheduleService scheduleService - dbEngin *gorm.DB - result []string //pID:子任务yamlstring 键值对 - participantRpc participantservice.ParticipantService +type Scheduler struct { + task *response.TaskInfo + participantIds []int64 + subSchedule common.SubSchedule + dbEngin *gorm.DB + result []string //pID:子任务yamlstring 键值对 + participantRpc participantservice.ParticipantService + ResourceCollector *map[string]collector.ResourceCollector + Storages database.Storage + AiExecutor *map[string]executor.Executor } -func NewScheduler(scheduleService scheduleService, val string, dbEngin *gorm.DB, participantRpc participantservice.ParticipantService) (*scheduler, error) { +func NewScheduler(subSchedule common.SubSchedule, val string, dbEngin *gorm.DB, participantRpc participantservice.ParticipantService) (*Scheduler, error) { var task *response.TaskInfo err := json.Unmarshal([]byte(val), &task) if err != nil { return nil, errors.New("create scheduler failed : " + err.Error()) } - return &scheduler{task: task, scheduleService: scheduleService, dbEngin: dbEngin, participantRpc: participantRpc}, nil + return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil } -func (s *scheduler) SpecifyClusters() { +func NewScheduler2(resourceCollector *map[string]collector.ResourceCollector, storages database.Storage, aiExecutor *map[string]executor.Executor) *Scheduler { + return &Scheduler{ResourceCollector: resourceCollector, Storages: storages, AiExecutor: aiExecutor} +} + +func (s *Scheduler) SpecifyClusters() { // 如果已指定集群名,通过数据库查询后返回p端ip列表 if len(s.task.Clusters) != 0 { s.dbEngin.Raw("select id from sc_participant_phy_info where `name` in (?)", s.task.Clusters).Scan(&s.participantIds) @@ -51,7 +62,7 @@ func (s *scheduler) SpecifyClusters() { } } -func (s *scheduler) SpecifyNsID() { +func (s *Scheduler) SpecifyNsID() { // 未指定集群名,只指定nsID if len(s.task.Clusters) == 0 { if len(s.task.NsID) != 0 { @@ -67,7 +78,7 @@ func (s *scheduler) SpecifyNsID() { } } -func (s *scheduler) MatchLabels() { +func (s *Scheduler) MatchLabels() { var ids []int64 count := 0 @@ -80,7 +91,7 @@ func (s *scheduler) MatchLabels() { if count == 0 { ids = participantIds } - ids = intersect(ids, participantIds) + ids = common.Intersect(ids, participantIds) count++ } s.participantIds = ids @@ -90,7 +101,7 @@ func (s *scheduler) MatchLabels() { } // TempAssign todo 屏蔽原调度算法 -func (s *scheduler) TempAssign() error { +func (s *Scheduler) TempAssign() error { //需要判断task中的资源类型,针对metadata中的多个kind做不同处理 //输入副本数和集群列表,最终结果输出为pID对应副本数量列表,针对多个kind需要做拆分和重新拼接组合 @@ -110,28 +121,28 @@ func (s *scheduler) TempAssign() error { return nil } -func (s *scheduler) AssignAndSchedule() error { - // 已指定 ParticipantId - if s.task.ParticipantId != 0 { - return nil - } - // 标签匹配以及后,未找到ParticipantIds - if len(s.participantIds) == 0 { - return errors.New("未找到匹配的ParticipantIds") - } +func (s *Scheduler) AssignAndSchedule(ss common.SubSchedule) error { + //// 已指定 ParticipantId + //if s.task.ParticipantId != 0 { + // return nil + //} + //// 标签匹配以及后,未找到ParticipantIds + //if len(s.participantIds) == 0 { + // return errors.New("未找到匹配的ParticipantIds") + //} + // + //// 指定或者标签匹配的结果只有一个集群,给任务信息指定 + //if len(s.participantIds) == 1 { + // s.task.ParticipantId = s.participantIds[0] + // //replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64) + // //result := make(map[int64]string) + // //result[s.participantIds[0]] = strconv.FormatFloat(replicas, 'f', 2, 64) + // //s.result = result + // + // return nil + //} - // 指定或者标签匹配的结果只有一个集群,给任务信息指定 - if len(s.participantIds) == 1 { - s.task.ParticipantId = s.participantIds[0] - //replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64) - //result := make(map[int64]string) - //result[s.participantIds[0]] = strconv.FormatFloat(replicas, 'f', 2, 64) - //s.result = result - - return nil - } - - strategy, err := s.scheduleService.pickOptimalStrategy() + strategy, err := ss.PickOptimalStrategy() if err != nil { return err } @@ -147,7 +158,7 @@ func (s *scheduler) AssignAndSchedule() error { // return nil //} - err = s.scheduleService.assignTask(clusters) + err = ss.AssignTask(clusters) if err != nil { return err } @@ -155,12 +166,12 @@ func (s *scheduler) AssignAndSchedule() error { return nil } -func (s *scheduler) SaveToDb() error { +func (s *Scheduler) SaveToDb() error { for _, participantId := range s.participantIds { for _, resource := range s.task.Metadata { - structForDb, err := s.scheduleService.getNewStructForDb(s.task, resource, participantId) + structForDb, err := s.subSchedule.GetNewStructForDb(s.task, resource, participantId) if err != nil { return err } diff --git a/pkg/scheduler/schedulers/aiScheduler.go b/pkg/scheduler/schedulers/aiScheduler.go new file mode 100644 index 00000000..d127b266 --- /dev/null +++ b/pkg/scheduler/schedulers/aiScheduler.go @@ -0,0 +1,102 @@ +/* + + Copyright (c) [2023] [pcm] + [pcm-coordinator] is licensed under Mulan PSL v2. + You can use this software according to the terms and conditions of the Mulan PSL v2. + You may obtain a copy of Mulan PSL v2 at: + http://license.coscl.org.cn/MulanPSL2 + THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + EITHER EXPaRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + See the Mulan PSL v2 for more details. + +*/ + +package schedulers + +import ( + "errors" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/entity" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" +) + +type AiScheduler struct { + yamlString string + task *response.TaskInfo + *scheduler.Scheduler +} + +func NewAiScheduler(val string, scheduler *scheduler.Scheduler) (*AiScheduler, error) { + return &AiScheduler{yamlString: val, Scheduler: scheduler}, nil +} + +func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { + ai := models.Ai{ + ParticipantId: participantId, + TaskId: task.TaskId, + Status: "Saved", + YamlString: as.yamlString, + } + utils.Convert(task.Metadata, &ai) + return ai, nil +} + +func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { + resources, err := as.findProvidersWithResource() + if err != nil { + return nil, err + } + + if len(resources) < 2 /*|| as.task */ { + var pros []entity.Participant + for _, resource := range resources { + pros = append(pros, entity.Participant{ + Participant_id: resource.ParticipantId, + Name: resource.Name, + }) + } + strategy := strategy.NewReplicationStrategy(nil, 0) + return strategy, nil + } + + task, providerList := as.genTaskAndProviders() + if err != nil { + return nil, nil + } + strategy := strategy.NewPricingStrategy(task, providerList...) + return strategy, nil +} + +func (as *AiScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider) { + + return nil, nil +} + +func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) error { + if clusters == nil { + return errors.New("clusters is nil") + } + + return nil +} + +func (as *AiScheduler) findProvidersWithResource() ([]*collector.ResourceSpecs, error) { + var resourceSpecs []*collector.ResourceSpecs + for _, resourceCollector := range *as.ResourceCollector { + spec, err := resourceCollector.GetResourceSpecs() + if err != nil { + continue + } + resourceSpecs = append(resourceSpecs, spec) + } + if len(resourceSpecs) == 0 { + return nil, errors.New("no resource found") + } + return resourceSpecs, nil +} diff --git a/pkg/scheduler/cloudScheduler.go b/pkg/scheduler/schedulers/cloudScheduler.go similarity index 90% rename from pkg/scheduler/cloudScheduler.go rename to pkg/scheduler/schedulers/cloudScheduler.go index 95121182..89d46795 100644 --- a/pkg/scheduler/cloudScheduler.go +++ b/pkg/scheduler/schedulers/cloudScheduler.go @@ -12,7 +12,7 @@ */ -package scheduler +package schedulers import ( "bytes" @@ -20,7 +20,7 @@ import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/database" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" "io" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -37,17 +37,17 @@ func NewCloudScheduler() *CloudScheduler { return &CloudScheduler{} } -func (cs *CloudScheduler) pickOptimalStrategy() (strategies.Strategy, error) { +func (cs *CloudScheduler) PickOptimalStrategy() (strategy.Strategy, error) { task, providerList, err := cs.genTaskAndProviders() if err != nil { return nil, nil } //调度算法 - strategy := strategies.NewPricingStrategy(task, providerList...) + strategy := strategy.NewPricingStrategy(task, providerList...) return strategy, nil } -func (cs *CloudScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { +func (cs *CloudScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { cloud := cs.UnMarshalK8sStruct(resource, task.TaskId, task.NsID) cloud.Id = utils.GenSnowflakeID() cloud.NsID = task.NsID @@ -117,6 +117,6 @@ func (cs *CloudScheduler) genTaskAndProviders() (*providerPricing.Task, []*provi return nil, providerList, nil } -func (cs *CloudScheduler) assignTask(clusters []*strategies.AssignedCluster) error { +func (cs *CloudScheduler) AssignTask(clusters []*strategy.AssignedCluster) error { return nil } diff --git a/pkg/scheduler/hpcScheduler.go b/pkg/scheduler/schedulers/hpcScheduler.go similarity index 82% rename from pkg/scheduler/hpcScheduler.go rename to pkg/scheduler/schedulers/hpcScheduler.go index af6416e6..92d49d84 100644 --- a/pkg/scheduler/hpcScheduler.go +++ b/pkg/scheduler/schedulers/hpcScheduler.go @@ -12,14 +12,14 @@ */ -package scheduler +package schedulers import ( "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/constants" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing" - "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" ) @@ -31,7 +31,7 @@ func NewHpcScheduler(val string) *HpcScheduler { return &HpcScheduler{yamlString: val} } -func (h *HpcScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { +func (h *HpcScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { hpc := models.Hpc{} utils.Convert(task.Metadata, &hpc) hpc.Id = utils.GenSnowflakeID() @@ -42,7 +42,7 @@ func (h *HpcScheduler) getNewStructForDb(task *response.TaskInfo, resource strin return hpc, nil } -func (h *HpcScheduler) pickOptimalStrategy() (strategies.Strategy, error) { +func (h *HpcScheduler) PickOptimalStrategy() (strategy.Strategy, error) { return nil, nil } @@ -50,6 +50,6 @@ func (h *HpcScheduler) genTaskAndProviders(task *response.TaskInfo) (*providerPr return nil, nil } -func (h *HpcScheduler) assignTask(clusters []*strategies.AssignedCluster) error { +func (h *HpcScheduler) AssignTask(clusters []*strategy.AssignedCluster) error { return nil } diff --git a/pkg/scheduler/schedulers/vmScheduler.go b/pkg/scheduler/schedulers/vmScheduler.go new file mode 100644 index 00000000..ad4b7de0 --- /dev/null +++ b/pkg/scheduler/schedulers/vmScheduler.go @@ -0,0 +1,24 @@ +package schedulers + +import ( + "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy" +) + +type VmScheduler struct { +} + +func (v VmScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { + //TODO implement me + panic("implement me") +} + +func (v VmScheduler) PickOptimalStrategy() (strategy.Strategy, error) { + //TODO implement me + panic("implement me") +} + +func (v VmScheduler) AssignTask(clusters []*strategy.AssignedCluster) error { + //TODO implement me + panic("implement me") +} diff --git a/pkg/scheduler/service/aiService.go b/pkg/scheduler/service/aiService.go new file mode 100644 index 00000000..9fa919dd --- /dev/null +++ b/pkg/scheduler/service/aiService.go @@ -0,0 +1,49 @@ +package service + +import ( + "gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/impl" + "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" + "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" + "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopusclient" +) + +const ( + OCTOPUS = "Octopus" + MODELARTS = "Modelarts" + SHUGUANGAI = "ShuguangAi" +) + +var ( + AiTypeMap = map[string]string{ + "Hanwuji": OCTOPUS, + "Suiyan": OCTOPUS, + "Sailingsi": OCTOPUS, + "modelarts-CloudBrain2": MODELARTS, + "ShuguangAi": SHUGUANGAI, + } +) + +func InitAiClusterMap(ACRpc hpcacclient.HpcAC, ModelArtsRpc modelartsservice.ModelArtsService, ModelArtsImgRpc imagesservice.ImagesService, OctopusRpc octopusclient.Octopus) (*map[string]executor.Executor, *map[string]collector.ResourceCollector) { + executorMap := make(map[string]executor.Executor) + collectorMap := make(map[string]collector.ResourceCollector) + for k, v := range AiTypeMap { + switch v { + case OCTOPUS: + octopus := impl.NewOctopusExecutor(OctopusRpc, k) + collectorMap[k] = octopus + executorMap[k] = octopus + case MODELARTS: + modelarts := impl.NewModelartsExecutor(ModelArtsRpc, ModelArtsImgRpc, k) + collectorMap[k] = modelarts + executorMap[k] = modelarts + case SHUGUANGAI: + sgai := impl.NewShuguangAiExecutor(ACRpc, k) + collectorMap[k] = sgai + executorMap[k] = sgai + } + } + return &executorMap, &collectorMap +} diff --git a/pkg/scheduler/service/collector/collector.go b/pkg/scheduler/service/collector/collector.go new file mode 100644 index 00000000..0f1d5720 --- /dev/null +++ b/pkg/scheduler/service/collector/collector.go @@ -0,0 +1,22 @@ +package collector + +type ResourceCollector interface { + GetResourceSpecs() (*ResourceSpecs, error) +} + +type ResourceSpecs struct { + ParticipantId int64 + Name string + CpuAvail float64 + MemAvail float64 + DiskAvail float64 + GpuAvail float64 + CardAvail []Card + Balance float64 +} + +type Card struct { + Type string + Name string + TOpsAtFp16 float64 +} diff --git a/pkg/scheduler/service/executor/executor.go b/pkg/scheduler/service/executor/executor.go new file mode 100644 index 00000000..61055927 --- /dev/null +++ b/pkg/scheduler/service/executor/executor.go @@ -0,0 +1,17 @@ +package executor + +type Executor interface { + QueryImageList() ([]Image, error) + SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (Task, error) + QueryTask(taskId string) (Task, error) + QuerySpecs() (Spec, error) +} + +type Image struct { +} + +type Task struct { +} + +type Spec struct { +} diff --git a/pkg/scheduler/service/impl/modelarts.go b/pkg/scheduler/service/impl/modelarts.go new file mode 100644 index 00000000..76f702b7 --- /dev/null +++ b/pkg/scheduler/service/impl/modelarts.go @@ -0,0 +1,44 @@ +package impl + +import ( + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor" + "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" + "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" +) + +type ModelArtsExecutor struct { + Name string + pageIndex int32 + pageSize int32 + ModelArtsRpc modelartsservice.ModelArtsService + ModelArtsImgRpc imagesservice.ImagesService +} + +func NewModelartsExecutor(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string) *ModelArtsExecutor { + return &ModelArtsExecutor{Name: name, ModelArtsRpc: modelArtsRpc, ModelArtsImgRpc: modelArtsImgRpc, pageIndex: 1, pageSize: 100} +} + +func (m ModelArtsExecutor) QueryImageList() ([]executor.Image, error) { + //TODO implement me + panic("implement me") +} + +func (m ModelArtsExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (m ModelArtsExecutor) QueryTask(taskId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (m ModelArtsExecutor) QuerySpecs() (executor.Spec, error) { + //TODO implement me + panic("implement me") +} + +func (a *ModelArtsExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) { + return nil, nil +} diff --git a/pkg/scheduler/service/impl/octopus.go b/pkg/scheduler/service/impl/octopus.go new file mode 100644 index 00000000..a4b6c5b8 --- /dev/null +++ b/pkg/scheduler/service/impl/octopus.go @@ -0,0 +1,42 @@ +package impl + +import ( + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor" + "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopusclient" +) + +type OctopusExecutor struct { + Name string + pageIndex int32 + pageSize int32 + OctopusRpc octopusclient.Octopus +} + +func NewOctopusExecutor(OctopusRpc octopusclient.Octopus, name string) *OctopusExecutor { + return &OctopusExecutor{OctopusRpc: OctopusRpc, Name: name, pageIndex: 1, pageSize: 100} +} + +func (o OctopusExecutor) QueryImageList() ([]executor.Image, error) { + //TODO implement me + panic("implement me") +} + +func (o OctopusExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (o OctopusExecutor) QueryTask(taskId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (o OctopusExecutor) QuerySpecs() (executor.Spec, error) { + //TODO implement me + panic("implement me") +} + +func (a *OctopusExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) { + return nil, nil +} diff --git a/pkg/scheduler/service/impl/shuguangAi.go b/pkg/scheduler/service/impl/shuguangAi.go new file mode 100644 index 00000000..049455c6 --- /dev/null +++ b/pkg/scheduler/service/impl/shuguangAi.go @@ -0,0 +1,45 @@ +package impl + +import ( + "gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector" + "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor" +) + +//单条作业费=作业运行秒数×(CPU核心数*CPU单价+GPU卡数×GPU单价+DCU卡数×DCU单价)/3600 +//CPU单价=队列CPU费率×计算中心CPU单价 +//GPU单价=队列GPU费率×计算中心GPU单价 +//DCU单价=队列DCU费率×计算中心DCU单价 + +type ShuguangAiExecutor struct { + Name string + ACRpc hpcacclient.HpcAC +} + +func NewShuguangAiExecutor(acRpc hpcacclient.HpcAC, name string) *ShuguangAiExecutor { + return &ShuguangAiExecutor{Name: name, ACRpc: acRpc} +} + +func (s ShuguangAiExecutor) QueryImageList() ([]executor.Image, error) { + //TODO implement me + panic("implement me") +} + +func (s ShuguangAiExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (s ShuguangAiExecutor) QueryTask(taskId string) (executor.Task, error) { + //TODO implement me + panic("implement me") +} + +func (s ShuguangAiExecutor) QuerySpecs() (executor.Spec, error) { + //TODO implement me + panic("implement me") +} + +func (a *ShuguangAiExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) { + return nil, nil +} diff --git a/pkg/scheduler/strategy/dynamicResources.go b/pkg/scheduler/strategy/dynamicResources.go new file mode 100644 index 00000000..579333f2 --- /dev/null +++ b/pkg/scheduler/strategy/dynamicResources.go @@ -0,0 +1,4 @@ +package strategy + +type DynamicResourcesStrategy struct { +} diff --git a/pkg/scheduler/strategies/replication.go b/pkg/scheduler/strategy/replication.go similarity index 97% rename from pkg/scheduler/strategies/replication.go rename to pkg/scheduler/strategy/replication.go index 2a699c5a..08d7e29f 100644 --- a/pkg/scheduler/strategies/replication.go +++ b/pkg/scheduler/strategy/replication.go @@ -1,4 +1,4 @@ -package strategies +package strategy import ( "github.com/pkg/errors" diff --git a/pkg/scheduler/strategies/resourcePricing.go b/pkg/scheduler/strategy/resourcePricing.go similarity index 99% rename from pkg/scheduler/strategies/resourcePricing.go rename to pkg/scheduler/strategy/resourcePricing.go index 5c620fa3..f909f62f 100644 --- a/pkg/scheduler/strategies/resourcePricing.go +++ b/pkg/scheduler/strategy/resourcePricing.go @@ -12,7 +12,7 @@ */ -package strategies +package strategy import ( "errors" diff --git a/pkg/scheduler/strategies/staticWeight.go b/pkg/scheduler/strategy/staticWeight.go similarity index 91% rename from pkg/scheduler/strategies/staticWeight.go rename to pkg/scheduler/strategy/staticWeight.go index 8e08d219..3aa5d769 100644 --- a/pkg/scheduler/strategies/staticWeight.go +++ b/pkg/scheduler/strategy/staticWeight.go @@ -1,4 +1,4 @@ -package strategies +package strategy type StaticWeightStrategy struct { // TODO: add fields diff --git a/pkg/scheduler/strategies/strategy.go b/pkg/scheduler/strategy/strategy.go similarity index 90% rename from pkg/scheduler/strategies/strategy.go rename to pkg/scheduler/strategy/strategy.go index e265acdd..1502dc21 100644 --- a/pkg/scheduler/strategies/strategy.go +++ b/pkg/scheduler/strategy/strategy.go @@ -1,4 +1,4 @@ -package strategies +package strategy type Strategy interface { Schedule() ([]*AssignedCluster, error)