Merge pull request 'scheduler module modified' (#12) from tzwang/pcm-coordinator:master into master
Former-commit-id: 7cceae0438dce12298a59c312b1ffd05db791083
This commit is contained in:
commit
bf27a2d75c
|
@ -17,7 +17,9 @@ package mqs
|
|||
import (
|
||||
"context"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
|
||||
scheduler2 "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/schedulers"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service"
|
||||
)
|
||||
|
||||
/*
|
||||
|
@ -25,36 +27,29 @@ import (
|
|||
Listening to the payment flow status change notification message queue
|
||||
*/
|
||||
type AiQueue struct {
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
scheduler *scheduler.Scheduler
|
||||
}
|
||||
|
||||
func NewAiMq(ctx context.Context, svcCtx *svc.ServiceContext) *AiQueue {
|
||||
aiExecutorMap, aiCollectorMap := service.InitAiClusterMap(svcCtx.ACRpc, svcCtx.ModelArtsRpc, svcCtx.ModelArtsImgRpc, svcCtx.OctopusRpc)
|
||||
return &AiQueue{
|
||||
ctx: ctx,
|
||||
svcCtx: svcCtx,
|
||||
ctx: ctx,
|
||||
svcCtx: svcCtx,
|
||||
scheduler: scheduler.NewScheduler2(aiCollectorMap, nil, aiExecutorMap),
|
||||
}
|
||||
}
|
||||
|
||||
func (l *AiQueue) Consume(val string) error {
|
||||
// 接受消息, 根据标签筛选过滤
|
||||
aiSchdl := scheduler2.NewAiScheduler(val)
|
||||
schdl, err := scheduler2.NewScheduler(aiSchdl, val, l.svcCtx.DbEngin, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
schdl.MatchLabels()
|
||||
aiSchdl, _ := schedulers.NewAiScheduler(val, l.scheduler)
|
||||
|
||||
// 调度算法
|
||||
err = schdl.AssignAndSchedule()
|
||||
err := l.scheduler.AssignAndSchedule(aiSchdl)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 存储数据
|
||||
err = schdl.SaveToDb()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"context"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/schedulers"
|
||||
)
|
||||
|
||||
/*
|
||||
|
@ -37,7 +38,7 @@ func NewCloudMq(ctx context.Context, svcCtx *svc.ServiceContext) *CloudMq {
|
|||
|
||||
func (l *CloudMq) Consume(val string) error {
|
||||
// 接受消息, 根据标签筛选过滤
|
||||
cloudScheduler := scheduler.NewCloudScheduler()
|
||||
cloudScheduler := schedulers.NewCloudScheduler()
|
||||
schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
|
@ -17,7 +17,6 @@ package mqs
|
|||
import (
|
||||
"context"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
|
||||
scheduler2 "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler"
|
||||
)
|
||||
|
||||
/*
|
||||
|
@ -37,24 +36,5 @@ func NewHpcMq(ctx context.Context, svcCtx *svc.ServiceContext) *HpcMq {
|
|||
}
|
||||
|
||||
func (l *HpcMq) Consume(val string) error {
|
||||
// 接受消息, 根据标签筛选过滤
|
||||
hpcSchdl := scheduler2.NewHpcScheduler(val)
|
||||
schdl, err := scheduler2.NewScheduler(hpcSchdl, val, l.svcCtx.DbEngin, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
schdl.MatchLabels()
|
||||
|
||||
// 调度算法
|
||||
err = schdl.AssignAndSchedule()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 存储数据
|
||||
err = schdl.SaveToDb()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
|
||||
Copyright (c) [2023] [pcm]
|
||||
[pcm-coordinator] is licensed under Mulan PSL v2.
|
||||
You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
You may obtain a copy of Mulan PSL v2 at:
|
||||
http://license.coscl.org.cn/MulanPSL2
|
||||
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
EITHER EXPaRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
See the Mulan PSL v2 for more details.
|
||||
|
||||
*/
|
||||
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
)
|
||||
|
||||
type AiScheduler struct {
|
||||
yamlString string
|
||||
collector collector.ResourceCollector
|
||||
}
|
||||
|
||||
func NewAiScheduler(val string) *AiScheduler {
|
||||
return &AiScheduler{yamlString: val}
|
||||
}
|
||||
|
||||
func (as *AiScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
ai := models.Ai{
|
||||
ParticipantId: participantId,
|
||||
TaskId: task.TaskId,
|
||||
Status: "Saved",
|
||||
YamlString: as.yamlString,
|
||||
}
|
||||
utils.Convert(task.Metadata, &ai)
|
||||
return ai, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) pickOptimalStrategy() (strategies.Strategy, error) {
|
||||
//a, b := as.genTaskAndProviders()
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) assignTask(clusters []*strategies.AssignedCluster) error {
|
||||
return nil
|
||||
}
|
|
@ -1,13 +0,0 @@
|
|||
package collector
|
||||
|
||||
//单条作业费=作业运行秒数×(CPU核心数*CPU单价+GPU卡数×GPU单价+DCU卡数×DCU单价)/3600
|
||||
//CPU单价=队列CPU费率×计算中心CPU单价
|
||||
//GPU单价=队列GPU费率×计算中心GPU单价
|
||||
//DCU单价=队列DCU费率×计算中心DCU单价
|
||||
|
||||
type ShuguangAiCollector struct {
|
||||
}
|
||||
|
||||
func (a *ShuguangAiCollector) getResourceSpecs() {
|
||||
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
package collector
|
||||
|
||||
type ResourceCollector interface {
|
||||
getResourceSpecs() ([]ResourceSpecs, error)
|
||||
}
|
||||
|
||||
type ResourceSpecs struct {
|
||||
CpuAvail float64
|
||||
MemAvail float64
|
||||
DiskAvail float64
|
||||
GpuAvail float64
|
||||
CardAvail []Card
|
||||
}
|
||||
|
||||
type Card struct {
|
||||
Type string
|
||||
Name string
|
||||
TOpsAtFp16 float64
|
||||
}
|
|
@ -12,23 +12,23 @@
|
|||
|
||||
*/
|
||||
|
||||
package scheduler
|
||||
package common
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy"
|
||||
"math/rand"
|
||||
"time"
|
||||
)
|
||||
|
||||
type scheduleService interface {
|
||||
getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error)
|
||||
pickOptimalStrategy() (strategies.Strategy, error)
|
||||
assignTask(clusters []*strategies.AssignedCluster) error
|
||||
type SubSchedule interface {
|
||||
GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error)
|
||||
PickOptimalStrategy() (strategy.Strategy, error)
|
||||
AssignTask(clusters []*strategy.AssignedCluster) error
|
||||
}
|
||||
|
||||
// 求交集
|
||||
func intersect(slice1, slice2 []int64) []int64 {
|
||||
func Intersect(slice1, slice2 []int64) []int64 {
|
||||
m := make(map[int64]int)
|
||||
nn := make([]int64, 0)
|
||||
for _, v := range slice1 {
|
||||
|
@ -44,7 +44,7 @@ func intersect(slice1, slice2 []int64) []int64 {
|
|||
return nn
|
||||
}
|
||||
|
||||
func micsSlice(origin []int64, count int) []int64 {
|
||||
func MicsSlice(origin []int64, count int) []int64 {
|
||||
tmpOrigin := make([]int64, len(origin))
|
||||
copy(tmpOrigin, origin)
|
||||
//一定要seed
|
|
@ -19,31 +19,42 @@ import (
|
|||
"github.com/pkg/errors"
|
||||
"github.com/zeromicro/go-zero/core/logx"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/common"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/database"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/rpc/client/participantservice"
|
||||
"gorm.io/gorm"
|
||||
"sigs.k8s.io/yaml"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type scheduler struct {
|
||||
task *response.TaskInfo
|
||||
participantIds []int64
|
||||
scheduleService scheduleService
|
||||
dbEngin *gorm.DB
|
||||
result []string //pID:子任务yamlstring 键值对
|
||||
participantRpc participantservice.ParticipantService
|
||||
type Scheduler struct {
|
||||
task *response.TaskInfo
|
||||
participantIds []int64
|
||||
subSchedule common.SubSchedule
|
||||
dbEngin *gorm.DB
|
||||
result []string //pID:子任务yamlstring 键值对
|
||||
participantRpc participantservice.ParticipantService
|
||||
ResourceCollector *map[string]collector.ResourceCollector
|
||||
Storages database.Storage
|
||||
AiExecutor *map[string]executor.Executor
|
||||
}
|
||||
|
||||
func NewScheduler(scheduleService scheduleService, val string, dbEngin *gorm.DB, participantRpc participantservice.ParticipantService) (*scheduler, error) {
|
||||
func NewScheduler(subSchedule common.SubSchedule, val string, dbEngin *gorm.DB, participantRpc participantservice.ParticipantService) (*Scheduler, error) {
|
||||
var task *response.TaskInfo
|
||||
err := json.Unmarshal([]byte(val), &task)
|
||||
if err != nil {
|
||||
return nil, errors.New("create scheduler failed : " + err.Error())
|
||||
}
|
||||
return &scheduler{task: task, scheduleService: scheduleService, dbEngin: dbEngin, participantRpc: participantRpc}, nil
|
||||
return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil
|
||||
}
|
||||
|
||||
func (s *scheduler) SpecifyClusters() {
|
||||
func NewScheduler2(resourceCollector *map[string]collector.ResourceCollector, storages database.Storage, aiExecutor *map[string]executor.Executor) *Scheduler {
|
||||
return &Scheduler{ResourceCollector: resourceCollector, Storages: storages, AiExecutor: aiExecutor}
|
||||
}
|
||||
|
||||
func (s *Scheduler) SpecifyClusters() {
|
||||
// 如果已指定集群名,通过数据库查询后返回p端ip列表
|
||||
if len(s.task.Clusters) != 0 {
|
||||
s.dbEngin.Raw("select id from sc_participant_phy_info where `name` in (?)", s.task.Clusters).Scan(&s.participantIds)
|
||||
|
@ -51,7 +62,7 @@ func (s *scheduler) SpecifyClusters() {
|
|||
}
|
||||
}
|
||||
|
||||
func (s *scheduler) SpecifyNsID() {
|
||||
func (s *Scheduler) SpecifyNsID() {
|
||||
// 未指定集群名,只指定nsID
|
||||
if len(s.task.Clusters) == 0 {
|
||||
if len(s.task.NsID) != 0 {
|
||||
|
@ -67,7 +78,7 @@ func (s *scheduler) SpecifyNsID() {
|
|||
}
|
||||
}
|
||||
|
||||
func (s *scheduler) MatchLabels() {
|
||||
func (s *Scheduler) MatchLabels() {
|
||||
|
||||
var ids []int64
|
||||
count := 0
|
||||
|
@ -80,7 +91,7 @@ func (s *scheduler) MatchLabels() {
|
|||
if count == 0 {
|
||||
ids = participantIds
|
||||
}
|
||||
ids = intersect(ids, participantIds)
|
||||
ids = common.Intersect(ids, participantIds)
|
||||
count++
|
||||
}
|
||||
s.participantIds = ids
|
||||
|
@ -90,7 +101,7 @@ func (s *scheduler) MatchLabels() {
|
|||
}
|
||||
|
||||
// TempAssign todo 屏蔽原调度算法
|
||||
func (s *scheduler) TempAssign() error {
|
||||
func (s *Scheduler) TempAssign() error {
|
||||
|
||||
//需要判断task中的资源类型,针对metadata中的多个kind做不同处理
|
||||
//输入副本数和集群列表,最终结果输出为pID对应副本数量列表,针对多个kind需要做拆分和重新拼接组合
|
||||
|
@ -110,28 +121,28 @@ func (s *scheduler) TempAssign() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (s *scheduler) AssignAndSchedule() error {
|
||||
// 已指定 ParticipantId
|
||||
if s.task.ParticipantId != 0 {
|
||||
return nil
|
||||
}
|
||||
// 标签匹配以及后,未找到ParticipantIds
|
||||
if len(s.participantIds) == 0 {
|
||||
return errors.New("未找到匹配的ParticipantIds")
|
||||
}
|
||||
func (s *Scheduler) AssignAndSchedule(ss common.SubSchedule) error {
|
||||
//// 已指定 ParticipantId
|
||||
//if s.task.ParticipantId != 0 {
|
||||
// return nil
|
||||
//}
|
||||
//// 标签匹配以及后,未找到ParticipantIds
|
||||
//if len(s.participantIds) == 0 {
|
||||
// return errors.New("未找到匹配的ParticipantIds")
|
||||
//}
|
||||
//
|
||||
//// 指定或者标签匹配的结果只有一个集群,给任务信息指定
|
||||
//if len(s.participantIds) == 1 {
|
||||
// s.task.ParticipantId = s.participantIds[0]
|
||||
// //replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
|
||||
// //result := make(map[int64]string)
|
||||
// //result[s.participantIds[0]] = strconv.FormatFloat(replicas, 'f', 2, 64)
|
||||
// //s.result = result
|
||||
//
|
||||
// return nil
|
||||
//}
|
||||
|
||||
// 指定或者标签匹配的结果只有一个集群,给任务信息指定
|
||||
if len(s.participantIds) == 1 {
|
||||
s.task.ParticipantId = s.participantIds[0]
|
||||
//replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
|
||||
//result := make(map[int64]string)
|
||||
//result[s.participantIds[0]] = strconv.FormatFloat(replicas, 'f', 2, 64)
|
||||
//s.result = result
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
strategy, err := s.scheduleService.pickOptimalStrategy()
|
||||
strategy, err := ss.PickOptimalStrategy()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -147,7 +158,7 @@ func (s *scheduler) AssignAndSchedule() error {
|
|||
// return nil
|
||||
//}
|
||||
|
||||
err = s.scheduleService.assignTask(clusters)
|
||||
err = ss.AssignTask(clusters)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -155,12 +166,12 @@ func (s *scheduler) AssignAndSchedule() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (s *scheduler) SaveToDb() error {
|
||||
func (s *Scheduler) SaveToDb() error {
|
||||
|
||||
for _, participantId := range s.participantIds {
|
||||
|
||||
for _, resource := range s.task.Metadata {
|
||||
structForDb, err := s.scheduleService.getNewStructForDb(s.task, resource, participantId)
|
||||
structForDb, err := s.subSchedule.GetNewStructForDb(s.task, resource, participantId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -0,0 +1,102 @@
|
|||
/*
|
||||
|
||||
Copyright (c) [2023] [pcm]
|
||||
[pcm-coordinator] is licensed under Mulan PSL v2.
|
||||
You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||||
You may obtain a copy of Mulan PSL v2 at:
|
||||
http://license.coscl.org.cn/MulanPSL2
|
||||
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||||
EITHER EXPaRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||||
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||||
See the Mulan PSL v2 for more details.
|
||||
|
||||
*/
|
||||
|
||||
package schedulers
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/entity"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
)
|
||||
|
||||
type AiScheduler struct {
|
||||
yamlString string
|
||||
task *response.TaskInfo
|
||||
*scheduler.Scheduler
|
||||
}
|
||||
|
||||
func NewAiScheduler(val string, scheduler *scheduler.Scheduler) (*AiScheduler, error) {
|
||||
return &AiScheduler{yamlString: val, Scheduler: scheduler}, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
ai := models.Ai{
|
||||
ParticipantId: participantId,
|
||||
TaskId: task.TaskId,
|
||||
Status: "Saved",
|
||||
YamlString: as.yamlString,
|
||||
}
|
||||
utils.Convert(task.Metadata, &ai)
|
||||
return ai, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||
resources, err := as.findProvidersWithResource()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(resources) < 2 /*|| as.task */ {
|
||||
var pros []entity.Participant
|
||||
for _, resource := range resources {
|
||||
pros = append(pros, entity.Participant{
|
||||
Participant_id: resource.ParticipantId,
|
||||
Name: resource.Name,
|
||||
})
|
||||
}
|
||||
strategy := strategy.NewReplicationStrategy(nil, 0)
|
||||
return strategy, nil
|
||||
}
|
||||
|
||||
task, providerList := as.genTaskAndProviders()
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
strategy := strategy.NewPricingStrategy(task, providerList...)
|
||||
return strategy, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider) {
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
|
||||
if clusters == nil {
|
||||
return errors.New("clusters is nil")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (as *AiScheduler) findProvidersWithResource() ([]*collector.ResourceSpecs, error) {
|
||||
var resourceSpecs []*collector.ResourceSpecs
|
||||
for _, resourceCollector := range *as.ResourceCollector {
|
||||
spec, err := resourceCollector.GetResourceSpecs()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
resourceSpecs = append(resourceSpecs, spec)
|
||||
}
|
||||
if len(resourceSpecs) == 0 {
|
||||
return nil, errors.New("no resource found")
|
||||
}
|
||||
return resourceSpecs, nil
|
||||
}
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
*/
|
||||
|
||||
package scheduler
|
||||
package schedulers
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
@ -20,7 +20,7 @@ import (
|
|||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/database"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
"io"
|
||||
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
|
||||
|
@ -37,17 +37,17 @@ func NewCloudScheduler() *CloudScheduler {
|
|||
return &CloudScheduler{}
|
||||
}
|
||||
|
||||
func (cs *CloudScheduler) pickOptimalStrategy() (strategies.Strategy, error) {
|
||||
func (cs *CloudScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||
task, providerList, err := cs.genTaskAndProviders()
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
//调度算法
|
||||
strategy := strategies.NewPricingStrategy(task, providerList...)
|
||||
strategy := strategy.NewPricingStrategy(task, providerList...)
|
||||
return strategy, nil
|
||||
}
|
||||
|
||||
func (cs *CloudScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
func (cs *CloudScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
cloud := cs.UnMarshalK8sStruct(resource, task.TaskId, task.NsID)
|
||||
cloud.Id = utils.GenSnowflakeID()
|
||||
cloud.NsID = task.NsID
|
||||
|
@ -117,6 +117,6 @@ func (cs *CloudScheduler) genTaskAndProviders() (*providerPricing.Task, []*provi
|
|||
return nil, providerList, nil
|
||||
}
|
||||
|
||||
func (cs *CloudScheduler) assignTask(clusters []*strategies.AssignedCluster) error {
|
||||
func (cs *CloudScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
|
||||
return nil
|
||||
}
|
|
@ -12,14 +12,14 @@
|
|||
|
||||
*/
|
||||
|
||||
package scheduler
|
||||
package schedulers
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/constants"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algorithm/providerPricing"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategies"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||
)
|
||||
|
||||
|
@ -31,7 +31,7 @@ func NewHpcScheduler(val string) *HpcScheduler {
|
|||
return &HpcScheduler{yamlString: val}
|
||||
}
|
||||
|
||||
func (h *HpcScheduler) getNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
func (h *HpcScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
hpc := models.Hpc{}
|
||||
utils.Convert(task.Metadata, &hpc)
|
||||
hpc.Id = utils.GenSnowflakeID()
|
||||
|
@ -42,7 +42,7 @@ func (h *HpcScheduler) getNewStructForDb(task *response.TaskInfo, resource strin
|
|||
return hpc, nil
|
||||
}
|
||||
|
||||
func (h *HpcScheduler) pickOptimalStrategy() (strategies.Strategy, error) {
|
||||
func (h *HpcScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,6 @@ func (h *HpcScheduler) genTaskAndProviders(task *response.TaskInfo) (*providerPr
|
|||
return nil, nil
|
||||
}
|
||||
|
||||
func (h *HpcScheduler) assignTask(clusters []*strategies.AssignedCluster) error {
|
||||
func (h *HpcScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package schedulers
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/strategy"
|
||||
)
|
||||
|
||||
type VmScheduler struct {
|
||||
}
|
||||
|
||||
func (v VmScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (v VmScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (v VmScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package service
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/impl"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopusclient"
|
||||
)
|
||||
|
||||
const (
|
||||
OCTOPUS = "Octopus"
|
||||
MODELARTS = "Modelarts"
|
||||
SHUGUANGAI = "ShuguangAi"
|
||||
)
|
||||
|
||||
var (
|
||||
AiTypeMap = map[string]string{
|
||||
"Hanwuji": OCTOPUS,
|
||||
"Suiyan": OCTOPUS,
|
||||
"Sailingsi": OCTOPUS,
|
||||
"modelarts-CloudBrain2": MODELARTS,
|
||||
"ShuguangAi": SHUGUANGAI,
|
||||
}
|
||||
)
|
||||
|
||||
func InitAiClusterMap(ACRpc hpcacclient.HpcAC, ModelArtsRpc modelartsservice.ModelArtsService, ModelArtsImgRpc imagesservice.ImagesService, OctopusRpc octopusclient.Octopus) (*map[string]executor.Executor, *map[string]collector.ResourceCollector) {
|
||||
executorMap := make(map[string]executor.Executor)
|
||||
collectorMap := make(map[string]collector.ResourceCollector)
|
||||
for k, v := range AiTypeMap {
|
||||
switch v {
|
||||
case OCTOPUS:
|
||||
octopus := impl.NewOctopusExecutor(OctopusRpc, k)
|
||||
collectorMap[k] = octopus
|
||||
executorMap[k] = octopus
|
||||
case MODELARTS:
|
||||
modelarts := impl.NewModelartsExecutor(ModelArtsRpc, ModelArtsImgRpc, k)
|
||||
collectorMap[k] = modelarts
|
||||
executorMap[k] = modelarts
|
||||
case SHUGUANGAI:
|
||||
sgai := impl.NewShuguangAiExecutor(ACRpc, k)
|
||||
collectorMap[k] = sgai
|
||||
executorMap[k] = sgai
|
||||
}
|
||||
}
|
||||
return &executorMap, &collectorMap
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
package collector
|
||||
|
||||
type ResourceCollector interface {
|
||||
GetResourceSpecs() (*ResourceSpecs, error)
|
||||
}
|
||||
|
||||
type ResourceSpecs struct {
|
||||
ParticipantId int64
|
||||
Name string
|
||||
CpuAvail float64
|
||||
MemAvail float64
|
||||
DiskAvail float64
|
||||
GpuAvail float64
|
||||
CardAvail []Card
|
||||
Balance float64
|
||||
}
|
||||
|
||||
type Card struct {
|
||||
Type string
|
||||
Name string
|
||||
TOpsAtFp16 float64
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
package executor
|
||||
|
||||
type Executor interface {
|
||||
QueryImageList() ([]Image, error)
|
||||
SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (Task, error)
|
||||
QueryTask(taskId string) (Task, error)
|
||||
QuerySpecs() (Spec, error)
|
||||
}
|
||||
|
||||
type Image struct {
|
||||
}
|
||||
|
||||
type Task struct {
|
||||
}
|
||||
|
||||
type Spec struct {
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package impl
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice"
|
||||
)
|
||||
|
||||
type ModelArtsExecutor struct {
|
||||
Name string
|
||||
pageIndex int32
|
||||
pageSize int32
|
||||
ModelArtsRpc modelartsservice.ModelArtsService
|
||||
ModelArtsImgRpc imagesservice.ImagesService
|
||||
}
|
||||
|
||||
func NewModelartsExecutor(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string) *ModelArtsExecutor {
|
||||
return &ModelArtsExecutor{Name: name, ModelArtsRpc: modelArtsRpc, ModelArtsImgRpc: modelArtsImgRpc, pageIndex: 1, pageSize: 100}
|
||||
}
|
||||
|
||||
func (m ModelArtsExecutor) QueryImageList() ([]executor.Image, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (m ModelArtsExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (m ModelArtsExecutor) QueryTask(taskId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (m ModelArtsExecutor) QuerySpecs() (executor.Spec, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (a *ModelArtsExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) {
|
||||
return nil, nil
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
package impl
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopusclient"
|
||||
)
|
||||
|
||||
type OctopusExecutor struct {
|
||||
Name string
|
||||
pageIndex int32
|
||||
pageSize int32
|
||||
OctopusRpc octopusclient.Octopus
|
||||
}
|
||||
|
||||
func NewOctopusExecutor(OctopusRpc octopusclient.Octopus, name string) *OctopusExecutor {
|
||||
return &OctopusExecutor{OctopusRpc: OctopusRpc, Name: name, pageIndex: 1, pageSize: 100}
|
||||
}
|
||||
|
||||
func (o OctopusExecutor) QueryImageList() ([]executor.Image, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (o OctopusExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (o OctopusExecutor) QueryTask(taskId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (o OctopusExecutor) QuerySpecs() (executor.Spec, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (a *OctopusExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) {
|
||||
return nil, nil
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package impl
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/collector"
|
||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/service/executor"
|
||||
)
|
||||
|
||||
//单条作业费=作业运行秒数×(CPU核心数*CPU单价+GPU卡数×GPU单价+DCU卡数×DCU单价)/3600
|
||||
//CPU单价=队列CPU费率×计算中心CPU单价
|
||||
//GPU单价=队列GPU费率×计算中心GPU单价
|
||||
//DCU单价=队列DCU费率×计算中心DCU单价
|
||||
|
||||
type ShuguangAiExecutor struct {
|
||||
Name string
|
||||
ACRpc hpcacclient.HpcAC
|
||||
}
|
||||
|
||||
func NewShuguangAiExecutor(acRpc hpcacclient.HpcAC, name string) *ShuguangAiExecutor {
|
||||
return &ShuguangAiExecutor{Name: name, ACRpc: acRpc}
|
||||
}
|
||||
|
||||
func (s ShuguangAiExecutor) QueryImageList() ([]executor.Image, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (s ShuguangAiExecutor) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (s ShuguangAiExecutor) QueryTask(taskId string) (executor.Task, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (s ShuguangAiExecutor) QuerySpecs() (executor.Spec, error) {
|
||||
//TODO implement me
|
||||
panic("implement me")
|
||||
}
|
||||
|
||||
func (a *ShuguangAiExecutor) GetResourceSpecs() (*collector.ResourceSpecs, error) {
|
||||
return nil, nil
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
package strategy
|
||||
|
||||
type DynamicResourcesStrategy struct {
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package strategies
|
||||
package strategy
|
||||
|
||||
import (
|
||||
"github.com/pkg/errors"
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
*/
|
||||
|
||||
package strategies
|
||||
package strategy
|
||||
|
||||
import (
|
||||
"errors"
|
|
@ -1,4 +1,4 @@
|
|||
package strategies
|
||||
package strategy
|
||||
|
||||
type StaticWeightStrategy struct {
|
||||
// TODO: add fields
|
|
@ -1,4 +1,4 @@
|
|||
package strategies
|
||||
package strategy
|
||||
|
||||
type Strategy interface {
|
||||
Schedule() ([]*AssignedCluster, error)
|
Loading…
Reference in New Issue