调度结构修改
Former-commit-id: a45bd574fefd8825b89020bc28a86461ee8a503a
This commit is contained in:
parent
2e97f1b481
commit
e0b48ecb39
|
@ -4,10 +4,13 @@ NacosConfig:
|
||||||
ServerConfigs:
|
ServerConfigs:
|
||||||
# - IpAddr: 127.0.0.1
|
# - IpAddr: 127.0.0.1
|
||||||
# Port: 8848
|
# Port: 8848
|
||||||
- IpAddr: nacos.jcce.dev
|
# - IpAddr: 10.101.15.7
|
||||||
|
# Port: 8848
|
||||||
|
- IpAddr: 119.45.100.73
|
||||||
Port: 8848
|
Port: 8848
|
||||||
ClientConfig:
|
ClientConfig:
|
||||||
NamespaceId: test
|
NamespaceId: tzwang
|
||||||
|
# NamespaceId: test
|
||||||
TimeoutMs: 5000
|
TimeoutMs: 5000
|
||||||
NotLoadCacheAtStart: true
|
NotLoadCacheAtStart: true
|
||||||
LogDir:
|
LogDir:
|
||||||
|
|
|
@ -38,6 +38,7 @@ type Config struct {
|
||||||
CephRpcConf zrpc.RpcClientConf
|
CephRpcConf zrpc.RpcClientConf
|
||||||
OpenstackRpcConf zrpc.RpcClientConf
|
OpenstackRpcConf zrpc.RpcClientConf
|
||||||
OctopusRpcConf zrpc.RpcClientConf
|
OctopusRpcConf zrpc.RpcClientConf
|
||||||
|
PcmCoreRpcConf zrpc.RpcClientConf
|
||||||
NexusUrl string
|
NexusUrl string
|
||||||
JccScheduleUrl string
|
JccScheduleUrl string
|
||||||
MinioConf struct {
|
MinioConf struct {
|
||||||
|
|
|
@ -39,7 +39,7 @@ func NewAiMq(ctx context.Context, svcCtx *svc.ServiceContext) *AiQueue {
|
||||||
func (l *AiQueue) Consume(val string) error {
|
func (l *AiQueue) Consume(val string) error {
|
||||||
// 接受消息, 根据标签筛选过滤
|
// 接受消息, 根据标签筛选过滤
|
||||||
aiSchdl := scheduler2.NewAiScheduler(val)
|
aiSchdl := scheduler2.NewAiScheduler(val)
|
||||||
schdl, err := scheduler2.NewScheduler(aiSchdl, val, l.svcCtx.DbEngin)
|
schdl, err := scheduler2.NewScheduler(aiSchdl, val, l.svcCtx.DbEngin, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ func NewCloudMq(ctx context.Context, svcCtx *svc.ServiceContext) *CloudMq {
|
||||||
func (l *CloudMq) Consume(val string) error {
|
func (l *CloudMq) Consume(val string) error {
|
||||||
// 接受消息, 根据标签筛选过滤
|
// 接受消息, 根据标签筛选过滤
|
||||||
cloudScheduler := scheduler.NewCloudScheduler()
|
cloudScheduler := scheduler.NewCloudScheduler()
|
||||||
schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin)
|
schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ func NewHpcMq(ctx context.Context, svcCtx *svc.ServiceContext) *HpcMq {
|
||||||
func (l *HpcMq) Consume(val string) error {
|
func (l *HpcMq) Consume(val string) error {
|
||||||
// 接受消息, 根据标签筛选过滤
|
// 接受消息, 根据标签筛选过滤
|
||||||
hpcSchdl := scheduler2.NewHpcScheduler(val)
|
hpcSchdl := scheduler2.NewHpcScheduler(val)
|
||||||
schdl, err := scheduler2.NewScheduler(hpcSchdl, val, l.svcCtx.DbEngin)
|
schdl, err := scheduler2.NewScheduler(hpcSchdl, val, l.svcCtx.DbEngin, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,6 +27,7 @@ import (
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient"
|
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/config"
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/config"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
|
||||||
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/rpc/client/participantservice"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-participant-ceph/cephclient"
|
"gitlink.org.cn/jcce-pcm/pcm-participant-ceph/cephclient"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes/kubernetesclient"
|
"gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes/kubernetesclient"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice"
|
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice"
|
||||||
|
@ -55,6 +56,7 @@ type ServiceContext struct {
|
||||||
Downloader *s3manager.Downloader
|
Downloader *s3manager.Downloader
|
||||||
Uploader *s3manager.Uploader
|
Uploader *s3manager.Uploader
|
||||||
K8sRpc map[int64]kubernetesclient.Kubernetes
|
K8sRpc map[int64]kubernetesclient.Kubernetes
|
||||||
|
ParticipantRpc participantservice.ParticipantService
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewServiceContext(c config.Config) *ServiceContext {
|
func NewServiceContext(c config.Config) *ServiceContext {
|
||||||
|
@ -101,6 +103,7 @@ func NewServiceContext(c config.Config) *ServiceContext {
|
||||||
OctopusRpc: octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)),
|
OctopusRpc: octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)),
|
||||||
OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)),
|
OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)),
|
||||||
K8sRpc: make(map[int64]kubernetesclient.Kubernetes),
|
K8sRpc: make(map[int64]kubernetesclient.Kubernetes),
|
||||||
|
ParticipantRpc: participantservice.NewParticipantService(zrpc.MustNewClient(c.PcmCoreRpcConf)),
|
||||||
DockerClient: dockerClient,
|
DockerClient: dockerClient,
|
||||||
Downloader: downloader,
|
Downloader: downloader,
|
||||||
Uploader: uploader,
|
Uploader: uploader,
|
||||||
|
|
|
@ -41,7 +41,7 @@ func (as *aiScheduler) getNewStructForDb(task *response.TaskInfo, participantId
|
||||||
return ai, nil
|
return ai, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (as *aiScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Task, error) {
|
func (as *aiScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ func NewCloudScheduler() *cloudScheduler {
|
||||||
return &cloudScheduler{}
|
return &cloudScheduler{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cs *cloudScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Task, error) {
|
func (cs *cloudScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error) {
|
||||||
//参数为空返回 nil
|
//参数为空返回 nil
|
||||||
if len(providers) == 0 || task == nil {
|
if len(providers) == 0 || task == nil {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
|
@ -48,7 +48,7 @@ func (cs *cloudScheduler) pickOptimalStrategy(task *algo.Task, providers ...*alg
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return taskResult, nil
|
return taskResult.MaxscoreStrategy, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cs *cloudScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error) {
|
func (cs *cloudScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error) {
|
||||||
|
|
|
@ -24,7 +24,7 @@ import (
|
||||||
|
|
||||||
type scheduleService interface {
|
type scheduleService interface {
|
||||||
getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error)
|
getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error)
|
||||||
pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Task, error)
|
pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error)
|
||||||
genTaskAndProviders(task *response.TaskInfo, dbEngin *gorm.DB) (*algo.Task, []*algo.Provider)
|
genTaskAndProviders(task *response.TaskInfo, dbEngin *gorm.DB) (*algo.Task, []*algo.Provider)
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,7 +42,7 @@ func (h *hpcScheduler) getNewStructForDb(task *response.TaskInfo, participantId
|
||||||
return hpc, nil
|
return hpc, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *hpcScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Task, error) {
|
func (h *hpcScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,28 +15,36 @@
|
||||||
package scheduler
|
package scheduler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
"github.com/zeromicro/go-zero/core/logx"
|
"github.com/zeromicro/go-zero/core/logx"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/pkg/response"
|
||||||
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algo"
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/scheduler/algo"
|
||||||
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/rpc/client/participantservice"
|
||||||
"gorm.io/gorm"
|
"gorm.io/gorm"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type Replicas int64
|
||||||
|
|
||||||
|
type ParticipantId int64
|
||||||
|
|
||||||
type scheduler struct {
|
type scheduler struct {
|
||||||
task *response.TaskInfo
|
task *response.TaskInfo
|
||||||
participantIds []int64
|
participantIds []int64
|
||||||
scheduleService scheduleService
|
scheduleService scheduleService
|
||||||
dbEngin *gorm.DB
|
dbEngin *gorm.DB
|
||||||
|
result map[ParticipantId]Replicas
|
||||||
|
participantRpc participantservice.ParticipantService
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScheduler(scheduleService scheduleService, val string, dbEngin *gorm.DB) (*scheduler, error) {
|
func NewScheduler(scheduleService scheduleService, val string, dbEngin *gorm.DB, participantRpc participantservice.ParticipantService) (*scheduler, error) {
|
||||||
var task *response.TaskInfo
|
var task *response.TaskInfo
|
||||||
err := json.Unmarshal([]byte(val), &task)
|
err := json.Unmarshal([]byte(val), &task)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.New("create scheduler failed : " + err.Error())
|
return nil, errors.New("create scheduler failed : " + err.Error())
|
||||||
}
|
}
|
||||||
return &scheduler{task: task, scheduleService: scheduleService, dbEngin: dbEngin}, nil
|
return &scheduler{task: task, scheduleService: scheduleService, dbEngin: dbEngin, participantRpc: participantRpc}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *scheduler) MatchLabels() {
|
func (s *scheduler) MatchLabels() {
|
||||||
|
@ -92,19 +100,10 @@ func (s *scheduler) AssignAndSchedule() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//调度结果 ParticipantId
|
//调度结果
|
||||||
for i, e := range strategy.ResourcePerTask {
|
err = s.assignReplicasToResult(strategy, providerList)
|
||||||
|
if err != nil {
|
||||||
if len(e) != 0 {
|
return err
|
||||||
for _, ej := range e {
|
|
||||||
if ej == 1 {
|
|
||||||
s.task.ParticipantId = providerList[i].Pid
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
@ -130,3 +129,65 @@ func (s *scheduler) SaveToDb() error {
|
||||||
func (s *scheduler) obtainParamsforStrategy() (*algo.Task, []*algo.Provider) {
|
func (s *scheduler) obtainParamsforStrategy() (*algo.Task, []*algo.Provider) {
|
||||||
return s.scheduleService.genTaskAndProviders(s.task, s.dbEngin)
|
return s.scheduleService.genTaskAndProviders(s.task, s.dbEngin)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *scheduler) checkAvailableParticipants() error {
|
||||||
|
resp, err := s.participantRpc.ListParticipant(context.Background(), nil)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if resp.Code != 200 {
|
||||||
|
return errors.New("集群列表查询失败")
|
||||||
|
}
|
||||||
|
|
||||||
|
var workingIds []int64
|
||||||
|
for _, e := range resp.Data {
|
||||||
|
if e.ClientState == "UNKNOWN" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
workingIds = append(workingIds, e.ParticipantId)
|
||||||
|
}
|
||||||
|
|
||||||
|
for id, _ := range s.result {
|
||||||
|
if !contains(workingIds, int64(id)) {
|
||||||
|
return errors.Errorf("集群 %d 不可用", id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(s.result) == 0 {
|
||||||
|
return errors.New("可用集群为空")
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *scheduler) assignReplicasToResult(strategy *algo.Strategy, providerList []*algo.Provider) error {
|
||||||
|
|
||||||
|
if len(strategy.Tasksolution) == 0 {
|
||||||
|
return errors.New("调度失败, 未能获取调度结果")
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, e := range strategy.Tasksolution {
|
||||||
|
if e == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s.result[ParticipantId(providerList[i].Pid)] = Replicas(e)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 查询集群是否可用
|
||||||
|
err := s.checkAvailableParticipants()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func contains(s []int64, e int64) bool {
|
||||||
|
for _, a := range s {
|
||||||
|
if a == e {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue