调度部分调整

Former-commit-id: 62ca70a4a1aaed22d3badbfefac73a7033a5a545
This commit is contained in:
zhouqunjie 2023-11-24 14:29:59 +08:00
parent 27cb0d6fbe
commit 81f1a1ce1b
10 changed files with 74 additions and 134 deletions

View File

@ -28,11 +28,11 @@ func NewRegisterClusterLogic(ctx context.Context, svcCtx *svc.ServiceContext) *R
}
func (l *RegisterClusterLogic) RegisterCluster(req *types.RegisterClusterReq) (*types.CloudResp, error) {
var ms []models.ScParticipantPhyInfo
var phyInfos []models.ScParticipantPhyInfo
var resp types.CloudResp
l.svcCtx.DbEngin.Raw("select * from sc_participant_phy_info where `name` = ?", req.Name).Scan(&ms)
if len(ms) != 0 {
l.svcCtx.DbEngin.Raw("select * from sc_participant_phy_info where `name` = ?", req.Name).Scan(&phyInfos)
if len(phyInfos) != 0 {
resp.Code = "400"
resp.Msg = "cluster name already exist"
resp.Data = ""
@ -40,7 +40,6 @@ func (l *RegisterClusterLogic) RegisterCluster(req *types.RegisterClusterReq) (*
}
participant := models.ScParticipantPhyInfo{}
participant.Token = req.Token
participant.Name = req.Name
participant.Address = req.Address
@ -50,10 +49,21 @@ func (l *RegisterClusterLogic) RegisterCluster(req *types.RegisterClusterReq) (*
participant.CreatedTime = time.Now()
participant.UpdatedTime = time.Now()
labelInfo := models.ScParticipantLabelInfo{}
labelInfo.Id = utils.GenSnowflakeID()
labelInfo.ParticipantId = participant.Id
labelInfo.CreatedTime = time.Now()
labelInfo.Key = "cloud"
labelInfo.Value = "sealos"
tx := l.svcCtx.DbEngin.Create(&participant)
if tx.Error != nil {
return nil, tx.Error
}
tx2 := l.svcCtx.DbEngin.Create(&labelInfo)
if tx2.Error != nil {
return nil, tx.Error
}
resp.Code = string(200)
resp.Msg = "success"
resp.Data = "participantId:" + strconv.FormatInt(participant.Id, 10)

View File

@ -43,6 +43,8 @@ func (l *CloudMq) Consume(val string) error {
if err != nil {
return err
}
//通过标签匹配筛选出集群范围
schdl.MatchLabels()
// 调度算法

View File

@ -3340,7 +3340,6 @@ type RegisterClusterReq struct {
Name string `form:"name"` // 名称
Address string `form:"address"` // 地址
Token string `form:"token"` // 数算集群token
Type string `form:"type"` // 参与者类型:CLOUD-数算集群;AI-智算集群HPC-超算集群
MetricsUrl string `form:"metricsUrl"` //监控url
}
@ -3348,22 +3347,6 @@ type DeleteClusterReq struct {
Name string `form:"name"` // 名称
}
// ListParticipantResp is a response envelope carrying a status code, a
// message, and a list of ParticipantResp entries.
type ListParticipantResp struct {
Code string `json:"code"`
Msg string `json:"msg"`
Data []ParticipantResp `json:"data"`
}
// ParticipantResp is the JSON representation of a single participant.
//
// ParticipantId is serialized as "participantId". It previously reused the
// "name" tag already taken by Name; encoding/json resolves such a conflict by
// silently dropping both fields, so neither value reached the wire.
type ParticipantResp struct {
	Id            int64  `json:"id"`            // id
	Name          string `json:"name"`          // name
	Address       string `json:"address"`       // address
	Token         string `json:"token"`         // token of the cloud-computing cluster
	Type          string `json:"type"`          // participant type: CLOUD (cloud-computing cluster), AI (AI-computing cluster), HPC (supercomputing cluster)
	ParticipantId int64  `json:"participantId"` // participant id
	MetricsUrl    string `json:"metricsUrl"`    // monitoring URL
}
type CloudResp struct {
Code string `json:"code"`
Msg string `json:"msg"`

View File

@ -18,10 +18,11 @@ import "fmt"
// TaskInfo describes a task as decoded from JSON (see the json tags below).
// NOTE(review): NsID and ParticipantId each appear twice — this looks like the
// before/after lines of a diff shown together; confirm which line applies.
type TaskInfo struct {
TaskId int64 `json:"taskId,optional"`
NsID string `json:"nsID"`
NsID string `json:"nsID"` // namespace passed in by the cloud-federation (Yunji) platform
TaskType string `json:"taskType,optional"`
MatchLabels map[string]string `json:"matchLabels"`
ParticipantId int64 `json:"participantId"`
ParticipantId int64 `json:"participantId,optional"` // field reserved for Xiangjiang
Clusters []string `json:"clusters,optional"` // list of cluster names passed in by the cloud-federation (Yunji) platform
TenantId int64 `json:"tenantId"`
Metadata interface{} `json:"metadata"`
}

View File

@ -30,7 +30,7 @@ func NewAiScheduler(val string) *aiScheduler {
return &aiScheduler{yamlString: val}
}
func (as *aiScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error) {
func (as *aiScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64, replica int32) (interface{}, error) {
ai := models.Ai{
ParticipantId: participantId,
TaskId: task.TaskId,

View File

@ -193,8 +193,13 @@ func computeHighDegree(task *Task, resourcesolution []int, providerList []*Provi
magnitude2 := mat.Norm(nowLeft, 2)
// 计算余弦相似度
cosine_similarity := dot_product / (magnitude1 * magnitude2)
highDegreeSum += cosine_similarity
//临时处理被除数为0的特殊情况
var cosineSimilarity = 0.0
if magnitude1 != 0 && magnitude2 != 0 {
cosineSimilarity = dot_product / (magnitude1 * magnitude2)
}
highDegreeSum += cosineSimilarity
}
return highDegreeSum / float64(len(providerList))

View File

@ -37,15 +37,6 @@ func NewCloudScheduler() *cloudScheduler {
}
func (cs *cloudScheduler) pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error) {
////参数为空,返回 nil
//if len(providers) == 0 || task == nil {
// return nil, errors.New("算法获取参数为空")
//}
//
////仅有一个provider返回nil
//if len(providers) == 1 {
// return nil, nil
//}
//调度算法
strategy := algo.NewK8sStrategy(task, providers...)
@ -56,8 +47,9 @@ func (cs *cloudScheduler) pickOptimalStrategy(task *algo.Task, providers ...*alg
return taskResult.MaxscoreStrategy, nil
}
func (cs *cloudScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error) {
func (cs *cloudScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64, replica int32) (interface{}, error) {
bytes, err := json.Marshal(task.Metadata)
//replica 需要替换到yaml中
if err != nil {
return nil, err
}
@ -118,7 +110,8 @@ func (cs *cloudScheduler) genTaskAndProviders(task *response.TaskInfo, dbEngin *
providerList = append(providerList, provider)
}
t := algo.NewTask(0, 1, 2, 75120000, 301214500, 1200, 2, 6, 2000)
replicas := task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
t := algo.NewTask(0, int(replicas), 2, 75120000, 301214500, 1200, 2, 6, 2000)
return t, providerList
}

View File

@ -23,7 +23,7 @@ import (
)
// scheduleService is the set of operations a concrete scheduler provides.
// NOTE(review): getNewStructForDb is listed twice (with and without the
// replica parameter) — this looks like the before/after lines of a diff shown
// together; confirm which signature applies.
type scheduleService interface {
// getNewStructForDb returns the value to store for the task on the given participant.
getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error)
getNewStructForDb(task *response.TaskInfo, participantId int64, replica int32) (interface{}, error)
// pickOptimalStrategy returns a strategy for the task over the given providers.
pickOptimalStrategy(task *algo.Task, providers ...*algo.Provider) (*algo.Strategy, error)
// genTaskAndProviders returns the algorithm task and the provider list for the task info.
genTaskAndProviders(task *response.TaskInfo, dbEngin *gorm.DB) (*algo.Task, []*algo.Provider)
}

View File

@ -31,7 +31,7 @@ func NewHpcScheduler(val string) *hpcScheduler {
return &hpcScheduler{yamlString: val}
}
func (h *hpcScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64) (interface{}, error) {
func (h *hpcScheduler) getNewStructForDb(task *response.TaskInfo, participantId int64, replica int32) (interface{}, error) {
hpc := models.Hpc{}
utils.Convert(task.Metadata, &hpc)
hpc.Id = utils.GenSnowflakeID()

View File

@ -15,7 +15,6 @@
package scheduler
import (
"context"
"encoding/json"
"github.com/pkg/errors"
"github.com/zeromicro/go-zero/core/logx"
@ -47,27 +46,42 @@ func NewScheduler(scheduleService scheduleService, val string, dbEngin *gorm.DB,
return &scheduler{task: task, scheduleService: scheduleService, dbEngin: dbEngin, participantRpc: participantRpc, result: make(map[ParticipantId]Replicas, 0)}, nil
}
// SepcifyClusters currently has an empty body and does nothing.
func (s *scheduler) SepcifyClusters() {
}
// MatchLabels fills s.participantIds with the candidate participants for s.task.
// It returns early when s.task.ParticipantId is already set; otherwise it
// resolves s.task.Clusters by name when given, and failing that intersects the
// participants matching every key/value pair in s.task.MatchLabels.
// NOTE(review): the ids/count declarations and the final assignment to
// s.participantIds each appear twice — this looks like the before/after lines
// of a diff shown together; confirm which lines apply.
func (s *scheduler) MatchLabels() {
// ParticipantId already specified
var ids []int64
count := 0
// ParticipantId already specified: skip label matching entirely
if s.task.ParticipantId != 0 {
return
}
var ids []int64
count := 0
// If cluster names were specified, look each one up in the database and collect its participant id
if len(s.task.Clusters) != 0 {
for i, _ := range s.task.Clusters {
clusterName := s.task.Clusters[i]
var participantId int64
// NOTE(review): the Scan result is not checked; presumably an unknown name
// leaves participantId at 0 and that 0 is still appended — confirm.
s.dbEngin.Raw("select id from sc_participant_phy_info where `name` = ?", clusterName).Scan(&participantId)
s.participantIds = append(s.participantIds, participantId)
}
return
}
// Neither was specified: fall back to label matching
for key := range s.task.MatchLabels {
var participantIds []int64
s.dbEngin.Raw("select participant_id from sc_participant_label_info where `key` = ? and value = ?", key, s.task.MatchLabels[key]).Scan(&participantIds)
// The first label seeds the candidate set; later labels narrow it by intersection.
if count == 0 {
ids = participantIds
}
//if len(participantId) == 0 || len(ids) == 0 {
// return nil, nil
//}
ids = intersect(ids, participantIds)
count++
}
s.participantIds = micsSlice(ids, 1)
s.participantIds = ids
}
func (s *scheduler) AssignAndSchedule() error {
@ -75,18 +89,19 @@ func (s *scheduler) AssignAndSchedule() error {
if s.task.ParticipantId != 0 {
return nil
}
// 标签匹配后未找到ParticipantIds
// 标签匹配以及后未找到ParticipantIds
if len(s.participantIds) == 0 {
return errors.New("未找到匹配的ParticipantIds")
}
// ParticipantIds 返回唯一值
// 指定或者标签匹配的结果只有一个集群,给任务信息指定
if len(s.participantIds) == 1 {
if !s.checkIfParticipantAvailable(ParticipantId(s.participantIds[0])) {
return errors.Errorf("集群 %d 不可用", s.participantIds[0])
}
s.task.ParticipantId = s.participantIds[0]
replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
result := make(map[ParticipantId]Replicas)
result[ParticipantId(s.participantIds[0])] = Replicas(replicas)
s.result = result
return nil
}
@ -98,9 +113,6 @@ func (s *scheduler) AssignAndSchedule() error {
//集群数量不满足,指定到标签匹配后第一个集群
if len(providerList) < 2 {
if !s.checkIfParticipantAvailable(ParticipantId(s.participantIds[0])) {
return errors.Errorf("集群 %d 不可用", s.participantIds[0])
}
s.task.ParticipantId = s.participantIds[0]
return nil
}
@ -121,18 +133,18 @@ func (s *scheduler) AssignAndSchedule() error {
}
// SaveToDb obtains a storable value from s.scheduleService.getNewStructForDb
// and inserts it with s.dbEngin.Create, returning the first error encountered.
// The loop form does this once per (participant id, replicas) entry of s.result.
// NOTE(review): a single-participant variant and a per-result loop variant are
// both present — this looks like the before/after lines of a diff shown
// together; confirm which lines apply.
func (s *scheduler) SaveToDb() error {
if s.task.ParticipantId == 0 {
return errors.New("participantId 为空")
}
structForDb, err := s.scheduleService.getNewStructForDb(s.task, s.task.ParticipantId)
if err != nil {
return err
}
tx := s.dbEngin.Create(structForDb)
if tx.Error != nil {
// TODO: persist the data that failed to save
logx.Error(tx.Error)
return tx.Error
for key, value := range s.result {
structForDb, err := s.scheduleService.getNewStructForDb(s.task, int64(key), int32(value))
if err != nil {
return err
}
tx := s.dbEngin.Create(structForDb)
if tx.Error != nil {
logx.Error(tx.Error)
return tx.Error
}
}
return nil
}
@ -144,66 +156,9 @@ func (s *scheduler) obtainParamsforStrategy() (*algo.Task, []*algo.Provider, err
return nil, nil, errors.New("获取集群失败")
}
// 过滤可用集群
err := s.filterAvailableProviders(&providerList)
if err != nil {
return nil, nil, err
}
return task, providerList, nil
}
// checkIfParticipantAvailable reports whether id is among the ids returned by
// getAvailableParticipantIds. A failed lookup is reported as "not available".
func (s *scheduler) checkIfParticipantAvailable(id ParticipantId) bool {
	available, err := s.getAvailableParticipantIds()
	return err == nil && contains(available, int64(id))
}
// getAvailableParticipantIds asks the participant RPC service for its
// participant list and returns the ParticipantId of every entry whose
// ClientState is not "UNKNOWN".
//
// It returns the RPC error unchanged, or a dedicated error when the response
// code is not 200.
func (s *scheduler) getAvailableParticipantIds() ([]int64, error) {
	resp, err := s.participantRpc.ListParticipant(context.Background(), nil)
	switch {
	case err != nil:
		return nil, err
	case resp.Code != 200:
		return nil, errors.New("集群列表查询失败")
	}

	var available []int64
	for _, p := range resp.Data {
		if p.ClientState != "UNKNOWN" {
			available = append(available, p.ParticipantId)
		}
	}
	return available, nil
}
// filterAvailableProviders replaces *providerList with the providers whose Pid
// is present both in the ids returned by getAvailableParticipantIds and in
// s.participantIds. The relative order of the kept providers is preserved.
//
// When the availability lookup fails, the error is returned and *providerList
// is left untouched.
func (s *scheduler) filterAvailableProviders(providerList *[]*algo.Provider) error {
	available, err := s.getAvailableParticipantIds()
	if err != nil {
		return err
	}

	var kept []*algo.Provider
	for _, p := range *providerList {
		if !contains(available, p.Pid) || !contains(s.participantIds, p.Pid) {
			continue
		}
		kept = append(kept, p)
	}

	*providerList = kept
	return nil
}
func (s *scheduler) assignReplicasToResult(strategy *algo.Strategy, providerList []*algo.Provider) error {
if len(strategy.Tasksolution) == 0 {
@ -223,12 +178,3 @@ func (s *scheduler) assignReplicasToResult(strategy *algo.Strategy, providerList
return nil
}
// contains reports whether e occurs in s. A nil or empty slice never contains
// anything.
func contains(s []int64, e int64) bool {
	for i := 0; i < len(s); i++ {
		if s[i] == e {
			return true
		}
	}
	return false
}