251 lines
8.0 KiB
Go
251 lines
8.0 KiB
Go
package cron
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"github.com/zeromicro/go-zero/core/logx"
|
|
"github.com/zeromicro/go-zero/zrpc"
|
|
hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
|
"gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
|
|
"gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
|
|
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
)
|
|
|
|
// Cluster names recognised when building the AI executor/collector maps.
// A cluster whose Name matches none of these is not registered.
const (
	OCTOPUS    = "octopus"
	MODELARTS  = "modelarts"
	SHUGUANGAI = "shuguangAi"
)
|
|
|
|
func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
|
|
limit := 10
|
|
offset := 0
|
|
var list []*types.TaskModel
|
|
db := svc.DbEngin.Model(&types.TaskModel{}).Table("task")
|
|
|
|
db = db.Where("deleted_at is null")
|
|
|
|
//count total
|
|
var total int64
|
|
err := db.Count(&total).Error
|
|
db.Limit(limit).Offset(offset)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
err = db.Order("created_time desc").Find(&list).Error
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return list, nil
|
|
}
|
|
|
|
func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
|
|
var aiType = "1"
|
|
adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
return
|
|
}
|
|
if len(adapterIds) == 0 {
|
|
return
|
|
}
|
|
|
|
for _, id := range adapterIds {
|
|
clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
return
|
|
}
|
|
if len(clusters.List) == 0 {
|
|
continue
|
|
}
|
|
if isAdapterExist(svc, id, len(clusters.List)) {
|
|
continue
|
|
} else {
|
|
if isAdapterEmpty(svc, id) {
|
|
exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
|
|
svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
|
|
} else {
|
|
UpdateClusterMaps(svc, id, clusters.List)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
|
|
for _, c := range clusters {
|
|
_, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id]
|
|
_, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterId][c.Id]
|
|
if !ok && !ok2 {
|
|
switch c.Name {
|
|
case OCTOPUS:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(svc.Config.OctopusRpcConf))
|
|
octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
|
|
case MODELARTS:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(svc.Config.ModelArtsRpcConf))
|
|
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(svc.Config.ModelArtsImgRpcConf))
|
|
modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
|
|
case SHUGUANGAI:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(svc.Config.ACRpcConf))
|
|
sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
|
|
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
|
|
}
|
|
} else {
|
|
continue
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
|
|
emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
|
|
cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
|
|
if ok && ok2 {
|
|
if len(emap) == clusterNum && len(cmap) == clusterNum {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
|
|
_, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
|
|
_, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
|
|
if !ok && !ok2 {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
|
|
executorMap := make(map[string]executor.AiExecutor)
|
|
collectorMap := make(map[string]collector.AiCollector)
|
|
for _, c := range clusters {
|
|
switch c.Name {
|
|
case OCTOPUS:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
|
|
octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
|
|
collectorMap[c.Id] = octopus
|
|
executorMap[c.Id] = octopus
|
|
case MODELARTS:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
|
|
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
|
|
modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
|
|
collectorMap[c.Id] = modelarts
|
|
executorMap[c.Id] = modelarts
|
|
case SHUGUANGAI:
|
|
id, _ := strconv.ParseInt(c.Id, 10, 64)
|
|
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
|
|
sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
|
|
collectorMap[c.Id] = sgai
|
|
executorMap[c.Id] = sgai
|
|
}
|
|
}
|
|
|
|
return executorMap, collectorMap
|
|
}
|
|
|
|
func UpdateClusterResource(svc *svc.ServiceContext) {
|
|
list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
|
|
if err != nil {
|
|
return
|
|
}
|
|
var wg sync.WaitGroup
|
|
for _, adapter := range list {
|
|
clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, cluster := range clusters.List {
|
|
c := cluster
|
|
clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
wg.Add(1)
|
|
go func() {
|
|
_, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
|
|
if !ok {
|
|
wg.Done()
|
|
return
|
|
}
|
|
h := http.Request{}
|
|
stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
|
|
if err != nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
if stat == nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
|
|
if err != nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
var cardTotal int64
|
|
var topsTotal float64
|
|
for _, card := range stat.CardsAvail {
|
|
cardTotal += int64(card.CardNum)
|
|
topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
|
|
}
|
|
|
|
if (models.TClusterResource{} == *clusterResource) {
|
|
err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
|
|
stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
|
|
if err != nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
} else {
|
|
if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
|
|
wg.Done()
|
|
return
|
|
}
|
|
clusterResource.CardTotal = cardTotal
|
|
clusterResource.CardTopsTotal = topsTotal
|
|
clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
|
|
clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
|
|
clusterResource.MemAvail = stat.MemAvail
|
|
clusterResource.MemTotal = stat.MemTotal
|
|
clusterResource.DiskAvail = stat.DiskAvail
|
|
clusterResource.DiskTotal = stat.DiskTotal
|
|
|
|
err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
|
|
if err != nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
}
|
|
wg.Done()
|
|
}()
|
|
}
|
|
}
|
|
wg.Wait()
|
|
}
|