pcm-coordinator/internal/cron/aiCronTask.go

251 lines
8.0 KiB
Go

package cron
import (
"errors"
"fmt"
"github.com/zeromicro/go-zero/core/logx"
"github.com/zeromicro/go-zero/zrpc"
hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
"gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
"net/http"
"strconv"
"sync"
)
const (
OCTOPUS = "octopus"
MODELARTS = "modelarts"
SHUGUANGAI = "shuguangAi"
)
func GetTaskList(svc *svc.ServiceContext) ([]*types.TaskModel, error) {
limit := 10
offset := 0
var list []*types.TaskModel
db := svc.DbEngin.Model(&types.TaskModel{}).Table("task")
db = db.Where("deleted_at is null")
//count total
var total int64
err := db.Count(&total).Error
db.Limit(limit).Offset(offset)
if err != nil {
return nil, err
}
err = db.Order("created_time desc").Find(&list).Error
if err != nil {
return nil, err
}
return list, nil
}
func UpdateAiAdapterMaps(svc *svc.ServiceContext) {
var aiType = "1"
adapterIds, err := svc.Scheduler.AiStorages.GetAdapterIdsByType(aiType)
if err != nil {
msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
logx.Errorf(errors.New(msg).Error())
return
}
if len(adapterIds) == 0 {
return
}
for _, id := range adapterIds {
clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(id)
if err != nil {
msg := fmt.Sprintf("###UpdateAiAdapterMaps###, error: %v \n", err.Error())
logx.Errorf(errors.New(msg).Error())
return
}
if len(clusters.List) == 0 {
continue
}
if isAdapterExist(svc, id, len(clusters.List)) {
continue
} else {
if isAdapterEmpty(svc, id) {
exeClusterMap, colClusterMap := InitAiClusterMap(&svc.Config, clusters.List)
svc.Scheduler.AiService.AiExecutorAdapterMap[id] = exeClusterMap
svc.Scheduler.AiService.AiCollectorAdapterMap[id] = colClusterMap
} else {
UpdateClusterMaps(svc, id, clusters.List)
}
}
}
}
func UpdateClusterMaps(svc *svc.ServiceContext, adapterId string, clusters []types.ClusterInfo) {
for _, c := range clusters {
_, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id]
_, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[adapterId][c.Id]
if !ok && !ok2 {
switch c.Name {
case OCTOPUS:
id, _ := strconv.ParseInt(c.Id, 10, 64)
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(svc.Config.OctopusRpcConf))
octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = octopus
case MODELARTS:
id, _ := strconv.ParseInt(c.Id, 10, 64)
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(svc.Config.ModelArtsRpcConf))
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(svc.Config.ModelArtsImgRpcConf))
modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = modelarts
case SHUGUANGAI:
id, _ := strconv.ParseInt(c.Id, 10, 64)
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(svc.Config.ACRpcConf))
sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
svc.Scheduler.AiService.AiExecutorAdapterMap[adapterId][c.Id] = sgai
}
} else {
continue
}
}
}
func isAdapterExist(svc *svc.ServiceContext, id string, clusterNum int) bool {
emap, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
cmap, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
if ok && ok2 {
if len(emap) == clusterNum && len(cmap) == clusterNum {
return true
}
}
return false
}
func isAdapterEmpty(svc *svc.ServiceContext, id string) bool {
_, ok := svc.Scheduler.AiService.AiExecutorAdapterMap[id]
_, ok2 := svc.Scheduler.AiService.AiCollectorAdapterMap[id]
if !ok && !ok2 {
return true
}
return false
}
func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
executorMap := make(map[string]executor.AiExecutor)
collectorMap := make(map[string]collector.AiCollector)
for _, c := range clusters {
switch c.Name {
case OCTOPUS:
id, _ := strconv.ParseInt(c.Id, 10, 64)
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
collectorMap[c.Id] = octopus
executorMap[c.Id] = octopus
case MODELARTS:
id, _ := strconv.ParseInt(c.Id, 10, 64)
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
collectorMap[c.Id] = modelarts
executorMap[c.Id] = modelarts
case SHUGUANGAI:
id, _ := strconv.ParseInt(c.Id, 10, 64)
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
collectorMap[c.Id] = sgai
executorMap[c.Id] = sgai
}
}
return executorMap, collectorMap
}
func UpdateClusterResource(svc *svc.ServiceContext) {
list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return
}
var wg sync.WaitGroup
for _, adapter := range list {
clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, cluster := range clusters.List {
c := cluster
clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
if err != nil {
continue
}
wg.Add(1)
go func() {
_, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
if !ok {
wg.Done()
return
}
h := http.Request{}
stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
if err != nil {
wg.Done()
return
}
if stat == nil {
wg.Done()
return
}
clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
if err != nil {
wg.Done()
return
}
var cardTotal int64
var topsTotal float64
for _, card := range stat.CardsAvail {
cardTotal += int64(card.CardNum)
topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
}
if (models.TClusterResource{} == *clusterResource) {
err = svc.Scheduler.AiStorages.SaveClusterResources(adapter.Id, c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
if err != nil {
wg.Done()
return
}
} else {
if stat.CpuCoreTotal == 0 || stat.MemTotal == 0 || stat.DiskTotal == 0 {
wg.Done()
return
}
clusterResource.CardTotal = cardTotal
clusterResource.CardTopsTotal = topsTotal
clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
clusterResource.MemAvail = stat.MemAvail
clusterResource.MemTotal = stat.MemTotal
clusterResource.DiskAvail = stat.DiskAvail
clusterResource.DiskTotal = stat.DiskTotal
err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
if err != nil {
wg.Done()
return
}
}
wg.Done()
}()
}
}
wg.Wait()
}