Merge remote-tracking branch 'origin/master' into master-wq

# Conflicts:
#	api/internal/logic/schedule/schedulesubmitlogic.go
#	api/internal/scheduler/database/aiStorage.go


Former-commit-id: dd876d4244717e948b9ec8cef12e5c243fa982cc
qiwang 2024-05-16 15:58:34 +08:00
commit 961ec2d790
19 changed files with 248 additions and 2036 deletions

View File

@@ -1733,6 +1733,8 @@ PayloadCreateTrainJob{
     AiTask {
         Name        string `json:"name,optional"`
         status      string `json:"status,optional"`
+        Cluster     string `json:"cluster,optional"`
+        Card        string `json:"card,optional"`
         TimeElapsed int32  `json:"elapsed,optional"`
     }
 )

View File

@@ -14,7 +14,7 @@ type (
         Description string            `json:"description,optional"`
         TenantId    int64             `json:"tenantId,optional"`
         TaskId      int64             `json:"taskId,optional"`
-        AdapterIds  []string          `json:"adapterId"`
+        AdapterIds  []string          `json:"adapterIds"`
         MatchLabels map[string]string `json:"matchLabels,optional"`
         CardCount   int64             `json:"cardCount,optional"`
         WorkDir     string            `json:"workDir,optional"` //paratera:workingDir

View File

@@ -19,7 +19,9 @@ type (
     ScheduleResult {
         ClusterId string `json:"clusterId"`
         TaskId    string `json:"taskId"`
+        Card      string `json:"card"`
         Strategy  string `json:"strategy"`
+        JobId     string `json:"jobId"`
         Replica   int32  `json:"replica"`
         Msg       string `json:"msg"`
     }
@@ -32,6 +34,7 @@ type (
         AdapterId    string   `json:"adapterId"`
         AiClusterIds []string `json:"aiClusterIds"`
         ResourceType string   `json:"resourceType"`
+        ComputeCard  string   `json:"card"`
         Tops         float64  `json:"Tops,optional"`
         TaskType     string   `json:"taskType"`
         Datasets     string   `json:"datasets"`

View File

@@ -2,6 +2,8 @@ package ai
 import (
     "context"
+    "errors"
+    "fmt"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
     "strconv"
     "sync"
@@ -46,6 +48,9 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
             if err != nil {
                 continue
             }
+            if len(taskList) == 0 {
+                continue
+            }
             for _, task := range taskList {
                 var elapsed time.Duration
                 switch task.Status {
@@ -68,6 +73,8 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
                 t := &types.AiTask{
                     Name:        task.Name,
                     Status:      task.Status,
+                    Cluster:     task.ClusterName,
+                    Card:        task.Card,
                     TimeElapsed: int32(elapsed.Seconds()),
                 }
                 resp.List = append(resp.List, t)
@@ -80,7 +87,6 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
     case <-time.After(2 * time.Second):
         return resp, nil
     }
-
 }
 
 func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {
@@ -90,15 +96,20 @@ func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<-
             if err != nil {
                 continue
             }
+            if len(taskList) == 0 {
+                continue
+            }
             for _, task := range taskList {
                 t := task
-                if t.Status == constants.Completed || t.JobId == "" {
+                if t.Status == constants.Completed || task.Status == constants.Failed {
                     continue
                 }
                 wg.Add(1)
                 go func() {
                     trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId)
                     if err != nil {
+                        msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
+                        logx.Errorf(errors.New(msg).Error())
                         wg.Done()
                         return
                     }
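
A note on the concurrency pattern in the hunk above: the `t := task` copy taken before `go func() { ... }()` is what keeps each goroutine working on its own task, since before Go 1.22 the loop variable is reused across iterations. A minimal, self-contained sketch of the same fan-out pattern follows; `fetchStatus` is a hypothetical stand-in, not a function from this repository.

package main

import (
	"fmt"
	"sync"
)

// fetchStatus is a hypothetical stand-in for a per-job collector call.
func fetchStatus(jobId string) string {
	return "Running: " + jobId
}

func main() {
	jobs := []string{"job-1", "job-2", "job-3"}
	var wg sync.WaitGroup
	for _, job := range jobs {
		j := job // copy the loop variable so each goroutine sees its own value (needed before Go 1.22)
		wg.Add(1)
		go func() {
			defer wg.Done() // defer is safer than calling wg.Done() on every return path
			fmt.Println(fetchStatus(j))
		}()
	}
	wg.Wait()
}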

View File

@@ -2,12 +2,16 @@ package core
 import (
     "context"
+    "errors"
+    "fmt"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
+    "strconv"
+    "sync"
     "time"
 
     "github.com/zeromicro/go-zero/core/logx"
@@ -53,8 +57,9 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa
     }
 
     // update AI task status
-    var ch = make(chan struct{})
-    go l.updateAitaskStatus(list, ch)
+    chs := [2]chan struct{}{make(chan struct{}), make(chan struct{})}
+    go l.updateTaskStatus(list, chs[0])
+    go l.updateAiTaskStatus(list, chs[1])
 
     for _, model := range list {
         if model.StartTime != "" && model.EndTime == "" {
@@ -72,20 +77,22 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa
     resp.PageNum = req.PageNum
     resp.Total = total
 
-    select {
-    case _ = <-ch:
-        return resp, nil
-    case <-time.After(1 * time.Second):
-        return resp, nil
+    for _, ch := range chs {
+        select {
+        case <-ch:
+        case <-time.After(2 * time.Second):
+            return
+        }
     }
+    return
 }
 
-func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) {
+func (l *PageListTaskLogic) updateTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) {
     for _, task := range tasks {
         if task.AdapterTypeDict != 1 {
             continue
         }
-        if task.Status == constants.Succeeded {
+        if task.Status == constants.Succeeded || task.Status == constants.Failed {
             continue
         }
@@ -96,9 +103,15 @@ func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan
             return
         }
+        if len(aiTask) == 0 {
+            continue
+        }
 
         start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
         end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
-        var status = constants.Succeeded
+        var status string
+        var count int
         for _, a := range aiTask {
             s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
             e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
@@ -116,20 +129,90 @@ func (l *PageListTaskLogic) updateAitaskStatus(tasks []*types.TaskModel, ch chan
                 break
             }
+            if a.Status == constants.Pending {
+                status = a.Status
+                continue
+            }
             if a.Status == constants.Running {
                 status = a.Status
                 continue
             }
+            if a.Status == constants.Completed {
+                count++
+                continue
+            }
         }
+        if count == len(aiTask) {
+            status = constants.Succeeded
+        }
+        if status != "" {
             task.Status = status
             task.StartTime = start.Format(constants.Layout)
             task.EndTime = end.Format(constants.Layout)
+        }
 
         tx = l.svcCtx.DbEngin.Table("task").Updates(task)
         if tx.Error != nil {
+            logx.Errorf(tx.Error.Error())
             return
         }
     }
     ch <- struct{}{}
 }
+
+func (l *PageListTaskLogic) updateAiTaskStatus(tasks []*types.TaskModel, ch chan<- struct{}) {
+    var wg sync.WaitGroup
+    for _, task := range tasks {
+        if task.AdapterTypeDict != 1 {
+            continue
+        }
+        if task.Status == constants.Succeeded || task.Status == constants.Failed {
+            continue
+        }
+        var aiTaskList []*models.TaskAi
+        tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTaskList)
+        if tx.Error != nil {
+            logx.Errorf(tx.Error.Error())
+            return
+        }
+        if len(aiTaskList) == 0 {
+            continue
+        }
+        for _, aitask := range aiTaskList {
+            t := aitask
+            if t.Status == constants.Completed {
+                continue
+            }
+            wg.Add(1)
+            go func() {
+                trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId)
+                if err != nil {
+                    msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
+                    logx.Errorf(errors.New(msg).Error())
+                    wg.Done()
+                    return
+                }
+                t.Status = trainingTask.Status
+                t.StartTime = trainingTask.Start
+                t.EndTime = trainingTask.End
+                err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
+                if err != nil {
+                    msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
+                    logx.Errorf(errors.New(msg).Error())
+                    wg.Done()
+                    return
+                }
+                wg.Done()
+            }()
+        }
+    }
+    wg.Wait()
+    ch <- struct{}{}
+}
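
For reference, the two-channel wait introduced in PageListTask above reduces to the following standalone pattern: each updater signals on its own channel when it finishes, and the caller waits on every channel with a bounded timeout so one slow updater cannot delay the response indefinitely. This is only a sketch under that reading of the change; the worker bodies and durations are made up.

package main

import (
	"fmt"
	"time"
)

func main() {
	// buffered channels so a late updater can still send after the caller has timed out
	chs := [2]chan struct{}{make(chan struct{}, 1), make(chan struct{}, 1)}

	// two independent updaters, each signaling on its own channel when done
	go func() { time.Sleep(100 * time.Millisecond); chs[0] <- struct{}{} }()
	go func() { time.Sleep(3 * time.Second); chs[1] <- struct{}{} }()

	for i, ch := range chs {
		select {
		case <-ch:
			fmt.Printf("updater %d finished\n", i)
		case <-time.After(2 * time.Second):
			fmt.Printf("updater %d timed out, responding anyway\n", i)
		}
	}
}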

View File

@@ -2,6 +2,7 @@ package hpc
 import (
     "context"
+    "errors"
     clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
@@ -63,7 +64,9 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
     l.svcCtx.DbEngin.Raw("SELECT nickname FROM `t_cluster` where id = ?", clusterId).Scan(&clusterName)
     l.svcCtx.DbEngin.Raw("SELECT adapter_id FROM `t_cluster` where id = ?", clusterId).Scan(&adapterId)
     l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", adapterId).Scan(&adapterName)
+    if len(adapterName) == 0 || adapterName == "" {
+        return nil, errors.New("no corresponding adapter found")
+    }
     env, _ := json.Marshal(req.Environment)
 
     hpcInfo := models.TaskHpc{

View File

@@ -7,6 +7,8 @@ import (
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
     "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
+    "strconv"
+    "strings"
 
     "github.com/zeromicro/go-zero/core/logx"
 )
@@ -32,6 +34,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
         TaskName:     req.AiOption.TaskName,
         ResourceType: req.AiOption.ResourceType,
         Replica:      req.AiOption.Replica,
+        ComputeCard:  req.AiOption.ComputeCard,
         Tops:         req.AiOption.Tops,
         TaskType:     req.AiOption.TaskType,
         DatasetsName: req.AiOption.Datasets,
@@ -69,14 +72,22 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
     for _, r := range rs {
         scheResult := &types.ScheduleResult{}
         scheResult.ClusterId = r.ClusterId
-        scheResult.TaskId = r.TaskId
+        scheResult.TaskId = strconv.FormatInt(id, 10)
+        scheResult.JobId = r.JobId
         scheResult.Strategy = r.Strategy
+        scheResult.Card = strings.ToUpper(r.Card)
         scheResult.Replica = r.Replica
         scheResult.Msg = r.Msg
-        err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Saved, r.Msg)
+        opt.ComputeCard = strings.ToUpper(r.Card)
+        clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
+        err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
         if err != nil {
             return nil, err
         }
         resp.Results = append(resp.Results, scheResult)
     }

View File

@@ -35,6 +35,16 @@ func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, e
     return &resp, nil
 }
 
+func (s *AiStorage) GetClusterNameById(id string) (string, error) {
+    var name string
+    tx := s.DbEngin.Raw("select `description` from t_cluster where `id` = ?", id).Scan(&name)
+    if tx.Error != nil {
+        logx.Errorf(tx.Error.Error())
+        return "", tx.Error
+    }
+    return name, nil
+}
+
 func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) {
     var list []types.AdapterInfo
     var ids []string
@@ -63,10 +73,11 @@ func (s *AiStorage) GetAdaptersByType(adapterType string) ([]*types.AdapterInfo,
 func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, error) {
     var resp []*models.TaskAi
-    tx := s.DbEngin.Raw("select * from task_ai where `adapter_id` = ? ", adapterId).Scan(&resp)
-    if tx.Error != nil {
-        logx.Errorf(tx.Error.Error())
-        return nil, tx.Error
+    db := s.DbEngin.Model(&models.TaskAi{}).Table("task_ai")
+    db = db.Where("adapter_id = ?", adapterId)
+    err := db.Order("commit_time desc").Find(&resp).Error
+    if err != nil {
+        return nil, err
     }
     return resp, nil
 }
@@ -90,7 +101,7 @@ func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int6
     return taskModel.Id, nil
 }
 
-func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, jobId string, status string, msg string) error {
+func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, clusterName string, jobId string, status string, msg string) error {
     // build the main task struct
     aId, err := strconv.ParseInt(option.AdapterId, 10, 64)
     if err != nil {
@@ -100,10 +111,14 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId
     if err != nil {
         return err
     }
+    del, _ := time.Parse(constants.Layout, constants.Layout)
     aiTaskModel := models.TaskAi{
         TaskId:      taskId,
         AdapterId:   aId,
         ClusterId:   cId,
+        ClusterName: clusterName,
         Name:        option.TaskName,
         Replica:     int64(option.Replica),
         JobId:       jobId,
@@ -111,6 +126,8 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId
         Strategy:    option.StrategyName,
         Status:      status,
         Msg:         msg,
+        Card:        option.ComputeCard,
+        DeletedAt:   del,
         CommitTime:  time.Now(),
     }
     // save the task data to the database

View File

@@ -19,6 +19,7 @@ import (
     "encoding/json"
     "errors"
     "fmt"
+    "github.com/zeromicro/go-zero/core/logx"
     "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
     "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
@@ -42,10 +43,11 @@ type AiScheduler struct {
 }
 
 type AiResult struct {
-    TaskId    string
+    JobId     string
     ClusterId string
     Strategy  string
     Replica   int32
+    Card      string
     Msg       string
 }
@@ -156,6 +158,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
             result.Replica = c.Replicas
             result.ClusterId = c.ClusterId
             result.Strategy = as.option.StrategyName
+            result.Card = opt.ComputeCard
 
             ch <- result
             wg.Done()
@@ -192,28 +195,35 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
             })
             msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
             errmsg += msg
-            err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, "", constants.Failed, msg)
+            clusterName, _ := as.AiStorages.GetClusterNameById(e.clusterId)
+            err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, clusterName, "", constants.Failed, msg)
             if err != nil {
                 return nil, errors.New("database add failed: " + err.Error())
             }
         }
         for _, s := range results {
+            as.option.ComputeCard = s.Card //execute card
+            clusterName, _ := as.AiStorages.GetClusterNameById(s.ClusterId)
             if s.Msg != "" {
                 msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
                 errmsg += msg
-                err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, "", constants.Failed, msg)
+                err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, "", constants.Failed, msg)
                 if err != nil {
                     return nil, errors.New("database add failed: " + err.Error())
                 }
             } else {
-                msg := fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId)
+                msg := fmt.Sprintf("clusterId: %v , submitted successfully, jobId: %v \n", s.ClusterId, s.JobId)
                 errmsg += msg
-                err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, s.TaskId, constants.Succeeded, msg)
+                err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, s.JobId, constants.Saved, msg)
                 if err != nil {
                     return nil, errors.New("database add failed: " + err.Error())
                 }
             }
         }
+        logx.Errorf(errors.New(errmsg).Error())
         return nil, errors.New(errmsg)
     }
@@ -288,7 +298,7 @@ func convertType(in interface{}) (*AiResult, error) {
     case *hpcAC.SubmitTaskAiResp:
         resp := (in).(*hpcAC.SubmitTaskAiResp)
         if resp.Code == "0" {
-            result.TaskId = resp.Data
+            result.JobId = resp.Data
         } else {
             result.Msg = resp.Msg
         }
@@ -297,7 +307,7 @@ func convertType(in interface{}) (*AiResult, error) {
         resp := (in).(*octopus.CreateTrainJobResp)
         if resp.Success {
-            result.TaskId = resp.Payload.JobId
+            result.JobId = resp.Payload.JobId
         } else {
             result.Msg = resp.Error.Message
         }
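
The convertType hunks above only rename the field that carries the provider-side job identifier (TaskId becomes JobId). For readers unfamiliar with the pattern, here is a generic sketch of the same type-switch conversion; providerAResp and providerBResp are invented stand-ins, not the real hpcAC or octopus response structs.

package main

import (
	"errors"
	"fmt"
)

// Hypothetical provider responses, standing in for the real client structs.
type providerAResp struct {
	Code string
	Data string
	Msg  string
}

type providerBResp struct {
	Success bool
	JobId   string
	ErrMsg  string
}

// aiResult mirrors the idea of a provider-neutral result keyed by JobId.
type aiResult struct {
	JobId string
	Msg   string
}

func convert(in interface{}) (*aiResult, error) {
	result := &aiResult{}
	switch resp := in.(type) {
	case *providerAResp:
		if resp.Code == "0" {
			result.JobId = resp.Data
		} else {
			result.Msg = resp.Msg
		}
	case *providerBResp:
		if resp.Success {
			result.JobId = resp.JobId
		} else {
			result.Msg = resp.ErrMsg
		}
	default:
		return nil, errors.New("unsupported response type")
	}
	return result, nil
}

func main() {
	r, _ := convert(&providerAResp{Code: "0", Data: "job-42"})
	fmt.Println(r.JobId) // job-42
}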

View File

@@ -19,6 +19,7 @@ type Strategy interface {
 
 type AssignedCluster struct {
     ClusterId   string
+    ClusterName string
     Replicas    int32
 }

View File

@@ -402,7 +402,7 @@ func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType st
     aLatest := &octopus.Algorithms{}
     for i, _ := range algorithms {
-        if time.Unix(aLatest.CreatedAt, 0).After(time.Unix(algorithms[i].CreatedAt, 0)) {
+        if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
             aLatest = algorithms[i]
         }
     }
@@ -493,7 +493,11 @@ func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*coll
     }
     jobresp, ok := (resp).(*octopus.GetTrainJobResp)
     if !jobresp.Success || !ok {
-        return nil, errors.New("get training task failed")
+        if jobresp.Error != nil {
+            return nil, errors.New(jobresp.Error.Message)
+        } else {
+            return nil, errors.New("get training task failed, empty error returned")
+        }
     }
     var task collector.Task
     task.Id = jobresp.Payload.TrainJob.Id
@@ -508,6 +512,8 @@ func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*coll
         task.Status = constants.Running
     case "stopped":
         task.Status = constants.Stopped
+    case "pending":
+        task.Status = constants.Pending
     default:
         task.Status = "undefined"
     }
@@ -662,10 +668,23 @@ func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOpti
     if option.ResourceType == CARD {
         for _, image := range preImgResp.Payload.Images {
-            if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
-                option.ImageId = image.Id
-                return nil
+            if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) {
+                switch strings.ToUpper(option.ComputeCard) {
+                case GCU:
+                    if strings.HasPrefix(image.ImageVersion, "t20_") {
+                        option.ImageId = image.Id
+                        return nil
+                    }
+                case BIV100:
+                    if strings.HasPrefix(image.ImageVersion, "bi_") {
+                        option.ImageId = image.Id
+                        return nil
+                    }
+                case MLU:
+                    option.ImageId = image.Id
+                    return nil
+                }
             }
         }
     }
@@ -750,7 +769,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec
             if spec.Price == 1 {
                 ns := strings.Split(spec.Name, COMMA)
                 cardSpecs := strings.Split(ns[0], STAR)
-                if cardSpecs[1] == cardCnMap[computeCard] {
+                if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
                     option.ResourceId = spec.Id
                     option.ComputeCard = computeCard
                     return nil
@@ -766,7 +785,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec
             if spec.Price == 1 {
                 ns := strings.Split(spec.Name, COMMA)
                 cardSpecs := strings.Split(ns[0], STAR)
-                if cardSpecs[1] == cardCnMap[computeCard] {
+                if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
                     option.ResourceId = spec.Id
                     option.ComputeCard = computeCard
                     return nil
@@ -780,7 +799,7 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec
                 continue
             }
             cardSpecs := strings.Split(ns[0], STAR)
-            if cardSpecs[1] != cardCnMap[computeCard] {
+            if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] {
                 continue
             }
             s, err := strconv.ParseFloat(cardSpecs[0], 64)
@@ -789,36 +808,32 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec
             }
             switch computeCard {
             case GCU:
+                option.ComputeCard = computeCard
                 if cardNum == s { // 1, 4, 8
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
                 if 1 < cardNum && cardNum <= 4 && s == 4 {
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
                 if 4 < cardNum && s == 8 {
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
             case MLU: // 1, 2, 4
+                option.ComputeCard = computeCard
                 if cardNum/2 == s {
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
                 if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
                 if 2 < cardNum/2 && s == 4 {
                     option.ResourceId = spec.Id
-                    option.ComputeCard = computeCard
                     return nil
                 }
             }
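
The repeated strings.ToUpper calls above share one purpose: card names arrive from callers in mixed case, while the alias and spec maps are keyed by upper-case constants, so every lookup normalizes first. A tiny illustrative sketch follows; the map contents are invented and are not the project's actual cardAliasMap.

package main

import (
	"fmt"
	"strings"
)

// Illustrative alias map keyed by upper-case card names.
var cardAliasMap = map[string]string{
	"GCU": "enflame",
	"MLU": "cambricon",
}

// aliasFor normalizes the incoming card name before the map lookup,
// so "gcu", "Gcu" and "GCU" all resolve to the same alias.
func aliasFor(card string) (string, bool) {
	alias, ok := cardAliasMap[strings.ToUpper(card)]
	return alias, ok
}

func main() {
	for _, c := range []string{"gcu", "MLU", "npu"} {
		if alias, ok := aliasFor(c); ok {
			fmt.Printf("%s -> %s\n", c, alias)
		} else {
			fmt.Printf("%s -> no alias\n", c)
		}
	}
}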

View File

@@ -565,10 +565,14 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
     if option.ResourceType == CPU {
         option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
+        option.ComputeCard = CPU
         return nil
     }
     if option.ResourceType == CARD {
+        option.ComputeCard = DCU
         if 0 <= option.Tops && option.Tops <= DCU_TOPS {
             option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
             return nil

View File

@@ -82,8 +82,8 @@ var (
         "3": SHUGUANGAI,
         "4": SHUGUANGHPC,
     }
-    resourceTypes = []string{CPU, CARD}
-    taskTypes     = []string{PYTORCH_TASK, TENSORFLOW_TASK}
+    resourceTypes = []string{CARD}
+    taskTypes     = []string{PYTORCH_TASK}
 
     ERROR_RESP_EMPTY    = errors.New("resp empty error")
     ERROR_CONVERT_EMPTY = errors.New("convert empty error")

View File

@@ -91,7 +91,7 @@ func NewServiceContext(c config.Config) *ServiceContext {
         NamingStrategy: schema.NamingStrategy{
             SingularTable: true, // use singular table names; with this enabled, the table for `User` is `t_user`
         },
-        Logger: logger.Default.LogMode(logger.Info),
+        Logger: logger.Default.LogMode(logger.Error),
     })
     if err != nil {
         logx.Errorf("database connection failed, err%v", err)

View File

@@ -1164,7 +1164,7 @@ type CommitHpcTaskReq struct {
     Description string            `json:"description,optional"`
     TenantId    int64             `json:"tenantId,optional"`
     TaskId      int64             `json:"taskId,optional"`
-    AdapterIds  []string          `json:"adapterId"`
+    AdapterIds  []string          `json:"adapterIds"`
     MatchLabels map[string]string `json:"matchLabels,optional"`
     CardCount   int64             `json:"cardCount,optional"`
     WorkDir     string            `json:"workDir,optional"` //paratera:workingDir
@@ -2841,6 +2841,8 @@ type CenterTaskListResp struct {
 type AiTask struct {
     Name        string `json:"name,optional"`
     Status      string `json:"status,optional"`
+    Cluster     string `json:"cluster,optional"`
+    Card        string `json:"card,optional"`
     TimeElapsed int32  `json:"elapsed,optional"`
 }
@@ -5620,7 +5622,9 @@ type ScheduleResp struct {
 type ScheduleResult struct {
     ClusterId string `json:"clusterId"`
     TaskId    string `json:"taskId"`
+    Card      string `json:"card"`
     Strategy  string `json:"strategy"`
+    JobId     string `json:"jobId"`
     Replica   int32  `json:"replica"`
     Msg       string `json:"msg"`
 }
@@ -5633,6 +5637,7 @@ type AiOption struct {
     AdapterId    string   `json:"adapterId"`
     AiClusterIds []string `json:"aiClusterIds"`
     ResourceType string   `json:"resourceType"`
+    ComputeCard  string   `json:"card"`
     Tops         float64  `json:"Tops,optional"`
     TaskType     string   `json:"taskType"`
     Datasets     string   `json:"datasets"`

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

BIN deploy/yaml.tar.gz (new file; binary content not shown)

View File

@@ -38,8 +38,10 @@ type (
     TaskAi struct {
         Id          int64     `db:"id"`           // id
         TaskId      int64     `db:"task_id"`      // task id
         AdapterId   int64     `db:"adapter_id"`   // adapter id
+        AdapterName string    `db:"adapter_name"` // adapter name
         ClusterId   int64     `db:"cluster_id"`   // cluster id
+        ClusterName string    `db:"cluster_name"` // cluster name
         Name        string    `db:"name"`         // task name
         Replica     int64     `db:"replica"`      // number of replicas
         JobId       string    `db:"job_id"`       // job id returned by the cluster
@@ -50,6 +52,8 @@ type (
         StartTime   string    `db:"start_time"`   // start time
         EndTime     string    `db:"end_time"`     // end time
         TaskType    string    `db:"task_type"`
+        DeletedAt   time.Time `db:"deleted_at"`
+        Card        string    `db:"card"`
     }
 )
@@ -88,14 +92,14 @@ func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, er
 }
 
 func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) {
-    query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet)
-    ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType)
+    query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet)
+    ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card)
     return ret, err
 }
 
 func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error {
     query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder)
-    _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.Id)
+    _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Id)
     return err
 }