Merge pull request 'fix db bugs' (#172) from tzwang/pcm-coordinator:master into master

Former-commit-id: 91e8e0532cbd60ec7a5ea4b7419515a49fe0aaf4
This commit is contained in:
tzwang 2024-05-14 15:07:48 +08:00
commit fcd12bbe61
8 changed files with 78 additions and 42 deletions

View File

@ -68,6 +68,8 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
t := &types.AiTask{
Name: task.Name,
Status: task.Status,
Cluster: task.ClusterName,
Card: task.Card,
TimeElapsed: int32(elapsed.Seconds()),
}
resp.List = append(resp.List, t)

View File

@ -73,7 +73,12 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
scheResult.Strategy = r.Strategy
scheResult.Replica = r.Replica
scheResult.Msg = r.Msg
err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Saved, r.Msg)
opt.ComputeCard = r.Card
clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, clusterName, r.TaskId, constants.Saved, r.Msg)
if err != nil {
return nil, err
}

View File

@ -35,6 +35,16 @@ func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, e
return &resp, nil
}
// GetClusterNameById looks up the name recorded for the cluster with the given id.
//
// The value is read from the `description` column of t_cluster using a raw,
// parameterized query and scanned into a string. On a query error the error is
// logged and returned together with an empty string; otherwise the scanned
// value is returned with a nil error.
//
// NOTE(review): when no row matches, the scan presumably leaves the result as
// "" with a nil error — confirm against the DbEngin implementation.
func (s *AiStorage) GetClusterNameById(id string) (string, error) {
	var name string
	tx := s.DbEngin.Raw("select `description` from t_cluster where `id` = ?", id).Scan(&name)
	if tx.Error != nil {
		// Pass the error text as an argument, never as the format string:
		// a '%' inside a driver message would otherwise be parsed as a verb.
		logx.Errorf("%s", tx.Error.Error())
		return "", tx.Error
	}
	return name, nil
}
func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) {
var list []types.AdapterInfo
var ids []string
@ -90,7 +100,7 @@ func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int6
return taskModel.Id, nil
}
func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, jobId string, status string, msg string) error {
func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, clusterName string, jobId string, status string, msg string) error {
// 构建主任务结构体
aId, err := strconv.ParseInt(option.AdapterId, 10, 64)
if err != nil {
@ -100,10 +110,14 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId
if err != nil {
return err
}
del, _ := time.Parse(constants.Layout, constants.Layout)
aiTaskModel := models.TaskAi{
TaskId: taskId,
AdapterId: aId,
ClusterId: cId,
ClusterName: clusterName,
Name: option.TaskName,
Replica: option.Replica,
JobId: jobId,
@ -111,6 +125,8 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId
Strategy: option.StrategyName,
Status: status,
Msg: msg,
Card: option.ComputeCard,
DeletedAt: del,
CommitTime: time.Now(),
}
// 保存任务数据到数据库

View File

@ -46,6 +46,7 @@ type AiResult struct {
ClusterId string
Strategy string
Replica int32
Card string
Msg string
}
@ -156,6 +157,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
result.Replica = c.Replicas
result.ClusterId = c.ClusterId
result.Strategy = as.option.StrategyName
result.Card = opt.ComputeCard
ch <- result
wg.Done()
@ -192,23 +194,29 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
})
msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, "", constants.Failed, msg)
clusterName, _ := as.AiStorages.GetClusterNameById(e.clusterId)
err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, clusterName, "", constants.Failed, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
}
for _, s := range results {
as.option.ComputeCard = s.Card //execute card
clusterName, _ := as.AiStorages.GetClusterNameById(s.ClusterId)
if s.Msg != "" {
msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, "", constants.Failed, msg)
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, "", constants.Failed, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
} else {
msg := fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId)
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, s.TaskId, constants.Saved, msg)
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, clusterName, s.TaskId, constants.Saved, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}

View File

@ -19,6 +19,7 @@ type Strategy interface {
// AssignedCluster is one per-cluster entry of an assignment; it carries the
// cluster's id and name together with a replica count.
type AssignedCluster struct {
// ClusterId is the cluster's identifier, kept as a string.
ClusterId string
// ClusterName is the cluster's name.
// NOTE(review): presumably a human-readable label; confirm who populates it.
ClusterName string
// Replicas is the replica count for this cluster.
// NOTE(review): presumably the number of replicas assigned to it; confirm.
Replicas int32
}

View File

@ -402,7 +402,7 @@ func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType st
aLatest := &octopus.Algorithms{}
for i, _ := range algorithms {
if time.Unix(aLatest.CreatedAt, 0).After(time.Unix(algorithms[i].CreatedAt, 0)) {
if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
aLatest = algorithms[i]
}
}
@ -789,36 +789,32 @@ func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpec
}
switch computeCard {
case GCU:
option.ComputeCard = computeCard
if cardNum == s { // 1, 4, 8
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
if 1 < cardNum && cardNum <= 4 && s == 4 {
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
if 4 < cardNum && s == 8 {
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
case MLU: // 1, 2, 4
option.ComputeCard = computeCard
if cardNum/2 == s {
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
if 2 < cardNum/2 && s == 4 {
option.ResourceId = spec.Id
option.ComputeCard = computeCard
return nil
}
}

View File

@ -565,10 +565,14 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
if option.ResourceType == CPU {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
option.ComputeCard = CPU
return nil
}
if option.ResourceType == CARD {
option.ComputeCard = DCU
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
return nil

View File

@ -38,8 +38,10 @@ type (
TaskAi struct {
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
AdapterId int64 `db:"adapter_id"` // 设配器id
AdapterId int64 `db:"adapter_id"` // 适配器id
AdapterName string `db:"adapter_name"` // 适配器名称
ClusterId int64 `db:"cluster_id"` // 集群id
ClusterName string `db:"cluster_name"` // 集群名称
Name string `db:"name"` // 任务名
Replica int64 `db:"replica"` // 执行数
JobId string `db:"job_id"` // 集群返回任务id
@ -50,6 +52,8 @@ type (
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
TaskType string `db:"task_type"`
DeletedAt time.Time `db:"deleted_at"`
Card string `db:"card"`
}
)
@ -88,14 +92,14 @@ func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, er
}
func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType)
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card)
return ret, err
}
func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error {
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.Id)
_, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Id)
return err
}