Merge pull request 'core task status sync' (#150) from zhouqunjie/pcm-coordinator:master into master

Former-commit-id: 46158af2b239eada0fae6449043698d6c1505e00
This commit is contained in:
zhouqunjie 2024-05-10 10:56:32 +08:00
commit 004da905ae
3 changed files with 44 additions and 190 deletions

View File

@ -5,6 +5,20 @@ import (
"time" "time"
) )
// HpcStatusMapping groups raw HPC sub-task status strings under the
// coordinator-level status they correspond to. Keys are the coordinator
// statuses ("Running", "Succeeded", "Failed"); values are the raw statuses
// that belong to that key. Each raw status is listed exactly once.
//
// Go does not define map iteration order, so code that needs one key to take
// precedence over another (for example "Failed" before "Running") must look
// the keys up explicitly instead of relying on range order.
//
// NOTE(review): the "Running" list previously contained "RUNNING" twice. The
// duplicate is dropped here; if the second entry was meant to be a different
// state (e.g. "PENDING"), add it back under the right name — confirm.
var HpcStatusMapping = map[string][]string{
	"Running":   {"RUNNING", "CONFIGURING", "COMPLETING"},
	"Succeeded": {"COMPLETED"},
	"Failed":    {"FAILED", "TIMEOUT", "DEADLINE", "OUT_OF_MEMORY", "BOOT_FAIL", "CANCELLED"},
}
// AiStatusMapping translates a raw AI sub-task status string (key) into the
// coordinator-level status (value). Note the direction is raw -> coordinator,
// the opposite of HpcStatusMapping.
var AiStatusMapping = map[string]string{"PENDING": "Running"}
// CloudStatusMapping translates a raw cloud sub-task status string (key) into
// the coordinator-level status (value). Note the direction is raw ->
// coordinator, the opposite of HpcStatusMapping.
var CloudStatusMapping = map[string]string{"PENDING": "Running"}
type PullTaskInfoReq struct { type PullTaskInfoReq struct {
AdapterId int64 `form:"adapterId"` AdapterId int64 `form:"adapterId"`
} }

View File

@ -73,20 +73,34 @@ func syncTask(gorm *gorm.DB, taskId int64) {
if tx.Error != nil { if tx.Error != nil {
logx.Error(tx.Error) logx.Error(tx.Error)
} }
// 子状态统一则修改主任务状态
statusArray := strings.Split(allStatus, ",") for pcmStatus, hpcStatus := range clientCore.HpcStatusMapping {
if len(removeRepeatedElement(statusArray)) == 1 { for _, status := range hpcStatus {
updateTask(gorm, taskId, statusArray[0]) // if Failed type status appears in subTask then update mainTask to Failed
} if pcmStatus == "Failed" && strings.Contains(allStatus, status) {
// 子任务包含失败状态 主任务则失败
if strings.Contains(allStatus, constants.Failed) {
updateTask(gorm, taskId, constants.Failed) updateTask(gorm, taskId, constants.Failed)
return
// no Failed type status in subTask,if Saved type status appears in subTask then update mainTask to Saved
} else if pcmStatus == "Saved" {
if strings.Contains(allStatus, status) {
updateTask(gorm, taskId, constants.Saved)
return
} }
if strings.Contains(allStatus, constants.Running) { // no Failed and Saved type status in subTask,if Running type status appears in subTask then update mainTask to Running
} else if pcmStatus == "Running" {
if strings.Contains(allStatus, status) {
updateTask(gorm, taskId, constants.Running) updateTask(gorm, taskId, constants.Running)
return
}
// at last, mainTask should be succeeded
} else {
if strings.Contains(allStatus, status) {
updateTask(gorm, taskId, constants.Succeeded)
return
}
}
}
} }
} }
func updateTask(gorm *gorm.DB, taskId int64, status string) { func updateTask(gorm *gorm.DB, taskId int64, status string) {
@ -98,8 +112,12 @@ func updateTask(gorm *gorm.DB, taskId int64, status string) {
if status == constants.Running { if status == constants.Running {
task.StartTime = &now task.StartTime = &now
} }
if task.Status == constants.Failed || task.Status == constants.Succeeded {
task.EndTime = &now
}
gorm.Updates(&task) gorm.Updates(&task)
} }
} }
func removeRepeatedElement(arr []string) (newArr []string) { func removeRepeatedElement(arr []string) (newArr []string) {

View File

@ -938,184 +938,6 @@ type ListResult struct {
List interface{} `json:"list,omitempty"` List interface{} `json:"list,omitempty"`
} }
// HpcInfo is the JSON representation of one HPC sub-task record. All keys are
// snake_case. Note that CreateTime and UpdateTime are serialized as
// "created_time" and "updated_time".
type HpcInfo struct {
	Id          int64  `json:"id"`           // id
	TaskId      int64  `json:"task_id"`      // task id
	JobId       string `json:"job_id"`       // job id (the job's id in the third-party system)
	AdapterId   int64  `json:"adapter_id"`   // id of the adapter executing the task
	ClusterId   int64  `json:"cluster_id"`   // id of the cluster executing the task
	ClusterType string `json:"cluster_type"` // type of the cluster executing the task
	Name        string `json:"name"`         // name
	Status      string `json:"status"`       // status
	CmdScript   string `json:"cmd_script"`
	StartTime   string `json:"start_time"`   // start time
	RunningTime int64  `json:"running_time"` // running time
	DerivedEs   string `json:"derived_es"`
	Cluster     string `json:"cluster"`
	BlockId     int64  `json:"block_id"`
	AllocNodes  int64  `json:"alloc_nodes"`
	AllocCpu    int64  `json:"alloc_cpu"`
	CardCount   int64  `json:"card_count"` // number of cards
	Version     string `json:"version"`
	Account     string `json:"account"`
	WorkDir     string `json:"work_dir"` // working path
	AssocId     int64  `json:"assoc_id"`
	ExitCode    int64  `json:"exit_code"`
	WallTime    string `json:"wall_time"`  // maximum run time
	Result      string `json:"result"`     // run result
	DeletedAt   string `json:"deleted_at"` // deletion time
	YamlString  string `json:"yaml_string"`
	AppType     string `json:"app_type"`     // application type
	AppName     string `json:"app_name"`     // application name
	Queue       string `json:"queue"`        // queue name
	SubmitType  string `json:"submit_type"`  // cmd (command-line mode)
	NNode       string `json:"n_node"`       // number of nodes; when set, GAP_NODE_STRING must be ""
	StdOutFile  string `json:"std_out_file"` // working path/std.err.%j (sic; NOTE(review): presumably std.out.%j — confirm)
	StdErrFile  string `json:"std_err_file"` // working path/std.err.%j
	StdInput    string `json:"std_input"`
	Environment string `json:"environment"`
	DeletedFlag int64  `json:"deleted_flag"` // whether deleted (0 = no, 1 = yes)
	CreatedBy   int64  `json:"created_by"`   // creator
	CreateTime  string `json:"created_time"` // creation time
	UpdatedBy   int64  `json:"updated_by"`   // last updater
	UpdateTime  string `json:"updated_time"` // last update time
}
// CloudInfo is the JSON representation of one cloud sub-task record. Every
// field is tagged omitempty, so zero-valued fields are left out when encoding.
type CloudInfo struct {
	Participant int64  `json:"participant,omitempty"`
	Id          int64  `json:"id,omitempty"`
	TaskId      int64  `json:"taskId,omitempty"`
	ApiVersion  string `json:"apiVersion,omitempty"`
	Kind        string `json:"kind,omitempty"`
	Namespace   string `json:"namespace,omitempty"`
	Name        string `json:"name,omitempty"`
	Status      string `json:"status,omitempty"`
	StartTime   string `json:"startTime,omitempty"`
	RunningTime int64  `json:"runningTime,omitempty"`
	Result      string `json:"result,omitempty"`
	YamlString  string `json:"yamlString,omitempty"`
}
// AiInfo is the JSON representation of one AI sub-task record. Every field is
// tagged omitempty. Keys are camelCase except ProjectId, which is serialized
// as "project_id".
type AiInfo struct {
	ParticipantId  int64  `json:"participantId,omitempty"`
	TaskId         int64  `json:"taskId,omitempty"`
	ProjectId      string `json:"project_id,omitempty"`
	Name           string `json:"name,omitempty"`
	Status         string `json:"status,omitempty"`
	StartTime      string `json:"startTime,omitempty"`
	RunningTime    int64  `json:"runningTime,omitempty"`
	Result         string `json:"result,omitempty"`
	JobId          string `json:"jobId,omitempty"`
	CreateTime     string `json:"createTime,omitempty"`
	ImageUrl       string `json:"imageUrl,omitempty"`
	Command        string `json:"command,omitempty"`
	FlavorId       string `json:"flavorId,omitempty"`
	SubscriptionId string `json:"subscriptionId,omitempty"`
	ItemVersionId  string `json:"itemVersionId,omitempty"`
}
// VmInfo is the JSON representation of one virtual-machine sub-task record.
// Every field is tagged omitempty. ParticipantId and TaskId use camelCase
// keys; the remaining multi-word keys are snake_case.
type VmInfo struct {
	ParticipantId       int64  `json:"participantId,omitempty"`
	TaskId              int64  `json:"taskId,omitempty"`
	Name                string `json:"name,omitempty"`
	FlavorRef           string `json:"flavor_ref,omitempty"`
	ImageRef            string `json:"image_ref,omitempty"`
	NetworkUuid         string `json:"network_uuid,omitempty"`
	BlockUuid           string `json:"block_uuid,omitempty"`
	SourceType          string `json:"source_type,omitempty"`
	DeleteOnTermination bool   `json:"delete_on_termination,omitempty"`
	Status              string `json:"status,omitempty"`
	MinCount            string `json:"min_count,omitempty"`
	Platform            string `json:"platform,omitempty"`
	Uuid                string `json:"uuid,omitempty"`
}
// PullTaskInfoReq is the request for pulling task info. AdapterId is bound
// from the "adapterId" form parameter.
type PullTaskInfoReq struct {
	AdapterId int64 `form:"adapterId"`
}
// PullTaskInfoResp is the response to a task-info pull: one list per sub-task
// kind. Keys are PascalCase and each list is omitted from the JSON when empty.
type PullTaskInfoResp struct {
	HpcInfoList   []*HpcInfo   `json:"HpcInfoList,omitempty"`
	CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"`
	AiInfoList    []*AiInfo    `json:"AiInfoList,omitempty"`
	VmInfoList    []*VmInfo    `json:"VmInfoList,omitempty"`
}
// PushTaskInfoReq is the request for pushing task info for one adapter: the
// adapter id plus one list per sub-task kind. Keys are camelCase and the lists
// are always emitted (no omitempty).
type PushTaskInfoReq struct {
	AdapterId     int64        `json:"adapterId"`
	HpcInfoList   []*HpcInfo   `json:"hpcInfoList"`
	CloudInfoList []*CloudInfo `json:"cloudInfoList"`
	AiInfoList    []*AiInfo    `json:"aiInfoList"`
	VmInfoList    []*VmInfo    `json:"vmInfoList"`
}
// PushTaskInfoResp is the response to a task-info push: a numeric code and a
// message.
type PushTaskInfoResp struct {
	Code int64  `json:"code"`
	Msg  string `json:"msg"`
}
// PushResourceInfoReq is the request for pushing resource statistics for one
// adapter: the adapter id and a list of ResourceStats values.
type PushResourceInfoReq struct {
	AdapterId     int64           `json:"adapterId"`
	ResourceStats []ResourceStats `json:"resourceStats"`
}
// PushResourceInfoResp is the response to a resource-info push: a numeric
// code and a message.
type PushResourceInfoResp struct {
	Code int64  `json:"code"`
	Msg  string `json:"msg"`
}
// NoticeInfo is the JSON representation of one notice: the adapter and cluster
// it concerns (by id and name), the notice type, the task name and the
// incident text.
type NoticeInfo struct {
	AdapterId   int64  `json:"adapterId"`
	AdapterName string `json:"adapterName"`
	ClusterId   int64  `json:"clusterId"`
	ClusterName string `json:"clusterName"`
	NoticeType  string `json:"noticeType"`
	TaskName    string `json:"taskName"`
	Incident    string `json:"incident"`
}
// ListNoticeReq is the request for listing notices; it carries no fields.
type ListNoticeReq struct{}
// ListNoticeResp is the response to a notice listing: a numeric code, a
// message and the notices themselves under "data".
type ListNoticeResp struct {
	Code int64        `json:"code"`
	Msg  string       `json:"msg"`
	Data []NoticeInfo `json:"data"`
}
// PushNoticeReq is the request for pushing a single notice, carried under the
// "noticeInfo" key.
type PushNoticeReq struct {
	NoticeInfo NoticeInfo `json:"noticeInfo"`
}
// PushNoticeResp is the response to a notice push: a numeric code and a
// message.
type PushNoticeResp struct {
	Code int64  `json:"code"`
	Msg  string `json:"msg"`
}
// ResourceStats is the JSON representation of one cluster's resource figures:
// available/total CPU cores, memory and disk, available GPUs and cards, plus
// CPU core hours and balance. Units are not stated in this declaration.
type ResourceStats struct {
	ClusterId    int64   `json:"clusterId"`
	Name         string  `json:"name"`
	CpuCoreAvail int64   `json:"cpuCoreAvail"`
	CpuCoreTotal int64   `json:"cpuCoreTotal"`
	MemAvail     float64 `json:"memAvail"`
	MemTotal     float64 `json:"memTotal"`
	DiskAvail    float64 `json:"diskAvail"`
	DiskTotal    float64 `json:"diskTotal"`
	GpuAvail     int64   `json:"gpuAvail"`
	CardsAvail   []*Card `json:"cardsAvail"`
	CpuCoreHours float64 `json:"cpuCoreHours"`
	Balance      float64 `json:"balance"`
}
// Card is the JSON representation of one accelerator card entry: platform,
// type, name, a TOpsAtFp16 figure, card hours and card count. Keys are
// camelCase except "TOpsAtFp16", which keeps its leading capital.
type Card struct {
	Platform   string  `json:"platform"`
	Type       string  `json:"type"`
	Name       string  `json:"name"`
	TOpsAtFp16 float64 `json:"TOpsAtFp16"`
	CardHours  float64 `json:"cardHours"`
	CardNum    int32   `json:"cardNum"`
}
type TaskStatusResp struct { type TaskStatusResp struct {
Succeeded int `json:"Succeeded"` Succeeded int `json:"Succeeded"`
Failed int `json:"Failed"` Failed int `json:"Failed"`