Merge pull request 'core task status sync' (#150) from zhouqunjie/pcm-coordinator:master into master
Former-commit-id: 46158af2b239eada0fae6449043698d6c1505e00
This commit is contained in:
commit
004da905ae
|
@ -5,6 +5,20 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// HpcStatusMapping groups raw HPC job states under the coordinator-level
// task status they stand for. Keys are the coordinator status names
// ("Running", "Succeeded", "Failed"); each value lists the HPC job states
// that count as that status.
//
// NOTE(review): the state names look like Slurm job state codes — confirm
// against the HPC adapter. The previous "Running" list contained "RUNNING"
// twice; the redundant entry is removed here. If the second entry was meant
// to be a different state, add it explicitly.
var HpcStatusMapping = map[string][]string{
	"Running":   {"RUNNING", "CONFIGURING", "COMPLETING"},
	"Succeeded": {"COMPLETED"},
	"Failed":    {"FAILED", "TIMEOUT", "DEADLINE", "OUT_OF_MEMORY", "BOOT_FAIL", "CANCELLED"},
}
|
||||||
|
|
||||||
|
// AiStatusMapping is a string-to-string status lookup for AI tasks.
// The only entry defined here maps "PENDING" to "Running".
// NOTE(review): presumably key = raw AI job state, value = coordinator task
// status — confirm against the callers.
var AiStatusMapping = map[string]string{
|
||||||
|
"PENDING": "Running",
|
||||||
|
}
|
||||||
|
|
||||||
|
// CloudStatusMapping is a string-to-string status lookup for cloud tasks.
// The only entry defined here maps "PENDING" to "Running".
// NOTE(review): presumably key = raw cloud job state, value = coordinator
// task status — confirm against the callers.
var CloudStatusMapping = map[string]string{
|
||||||
|
"PENDING": "Running",
|
||||||
|
}
|
||||||
|
|
||||||
// PullTaskInfoReq is the request for pulling task info; AdapterId is bound
// from the "adapterId" form parameter.
type PullTaskInfoReq struct {
|
type PullTaskInfoReq struct {
|
||||||
AdapterId int64 `form:"adapterId"`
|
AdapterId int64 `form:"adapterId"`
|
||||||
}
|
}
|
||||||
|
|
|
@ -73,20 +73,34 @@ func syncTask(gorm *gorm.DB, taskId int64) {
|
||||||
if tx.Error != nil {
|
if tx.Error != nil {
|
||||||
logx.Error(tx.Error)
|
logx.Error(tx.Error)
|
||||||
}
|
}
|
||||||
// 子状态统一则修改主任务状态
|
|
||||||
statusArray := strings.Split(allStatus, ",")
|
for pcmStatus, hpcStatus := range clientCore.HpcStatusMapping {
|
||||||
if len(removeRepeatedElement(statusArray)) == 1 {
|
for _, status := range hpcStatus {
|
||||||
updateTask(gorm, taskId, statusArray[0])
|
// if Failed type status appears in subTask then update mainTask to Failed
|
||||||
}
|
if pcmStatus == "Failed" && strings.Contains(allStatus, status) {
|
||||||
// 子任务包含失败状态 主任务则失败
|
|
||||||
if strings.Contains(allStatus, constants.Failed) {
|
|
||||||
updateTask(gorm, taskId, constants.Failed)
|
updateTask(gorm, taskId, constants.Failed)
|
||||||
|
return
|
||||||
|
// no Failed type status in subTask,if Saved type status appears in subTask then update mainTask to Saved
|
||||||
|
} else if pcmStatus == "Saved" {
|
||||||
|
if strings.Contains(allStatus, status) {
|
||||||
|
updateTask(gorm, taskId, constants.Saved)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
if strings.Contains(allStatus, constants.Running) {
|
// no Failed and Saved type status in subTask,if Running type status appears in subTask then update mainTask to Running
|
||||||
|
} else if pcmStatus == "Running" {
|
||||||
|
if strings.Contains(allStatus, status) {
|
||||||
updateTask(gorm, taskId, constants.Running)
|
updateTask(gorm, taskId, constants.Running)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// at last, mainTask should be succeeded
|
||||||
|
} else {
|
||||||
|
if strings.Contains(allStatus, status) {
|
||||||
|
updateTask(gorm, taskId, constants.Succeeded)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func updateTask(gorm *gorm.DB, taskId int64, status string) {
|
func updateTask(gorm *gorm.DB, taskId int64, status string) {
|
||||||
|
@ -98,8 +112,12 @@ func updateTask(gorm *gorm.DB, taskId int64, status string) {
|
||||||
if status == constants.Running {
|
if status == constants.Running {
|
||||||
task.StartTime = &now
|
task.StartTime = &now
|
||||||
}
|
}
|
||||||
|
if task.Status == constants.Failed || task.Status == constants.Succeeded {
|
||||||
|
task.EndTime = &now
|
||||||
|
}
|
||||||
gorm.Updates(&task)
|
gorm.Updates(&task)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func removeRepeatedElement(arr []string) (newArr []string) {
|
func removeRepeatedElement(arr []string) (newArr []string) {
|
||||||
|
|
|
@ -938,184 +938,6 @@ type ListResult struct {
|
||||||
List interface{} `json:"list,omitempty"`
|
List interface{} `json:"list,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// HpcInfo is the JSON representation of an HPC job record attached to a task.
// Field names are serialized in snake_case via the json tags below.
type HpcInfo struct {
|
|
||||||
Id int64 `json:"id"` // id
|
|
||||||
TaskId int64 `json:"task_id"` // task id
|
|
||||||
JobId string `json:"job_id"` // job id (the job's id in the third-party system)
|
|
||||||
AdapterId int64 `json:"adapter_id"` // id of the adapter executing the task
|
|
||||||
ClusterId int64 `json:"cluster_id"` // id of the cluster executing the task
|
|
||||||
ClusterType string `json:"cluster_type"` // type of the cluster executing the task
|
|
||||||
Name string `json:"name"` // name
|
|
||||||
Status string `json:"status"` // status
|
|
||||||
CmdScript string `json:"cmd_script"`
|
|
||||||
StartTime string `json:"start_time"` // start time
|
|
||||||
RunningTime int64 `json:"running_time"` // running time
|
|
||||||
DerivedEs string `json:"derived_es"`
|
|
||||||
Cluster string `json:"cluster"`
|
|
||||||
BlockId int64 `json:"block_id"`
|
|
||||||
AllocNodes int64 `json:"alloc_nodes"`
|
|
||||||
AllocCpu int64 `json:"alloc_cpu"`
|
|
||||||
CardCount int64 `json:"card_count"` // number of cards
|
|
||||||
Version string `json:"version"`
|
|
||||||
Account string `json:"account"`
|
|
||||||
WorkDir string `json:"work_dir"` // working directory
|
|
||||||
AssocId int64 `json:"assoc_id"`
|
|
||||||
ExitCode int64 `json:"exit_code"`
|
|
||||||
WallTime string `json:"wall_time"` // maximum run time
|
|
||||||
Result string `json:"result"` // run result
|
|
||||||
DeletedAt string `json:"deleted_at"` // deletion time
|
|
||||||
YamlString string `json:"yaml_string"`
|
|
||||||
AppType string `json:"app_type"` // application type
|
|
||||||
AppName string `json:"app_name"` // application name
|
|
||||||
Queue string `json:"queue"` // queue name
|
|
||||||
SubmitType string `json:"submit_type"` // cmd (command-line mode)
|
|
||||||
NNode string `json:"n_node"` // number of nodes (when this parameter is set, GAP_NODE_STRING must be "")
|
|
||||||
StdOutFile string `json:"std_out_file"` // working directory/std.err.%j (NOTE(review): says std.err for the stdout file, possibly meant std.out; confirm)
|
|
||||||
StdErrFile string `json:"std_err_file"` // working directory/std.err.%j
|
|
||||||
StdInput string `json:"std_input"`
|
|
||||||
Environment string `json:"environment"`
|
|
||||||
DeletedFlag int64 `json:"deleted_flag"` // deleted flag (0 = no, 1 = yes)
|
|
||||||
CreatedBy int64 `json:"created_by"` // creator
|
|
||||||
CreateTime string `json:"created_time"` // creation time
|
|
||||||
UpdatedBy int64 `json:"updated_by"` // last updater
|
|
||||||
UpdateTime string `json:"updated_time"` // last update time
|
|
||||||
}
|
|
||||||
|
|
||||||
// CloudInfo is the JSON representation of a cloud workload record attached to
// a task. Every field is tagged omitempty, so zero values are left out of the
// encoded JSON.
type CloudInfo struct {
|
|
||||||
Participant int64 `json:"participant,omitempty"`
|
|
||||||
Id int64 `json:"id,omitempty"`
|
|
||||||
TaskId int64 `json:"taskId,omitempty"`
|
|
||||||
ApiVersion string `json:"apiVersion,omitempty"`
|
|
||||||
Kind string `json:"kind,omitempty"`
|
|
||||||
Namespace string `json:"namespace,omitempty"`
|
|
||||||
Name string `json:"name,omitempty"`
|
|
||||||
Status string `json:"status,omitempty"`
|
|
||||||
StartTime string `json:"startTime,omitempty"`
|
|
||||||
RunningTime int64 `json:"runningTime,omitempty"`
|
|
||||||
Result string `json:"result,omitempty"`
|
|
||||||
YamlString string `json:"yamlString,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// AiInfo is the JSON representation of an AI job record attached to a task.
// Every field is tagged omitempty. Note that ProjectId is serialized as
// "project_id" while the remaining fields use camelCase names.
type AiInfo struct {
|
|
||||||
ParticipantId int64 `json:"participantId,omitempty"`
|
|
||||||
TaskId int64 `json:"taskId,omitempty"`
|
|
||||||
ProjectId string `json:"project_id,omitempty"`
|
|
||||||
Name string `json:"name,omitempty"`
|
|
||||||
Status string `json:"status,omitempty"`
|
|
||||||
StartTime string `json:"startTime,omitempty"`
|
|
||||||
RunningTime int64 `json:"runningTime,omitempty"`
|
|
||||||
Result string `json:"result,omitempty"`
|
|
||||||
JobId string `json:"jobId,omitempty"`
|
|
||||||
CreateTime string `json:"createTime,omitempty"`
|
|
||||||
ImageUrl string `json:"imageUrl,omitempty"`
|
|
||||||
Command string `json:"command,omitempty"`
|
|
||||||
FlavorId string `json:"flavorId,omitempty"`
|
|
||||||
SubscriptionId string `json:"subscriptionId,omitempty"`
|
|
||||||
ItemVersionId string `json:"itemVersionId,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// VmInfo is the JSON representation of a virtual machine record attached to a
// task. Every field is tagged omitempty; the identifier fields use camelCase
// names while the VM attributes use snake_case names.
type VmInfo struct {
|
|
||||||
ParticipantId int64 `json:"participantId,omitempty"`
|
|
||||||
TaskId int64 `json:"taskId,omitempty"`
|
|
||||||
Name string `json:"name,omitempty"`
|
|
||||||
FlavorRef string `json:"flavor_ref,omitempty"`
|
|
||||||
ImageRef string `json:"image_ref,omitempty"`
|
|
||||||
NetworkUuid string `json:"network_uuid,omitempty"`
|
|
||||||
BlockUuid string `json:"block_uuid,omitempty"`
|
|
||||||
SourceType string `json:"source_type,omitempty"`
|
|
||||||
DeleteOnTermination bool `json:"delete_on_termination,omitempty"`
|
|
||||||
Status string `json:"status,omitempty"`
|
|
||||||
MinCount string `json:"min_count,omitempty"`
|
|
||||||
Platform string `json:"platform,omitempty"`
|
|
||||||
Uuid string `json:"uuid,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PullTaskInfoReq is the request for pulling task info; AdapterId is bound
// from the "adapterId" form parameter.
type PullTaskInfoReq struct {
|
|
||||||
AdapterId int64 `form:"adapterId"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PullTaskInfoResp carries the per-kind task info lists (HPC, cloud, AI, VM).
// Each list is tagged omitempty, and the JSON keys start with an upper-case
// letter ("HpcInfoList", ...).
type PullTaskInfoResp struct {
|
|
||||||
HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"`
|
|
||||||
CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"`
|
|
||||||
AiInfoList []*AiInfo `json:"AiInfoList,omitempty"`
|
|
||||||
VmInfoList []*VmInfo `json:"VmInfoList,omitempty"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushTaskInfoReq is the JSON request for pushing task info: an adapter id
// plus per-kind task info lists (HPC, cloud, AI, VM). The JSON keys here are
// lower camelCase ("hpcInfoList", ...).
type PushTaskInfoReq struct {
|
|
||||||
AdapterId int64 `json:"adapterId"`
|
|
||||||
HpcInfoList []*HpcInfo `json:"hpcInfoList"`
|
|
||||||
CloudInfoList []*CloudInfo `json:"cloudInfoList"`
|
|
||||||
AiInfoList []*AiInfo `json:"aiInfoList"`
|
|
||||||
VmInfoList []*VmInfo `json:"vmInfoList"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushTaskInfoResp is the JSON response to a task info push: a numeric code
// and a message.
type PushTaskInfoResp struct {
|
|
||||||
Code int64 `json:"code"`
|
|
||||||
Msg string `json:"msg"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushResourceInfoReq is the JSON request for pushing resource statistics:
// an adapter id plus a list of ResourceStats values.
type PushResourceInfoReq struct {
|
|
||||||
AdapterId int64 `json:"adapterId"`
|
|
||||||
ResourceStats []ResourceStats `json:"resourceStats"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushResourceInfoResp is the JSON response to a resource info push: a
// numeric code and a message.
type PushResourceInfoResp struct {
|
|
||||||
Code int64 `json:"code"`
|
|
||||||
Msg string `json:"msg"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// NoticeInfo is the JSON representation of a notice: the adapter and cluster
// it refers to (id and name each), the notice type, the task name and the
// incident text.
type NoticeInfo struct {
|
|
||||||
AdapterId int64 `json:"adapterId"`
|
|
||||||
AdapterName string `json:"adapterName"`
|
|
||||||
ClusterId int64 `json:"clusterId"`
|
|
||||||
ClusterName string `json:"clusterName"`
|
|
||||||
NoticeType string `json:"noticeType"`
|
|
||||||
TaskName string `json:"taskName"`
|
|
||||||
Incident string `json:"incident"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// ListNoticeReq is the request for listing notices; it has no fields.
type ListNoticeReq struct {
|
|
||||||
}
|
|
||||||
|
|
||||||
// ListNoticeResp is the JSON response for listing notices: a numeric code, a
// message and the list of NoticeInfo values under "data".
type ListNoticeResp struct {
|
|
||||||
Code int64 `json:"code"`
|
|
||||||
Msg string `json:"msg"`
|
|
||||||
Data []NoticeInfo `json:"data"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushNoticeReq is the JSON request for pushing a single notice, carried
// under the "noticeInfo" key.
type PushNoticeReq struct {
|
|
||||||
NoticeInfo NoticeInfo `json:"noticeInfo"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// PushNoticeResp is the JSON response to a notice push: a numeric code and a
// message.
type PushNoticeResp struct {
|
|
||||||
Code int64 `json:"code"`
|
|
||||||
Msg string `json:"msg"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// ResourceStats is the JSON representation of one cluster's resource figures:
// available/total CPU cores, memory and disk, available GPUs and cards, plus
// CPU core hours and balance.
// NOTE(review): units of the memory/disk/balance values are not visible
// here — confirm with the producer of this data.
type ResourceStats struct {
|
|
||||||
ClusterId int64 `json:"clusterId"`
|
|
||||||
Name string `json:"name"`
|
|
||||||
CpuCoreAvail int64 `json:"cpuCoreAvail"`
|
|
||||||
CpuCoreTotal int64 `json:"cpuCoreTotal"`
|
|
||||||
MemAvail float64 `json:"memAvail"`
|
|
||||||
MemTotal float64 `json:"memTotal"`
|
|
||||||
DiskAvail float64 `json:"diskAvail"`
|
|
||||||
DiskTotal float64 `json:"diskTotal"`
|
|
||||||
GpuAvail int64 `json:"gpuAvail"`
|
|
||||||
CardsAvail []*Card `json:"cardsAvail"`
|
|
||||||
CpuCoreHours float64 `json:"cpuCoreHours"`
|
|
||||||
Balance float64 `json:"balance"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// Card is the JSON representation of an accelerator card entry: platform,
// type, name, a TOpsAtFp16 figure, card hours and card count.
// NOTE(review): TOpsAtFp16 keeps its upper-case JSON key ("TOpsAtFp16"),
// unlike the other lower camelCase keys — confirm this is intentional.
type Card struct {
|
|
||||||
Platform string `json:"platform"`
|
|
||||||
Type string `json:"type"`
|
|
||||||
Name string `json:"name"`
|
|
||||||
TOpsAtFp16 float64 `json:"TOpsAtFp16"`
|
|
||||||
CardHours float64 `json:"cardHours"`
|
|
||||||
CardNum int32 `json:"cardNum"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type TaskStatusResp struct {
|
type TaskStatusResp struct {
|
||||||
Succeeded int `json:"Succeeded"`
|
Succeeded int `json:"Succeeded"`
|
||||||
Failed int `json:"Failed"`
|
Failed int `json:"Failed"`
|
||||||
|
|
Loading…
Reference in New Issue