diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 2c76692c..cf485b54 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -922,6 +922,9 @@ service pcm { @handler ScheduleGetAlgorithmsHandler get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp) + @handler ScheduleGetAiJobLogLogHandler + get /schedule/ai/getJobLog/:taskId/:instanceNum (AiJobLogReq) returns (AiJobLogResp) + @handler ScheduleSubmitHandler post /schedule/submit (ScheduleReq) returns (ScheduleResp) } diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index 3eccf3e5..d3d61990 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -70,4 +70,13 @@ type ( AiAlgorithmsResp { Algorithms []string `json:"algorithms"` } + + AiJobLogReq { + TaskId string `path:"taskId"` + instanceNum string `path:"instanceNum"` + } + + AiJobLogResp { + Log string `json:"log"` + } ) \ No newline at end of file diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index a20b1d36..e313baff 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -6,6 +6,7 @@ type AiCollector interface { GetResourceStats(ctx context.Context) (*ResourceStats, error) GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error) GetAlgorithms(ctx context.Context) ([]*Algorithm, error) + GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) } type ResourceStats struct { diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 6dffc2ec..5a9bed87 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -162,6 +162,10 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit return nil, nil } +func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) { + return "", nil +} + func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := m.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index b643c1d6..e052f637 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -337,6 +337,25 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm return algorithms, nil } +func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) { + instance, err := strconv.ParseInt(instanceNum, 10, 32) + if err != nil { + return "", err + } + req := &octopus.GetTrainJobLogReq{ + Platform: o.platform, + TaskId: taskId, + TaskNum: "task0", + Num: int32(instance), + } + resp, err := o.octopusRpc.GetTrainJobLog(ctx, req) + if err != nil { + return "", err + } + + return resp.Content, nil +} + func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := o.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 53f9bf52..b07c3401 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -447,6 +447,24 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, return algorithms, nil } +func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) { + req := &hpcAC.GetInstanceLogReq{ + TaskId: taskId, + InstanceNum: instanceNum, + LineCount: 1000, + StartLineNum: -1, + } + resp, err := s.aCRpc.GetInstanceLog(ctx, req) + if err != nil { + return "", err + } + if resp.Code != "0" { + return "", errors.New(resp.Msg) + } + + return resp.Data.Content, nil +} + func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := s.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/go.mod b/go.mod index 03670445..a46abb60 100644 --- a/go.mod +++ b/go.mod @@ -24,9 +24,9 @@ require ( github.com/robfig/cron/v3 v3.0.1 github.com/rs/zerolog v1.28.0 github.com/zeromicro/go-zero v1.6.3 - gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8 + gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c - gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8 + gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d diff --git a/go.sum b/go.sum index cdd78495..31133cce 100644 --- a/go.sum +++ b/go.sum @@ -1080,10 +1080,15 @@ github.com/zeromicro/go-zero v1.6.3 h1:OL0NnHD5LdRNDolfcK9vUkJt7K8TcBE3RkzfM8poO github.com/zeromicro/go-zero v1.6.3/go.mod h1:XZL435ZxVi9MSXXtw2MRQhHgx6OoX3++MRMOE9xU70c= gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8 h1:cX6U2gUcp/sIP3TKFv4q/1O8gp10q+M3k5Ql15yaEMI= gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb h1:k6mNEWKp+haQUaK2dWs/rI9OKgzJHY1/9KNKuBDN0Vw= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8 h1:jdwYydJxYPlfIS9yZvnNX1w08aJGYWq5ADD5EXLW3+Q= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240422093351-5b8bac8bb953 h1:45omfrELx1Ealvs20NwFgdubCUnTyiatBAQyUwG6aAk= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240422093351-5b8bac8bb953/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8= gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=