diff --git a/api/desc/pcm.api b/api/desc/pcm.api
index 2c76692c..12d338a0 100644
--- a/api/desc/pcm.api
+++ b/api/desc/pcm.api
@@ -922,6 +922,9 @@ service pcm {
 	@handler ScheduleGetAlgorithmsHandler
 	get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp)
 
+	@handler ScheduleGetAiJobLogLogHandler
+	get /schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum (AiJobLogReq) returns (AiJobLogResp)
+
 	@handler ScheduleSubmitHandler
 	post /schedule/submit (ScheduleReq) returns (ScheduleResp)
 }
diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api
index 3eccf3e5..02783746 100644
--- a/api/desc/schedule/pcm-schedule.api
+++ b/api/desc/schedule/pcm-schedule.api
@@ -70,4 +70,15 @@ type (
 	AiAlgorithmsResp {
 		Algorithms []string `json:"algorithms"`
 	}
+
+	AiJobLogReq {
+		AdapterId   string `path:"adapterId"`
+		ClusterId   string `path:"clusterId"`
+		TaskId      string `path:"taskId"`
+		InstanceNum string `path:"instanceNum"`
+	}
+
+	AiJobLogResp {
+		Log string `json:"log"`
+	}
 )
\ No newline at end of file
diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go
index a68efc58..5402a03d 100644
--- a/api/internal/handler/routes.go
+++ b/api/internal/handler/routes.go
@@ -1145,6 +1145,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
 				Path:    "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset",
 				Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx),
 			},
+			{
+				Method:  http.MethodGet,
+				Path:    "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum",
+				Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx),
+			},
 			{
 				Method:  http.MethodPost,
 				Path:    "/schedule/submit",
diff --git a/api/internal/handler/schedule/schedulegetaijoblogloghandler.go b/api/internal/handler/schedule/schedulegetaijoblogloghandler.go
new file mode 100644
index 00000000..9eecc66d
--- /dev/null
+++ b/api/internal/handler/schedule/schedulegetaijoblogloghandler.go
@@ -0,0 +1,31 @@
+package schedule
+
+import (
+	"net/http"
+
+	"github.com/zeromicro/go-zero/rest/httpx"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
+)
+
+// ScheduleGetAiJobLogLogHandler parses the request into an AiJobLogReq, runs
+// the job-log logic and writes either the resulting AiJobLogResp as JSON or
+// the error as the HTTP response.
+func ScheduleGetAiJobLogLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		var req types.AiJobLogReq
+		if err := httpx.Parse(r, &req); err != nil {
+			httpx.ErrorCtx(r.Context(), w, err)
+			return
+		}
+
+		l := schedule.NewScheduleGetAiJobLogLogLogic(r.Context(), svcCtx)
+		resp, err := l.ScheduleGetAiJobLogLog(&req)
+		if err != nil {
+			httpx.ErrorCtx(r.Context(), w, err)
+		} else {
+			httpx.OkJsonCtx(r.Context(), w, resp)
+		}
+	}
+}
diff --git a/api/internal/logic/schedule/schedulegetaijoblogloglogic.go b/api/internal/logic/schedule/schedulegetaijoblogloglogic.go
new file mode 100644
index 00000000..da5a0c7a
--- /dev/null
+++ b/api/internal/logic/schedule/schedulegetaijoblogloglogic.go
@@ -0,0 +1,49 @@
+package schedule
+
+import (
+	"context"
+	"fmt"
+
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
+	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
+
+	"github.com/zeromicro/go-zero/core/logx"
+)
+
+// ScheduleGetAiJobLogLogLogic holds the request context and service
+// dependencies used to fetch the log of an AI training job.
+type ScheduleGetAiJobLogLogLogic struct {
+	logx.Logger
+	ctx    context.Context
+	svcCtx *svc.ServiceContext
+}
+
+// NewScheduleGetAiJobLogLogLogic returns a logic object bound to ctx and svcCtx.
+func NewScheduleGetAiJobLogLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleGetAiJobLogLogLogic {
+	return &ScheduleGetAiJobLogLogLogic{
+		Logger: logx.WithContext(ctx),
+		ctx:    ctx,
+		svcCtx: svcCtx,
+	}
+}
+
+// ScheduleGetAiJobLogLog returns the training log of req.TaskId, fetched from
+// the collector registered under req.AdapterId and req.ClusterId. It returns
+// an error when no collector is registered for that pair or when the
+// collector itself fails.
+func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) {
+	// Indexing a missing adapter yields a nil inner map, and indexing that
+	// yields the zero value, so one check covers both lookups. Calling a
+	// method on the resulting nil collector would panic.
+	aiCollector, ok := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId]
+	if !ok || aiCollector == nil {
+		return nil, fmt.Errorf("no ai collector registered for adapter %q cluster %q", req.AdapterId, req.ClusterId)
+	}
+
+	log, err := aiCollector.GetTrainingTaskLog(l.ctx, req.TaskId, req.InstanceNum)
+	if err != nil {
+		return nil, err
+	}
+
+	return &types.AiJobLogResp{Log: log}, nil
+}
diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go
index a20b1d36..e313baff 100644
--- a/api/internal/scheduler/service/collector/collector.go
+++ b/api/internal/scheduler/service/collector/collector.go
@@ -6,6 +6,8 @@ type AiCollector interface {
 	GetResourceStats(ctx context.Context) (*ResourceStats, error)
 	GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error)
 	GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
+	// GetTrainingTaskLog returns the log of the given training task instance.
+	GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
 }
 
 type ResourceStats struct {
diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go
index 6dffc2ec..5a9bed87 100644
--- a/api/internal/storeLink/modelarts.go
+++ b/api/internal/storeLink/modelarts.go
@@ -162,6 +162,12 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
 	return nil, nil
 }
 
+// GetTrainingTaskLog is not implemented for ModelArts yet; it always returns
+// an empty log and a nil error.
+func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
+	return "", nil
+}
+
 func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
 	err := m.GenerateSubmitParams(ctx, option)
 	if err != nil {
diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go
index b643c1d6..e052f637 100644
--- a/api/internal/storeLink/octopus.go
+++ b/api/internal/storeLink/octopus.go
@@ -337,6 +337,30 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
 	return algorithms, nil
 }
 
+// GetTrainingTaskLog returns the log content of a training job instance on
+// the Octopus platform. instanceNum must be a base-10 integer that fits in
+// an int32; otherwise the strconv.ParseInt error is returned.
+func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
+	instance, err := strconv.ParseInt(instanceNum, 10, 32)
+	if err != nil {
+		return "", err
+	}
+	// NOTE(review): TaskNum is hard-coded to "task0" — confirm that jobs with
+	// several sub-tasks do not need another value here.
+	req := &octopus.GetTrainJobLogReq{
+		Platform: o.platform,
+		TaskId:   taskId,
+		TaskNum:  "task0",
+		Num:      int32(instance),
+	}
+	resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
+	if err != nil {
+		return "", err
+	}
+
+	return resp.Content, nil
+}
+
 func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
 	err := o.GenerateSubmitParams(ctx, option)
 	if err != nil {
diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go
index 53f9bf52..b07c3401 100644
--- a/api/internal/storeLink/shuguangai.go
+++ b/api/internal/storeLink/shuguangai.go
@@ -447,6 +447,29 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
 	return algorithms, nil
 }
 
+// GetTrainingTaskLog returns the log content of one instance of a training
+// task on the Shuguang AI platform. A response code other than "0" is
+// reported as an error carrying the response message.
+func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
+	// NOTE(review): LineCount 1000 with StartLineNum -1 presumably requests
+	// the latest 1000 lines — confirm against the pcm-ac API.
+	req := &hpcAC.GetInstanceLogReq{
+		TaskId:       taskId,
+		InstanceNum:  instanceNum,
+		LineCount:    1000,
+		StartLineNum: -1,
+	}
+	resp, err := s.aCRpc.GetInstanceLog(ctx, req)
+	if err != nil {
+		return "", err
+	}
+	if resp.Code != "0" {
+		return "", errors.New(resp.Msg)
+	}
+
+	return resp.Data.Content, nil
+}
+
 func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
 	err := s.GenerateSubmitParams(ctx, option)
 	if err != nil {
diff --git a/api/internal/svc/servicecontext.go b/api/internal/svc/servicecontext.go
index 0c73eb17..1aca2ac4 100644
--- a/api/internal/svc/servicecontext.go
+++ b/api/internal/svc/servicecontext.go
@@ -117,10 +117,6 @@ func NewServiceContext(c config.Config) *ServiceContext {
 	})
 
 	// scheduler
-	//octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf))
-	//aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf))
-	//modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf))
-	//modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf))
 	storage := &database.AiStorage{DbEngin: dbEngin}
 	aiService, err := service.NewAiService(&c, storage)
 	if err != nil {
diff --git a/api/internal/types/types.go b/api/internal/types/types.go
index 59dd185d..52164d6e 100644
--- a/api/internal/types/types.go
+++ b/api/internal/types/types.go
@@ -5535,6 +5535,17 @@ type AiAlgorithmsResp struct {
 	Algorithms []string `json:"algorithms"`
 }
 
+type AiJobLogReq struct {
+	AdapterId   string `path:"adapterId"`
+	ClusterId   string `path:"clusterId"`
+	TaskId      string `path:"taskId"`
+	InstanceNum string `path:"instanceNum"`
+}
+
+type AiJobLogResp struct {
+	Log string `json:"log"`
+}
+
 type CreateAlertRuleReq struct {
 	CLusterId string `json:"clusterId"`
 	ClusterName string `json:"clusterName"`
diff --git a/go.mod b/go.mod
index 03670445..a46abb60 100644
--- a/go.mod
+++ b/go.mod
@@ -24,9 +24,9 @@ require (
 	github.com/robfig/cron/v3 v3.0.1
 	github.com/rs/zerolog v1.28.0
 	github.com/zeromicro/go-zero v1.6.3
-	gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8
+	gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb
 	gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c
-	gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8
+	gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142
 	gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
 	gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
 	gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d
diff --git a/go.sum b/go.sum
index cdd78495..6f886672 100644
--- a/go.sum
+++ b/go.sum
@@ -1078,12 +1078,12 @@ github.com/yuin/gopher-lua v1.1.0/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7
 github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs=
 github.com/zeromicro/go-zero v1.6.3 h1:OL0NnHD5LdRNDolfcK9vUkJt7K8TcBE3RkzfM8poOVw=
 github.com/zeromicro/go-zero v1.6.3/go.mod h1:XZL435ZxVi9MSXXtw2MRQhHgx6OoX3++MRMOE9xU70c=
-gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8 h1:cX6U2gUcp/sIP3TKFv4q/1O8gp10q+M3k5Ql15yaEMI=
-gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
+gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb h1:k6mNEWKp+haQUaK2dWs/rI9OKgzJHY1/9KNKuBDN0Vw=
+gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
 gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18=
 gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw=
-gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8 h1:jdwYydJxYPlfIS9yZvnNX1w08aJGYWq5ADD5EXLW3+Q=
-gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
+gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8=
+gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=