Merge pull request 'added ai getJobLog api' (#123) from tzwang/pcm-coordinator:master into master

Former-commit-id: e599d45c1005440550b4e78c4bf1bbaca7c60ce7
This commit is contained in:
tzwang 2024-04-26 11:10:45 +08:00
commit 14c3af1e1b
13 changed files with 142 additions and 10 deletions

View File

@ -922,6 +922,9 @@ service pcm {
@handler ScheduleGetAlgorithmsHandler
get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp)
@handler ScheduleGetAiJobLogLogHandler
get /schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum (AiJobLogReq) returns (AiJobLogResp)
@handler ScheduleSubmitHandler
post /schedule/submit (ScheduleReq) returns (ScheduleResp)
}

View File

@ -70,4 +70,15 @@ type (
AiAlgorithmsResp {
Algorithms []string `json:"algorithms"`
}
AiJobLogReq {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
TaskId string `path:"taskId"`
instanceNum string `path:"instanceNum"`
}
AiJobLogResp {
Log string `json:"log"`
}
)

View File

@ -1145,6 +1145,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset",
Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum",
Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/submit",

View File

@ -0,0 +1,28 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func ScheduleGetAiJobLogLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.AiJobLogReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := schedule.NewScheduleGetAiJobLogLogLogic(r.Context(), svcCtx)
resp, err := l.ScheduleGetAiJobLogLog(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,36 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type ScheduleGetAiJobLogLogLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewScheduleGetAiJobLogLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleGetAiJobLogLogLogic {
return &ScheduleGetAiJobLogLogLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) {
resp = &types.AiJobLogResp{}
log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetTrainingTaskLog(l.ctx, req.TaskId, req.InstanceNum)
if err != nil {
return nil, err
}
resp.Log = log
return resp, nil
}

View File

@ -6,6 +6,7 @@ type AiCollector interface {
GetResourceStats(ctx context.Context) (*ResourceStats, error)
GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error)
GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
}
type ResourceStats struct {

View File

@ -162,6 +162,10 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
return nil, nil
}
func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
return "", nil
}
func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := m.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -337,6 +337,25 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
return algorithms, nil
}
func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
instance, err := strconv.ParseInt(instanceNum, 10, 32)
if err != nil {
return "", err
}
req := &octopus.GetTrainJobLogReq{
Platform: o.platform,
TaskId: taskId,
TaskNum: "task0",
Num: int32(instance),
}
resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
if err != nil {
return "", err
}
return resp.Content, nil
}
func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := o.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -447,6 +447,24 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
return algorithms, nil
}
func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
req := &hpcAC.GetInstanceLogReq{
TaskId: taskId,
InstanceNum: instanceNum,
LineCount: 1000,
StartLineNum: -1,
}
resp, err := s.aCRpc.GetInstanceLog(ctx, req)
if err != nil {
return "", err
}
if resp.Code != "0" {
return "", errors.New(resp.Msg)
}
return resp.Data.Content, nil
}
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := s.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -117,10 +117,6 @@ func NewServiceContext(c config.Config) *ServiceContext {
})
// scheduler
//octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf))
//aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf))
//modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf))
//modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf))
storage := &database.AiStorage{DbEngin: dbEngin}
aiService, err := service.NewAiService(&c, storage)
if err != nil {

View File

@ -5535,6 +5535,17 @@ type AiAlgorithmsResp struct {
Algorithms []string `json:"algorithms"`
}
type AiJobLogReq struct {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
TaskId string `path:"taskId"`
InstanceNum string `path:"instanceNum"`
}
type AiJobLogResp struct {
Log string `json:"log"`
}
type CreateAlertRuleReq struct {
CLusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`

4
go.mod
View File

@ -24,9 +24,9 @@ require (
github.com/robfig/cron/v3 v3.0.1
github.com/rs/zerolog v1.28.0
github.com/zeromicro/go-zero v1.6.3
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d

8
go.sum
View File

@ -1078,12 +1078,12 @@ github.com/yuin/gopher-lua v1.1.0/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7
github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs=
github.com/zeromicro/go-zero v1.6.3 h1:OL0NnHD5LdRNDolfcK9vUkJt7K8TcBE3RkzfM8poOVw=
github.com/zeromicro/go-zero v1.6.3/go.mod h1:XZL435ZxVi9MSXXtw2MRQhHgx6OoX3++MRMOE9xU70c=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8 h1:cX6U2gUcp/sIP3TKFv4q/1O8gp10q+M3k5Ql15yaEMI=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb h1:k6mNEWKp+haQUaK2dWs/rI9OKgzJHY1/9KNKuBDN0Vw=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8 h1:jdwYydJxYPlfIS9yZvnNX1w08aJGYWq5ADD5EXLW3+Q=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=