Merge pull request 'fix aiOverview timeout bugs' (#145) from tzwang/pcm-coordinator:master into master

Former-commit-id: 38786fb3fe33c433b06ae25214c691d8d7600d78
This commit is contained in:
tzwang 2024-05-09 16:17:40 +08:00
commit fa2c7b1aa9
10 changed files with 125 additions and 14 deletions

View File

@ -966,6 +966,12 @@ service pcm {
@handler ScheduleGetOverviewHandler
post /schedule/getOverview returns (ScheduleOverviewResp)
@handler DownloadAlgothmCodeHandler
get /schedule/getDownloadAlgothmCode (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp)
@handler UploadAlgothmCodeHandler
post /schedule/getDownloadAlgothmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp)
}
@server(

View File

@ -100,4 +100,33 @@ type (
StartTime string `json:"startTime,omitempty" db:"start_time"`
EndTime string `json:"endTime,omitempty" db:"end_time"`
}
// DownloadAlgorithmCodeReq carries the parameters of a download-algorithm-code
// request. Every field is bound through a `form` tag.
DownloadAlgorithmCodeReq {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
ResourceType string `form:"resourceType"`
Card string `form:"card"`
TaskType string `form:"taskType"`
Dataset string `form:"dataset"`
Algorithm string `form:"algorithm"`
// NOTE(review): a download request presumably does not need to send code — confirm whether this field is used.
Code string `form:"code"`
}
// DownloadAlgorithmCodeResp is the response of a download-algorithm-code request.
DownloadAlgorithmCodeResp {
// Code is serialized under the JSON key "algorithms".
// NOTE(review): the key does not match the field name — confirm this is intended.
Code string `json:"algorithms"`
}
// UploadAlgorithmCodeReq carries the parameters of an upload-algorithm-code
// request. Every field is bound through a `json` tag.
UploadAlgorithmCodeReq {
AdapterId string `json:"adapterId"`
ClusterId string `json:"clusterId"`
ResourceType string `json:"resourceType"`
Card string `json:"card"`
TaskType string `json:"taskType"`
Dataset string `json:"dataset"`
Algorithm string `json:"algorithm"`
Code string `json:"code"`
}
// UploadAlgorithmCodeResp is the response of an upload-algorithm-code request.
// It declares no fields.
UploadAlgorithmCodeResp {
}
)

View File

@ -1190,6 +1190,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset",
Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum",
Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/submit",
@ -1200,6 +1205,16 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/getOverview",
Handler: schedule.ScheduleGetOverviewHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/schedule/getDownloadAlgothmCode",
Handler: schedule.DownloadAlgothmCodeHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/getDownloadAlgothmCode",
Handler: schedule.UploadAlgothmCodeHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)
@ -1294,7 +1309,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
},
{
Method: http.MethodPost,
Path: "/core/syncClusterAlert",
Path: "/monitoring/syncClusterAlert",
Handler: monitoring.SyncClusterAlertHandler(serverCtx),
},
{

View File

@ -8,6 +8,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"strconv"
"sync"
"time"
)
type GetCenterOverviewLogic struct {
@ -71,9 +72,14 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview
}
resp.CardNum = cardNum
resp.PowerInTops = totalTops
<-ch
return resp, nil
select {
case _ = <-ch:
return resp, nil
case <-time.After(2 * time.Second):
return resp, nil
}
}
func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {

View File

@ -73,9 +73,14 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
resp.List = append(resp.List, t)
}
}
<-ch
return resp, nil
select {
case _ = <-ch:
return resp, nil
case <-time.After(1 * time.Second):
return resp, nil
}
}
func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {

View File

@ -8,8 +8,8 @@ type AiCollector interface {
GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
GetTrainingTask(ctx context.Context, taskId string) (*Task, error)
DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error)
UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error
DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error)
UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error
}
type ResourceStats struct {

View File

@ -162,11 +162,11 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
return nil, nil
}
func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
// DownloadAlgorithmCode is a placeholder: it ignores every argument and
// always returns an empty string with a nil error.
func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
}
func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
// UploadAlgorithmCode is a placeholder: it ignores every argument, uploads
// nothing, and always returns nil.
func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
return nil
}

View File

@ -339,11 +339,11 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
return algorithms, nil
}
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
// DownloadAlgorithmCode is a placeholder: it ignores every argument and
// always returns an empty string with a nil error.
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
}
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
// UploadAlgorithmCode is a placeholder: it ignores every argument, uploads
// nothing, and always returns nil.
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
return nil
}

View File

@ -447,11 +447,32 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
return algorithms, nil
}
func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
// DownloadAlgorithmCode fetches the training file of an algorithm through the
// aCRpc GetFile call and returns its content.
//
// The file path is built as
// ALGORITHM_DIR/<taskType>/<dataset>DASH<algorithm>/TRAIN_FILE.
// resourceType and card are part of the signature but are not used here.
// Any error from GetFile is returned unchanged together with an empty string.
func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
	// Directory of the algorithm: one folder per "<dataset>DASH<algorithm>" under the task type.
	algoDir := ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + dataset + DASH + algorithm

	file, err := s.aCRpc.GetFile(ctx, &hpcAC.GetFileReq{
		Path: algoDir + FORWARD_SLASH + TRAIN_FILE,
	})
	if err != nil {
		return "", err
	}
	return file.Content, nil
}
func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
// UploadAlgorithmCode sends code through the aCRpc UploadFile call into the
// algorithm's directory.
//
// The target path is built as
// ALGORITHM_DIR/<taskType>/<dataset>DASH<algorithm>/ (with a trailing slash).
// The request sets Cover to "cover" and File to the given code.
// resourceType and card are part of the signature but are not used here.
// Any error from UploadFile is returned unchanged; the RPC response is discarded.
func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
	targetDir := ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + dataset + DASH + algorithm + FORWARD_SLASH

	if _, err := s.aCRpc.UploadFile(ctx, &hpcAC.UploadFileReq{
		Path:  targetDir,
		Cover: "cover",
		File:  code,
	}); err != nil {
		return err
	}
	return nil
}

View File

@ -5676,6 +5676,35 @@ type AiTaskDb struct {
EndTime string `json:"endTime,omitempty" db:"end_time"`
}
// DownloadAlgorithmCodeReq carries the parameters of a download-algorithm-code
// request. Every field is bound through a `form` tag.
type DownloadAlgorithmCodeReq struct {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
ResourceType string `form:"resourceType"`
Card string `form:"card"`
TaskType string `form:"taskType"`
Dataset string `form:"dataset"`
Algorithm string `form:"algorithm"`
// NOTE(review): a download request presumably does not need to send code — confirm whether this field is used.
Code string `form:"code"`
}
// DownloadAlgorithmCodeResp is the response of a download-algorithm-code request.
type DownloadAlgorithmCodeResp struct {
// Code is serialized under the JSON key "algorithms".
// NOTE(review): the key does not match the field name — confirm this is intended.
Code string `json:"algorithms"`
}
// UploadAlgorithmCodeReq carries the parameters of an upload-algorithm-code
// request. Every field is bound through a `json` tag.
type UploadAlgorithmCodeReq struct {
AdapterId string `json:"adapterId"`
ClusterId string `json:"clusterId"`
ResourceType string `json:"resourceType"`
Card string `json:"card"`
TaskType string `json:"taskType"`
Dataset string `json:"dataset"`
Algorithm string `json:"algorithm"`
Code string `json:"code"`
}
// UploadAlgorithmCodeResp is the response of an upload-algorithm-code request.
// It declares no fields.
type UploadAlgorithmCodeResp struct {
}
type CreateAlertRuleReq struct {
CLusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`