# Conflicts:
#	api/desc/pcm.api


Former-commit-id: 683c0fa4f8ec772254f969f39f8d9fb0c5c8d894
This commit is contained in:
qiwang 2024-04-29 10:42:38 +08:00
commit b794b9d16f
22 changed files with 365 additions and 14 deletions

View File

@ -14,6 +14,8 @@ type CreateAlertRuleReq {
type (
AlertRulesReq {
AlertType string `form:"alertType"`
AdapterId string `form:"adapterId,optional"`
ClusterId string `form:"clusterId,optional"`
}
AlertRulesResp {
alertRules []AlertRule `json:"alertRules"`
@ -56,4 +58,26 @@ type (
type SyncClusterAlertReq {
AlertRecordsMap map[string]interface{} `json:"alertRecordsMap"`
}
}
type (
taskNumReq {
clusterId string `form:"clusterId"`
}
taskNumResp {
current int `json:"current"`
today int `json:"today"`
history int `json:"history"`
failed int `json:"failed"`
}
)
type (
adapterInfoReq{
clusterId string `form:"clusterId"`
}
adapterInfoResp{
name string `json:"name"`
version string `json:"version"`
}
)

View File

@ -132,10 +132,11 @@ service pcm {
@doc "paging queries the task list"
@handler pageListTaskHandler
get /core/task/list (pageTaskReq) returns(PageResult)
get /core/task/list (pageTaskReq) returns (PageResult)
@doc "Statistical task status"
@handler countTaskStatus
get /core/task/countTaskStatus () returns (TaskStatusResp)
get /core/task/countTaskStatus () returns(TaskStatusResp)
@doc "Home Page Overview"

View File

@ -70,4 +70,15 @@ type (
AiAlgorithmsResp {
Algorithms []string `json:"algorithms"`
}
AiJobLogReq {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
TaskId string `path:"taskId"`
instanceNum string `path:"instanceNum"`
}
AiJobLogResp {
Log string `json:"log"`
}
)

View File

@ -0,0 +1,25 @@
package monitoring
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func AdapterInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.AdapterInfoReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := monitoring.NewAdapterInfoLogic(r.Context(), svcCtx)
resp, err := l.AdapterInfo(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -0,0 +1,25 @@
package monitoring
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func TaskNumHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.TaskNumReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := monitoring.NewTaskNumLogic(r.Context(), svcCtx)
resp, err := l.TaskNum(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -1150,6 +1150,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset",
Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum",
Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/submit",
@ -1249,9 +1254,19 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
},
{
Method: http.MethodPost,
Path: "/core/syncClusterAlert",
Path: "/monitoring/syncClusterAlert",
Handler: monitoring.SyncClusterAlertHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/task/num",
Handler: monitoring.TaskNumHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/adapter/info",
Handler: monitoring.AdapterInfoHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)

View File

@ -0,0 +1,28 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func ScheduleGetAiJobLogLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.AiJobLogReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := schedule.NewScheduleGetAiJobLogLogLogic(r.Context(), svcCtx)
resp, err := l.ScheduleGetAiJobLogLog(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,31 @@
package monitoring
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type AdapterInfoLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewAdapterInfoLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AdapterInfoLogic {
return &AdapterInfoLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *AdapterInfoLogic) AdapterInfo(req *types.AdapterInfoReq) (resp *types.AdapterInfoResp, err error) {
// todo: add your logic here and delete this line
resp = &types.AdapterInfoResp{}
l.svcCtx.DbEngin.Raw("select ta.name , ta.version from t_adapter ta,t_cluster tc where tc.id = ? and tc.adapter_id = ta.id", req.ClusterId).Scan(resp)
return resp, nil
}

View File

@ -2,6 +2,7 @@ package monitoring
import (
"context"
"fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@ -26,7 +27,14 @@ func NewAlertRulesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *AlertR
func (l *AlertRulesLogic) AlertRules(req *types.AlertRulesReq) (resp *types.AlertRulesResp, err error) {
resp = &types.AlertRulesResp{}
var alertRules []types.AlertRule
l.svcCtx.DbEngin.Raw("SELECT ar.id,ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.alert_type = ? AND ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id", req.AlertType).Scan(&alertRules)
sql := fmt.Sprintf("SELECT ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.alert_type = %s AND ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id", req.AlertType)
if req.AdapterId != "" {
sql = fmt.Sprintf("SELECT ar.*,GROUP_CONCAT( tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',' ) AS cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id JOIN t_adapter ta ON ta.id = tc.adapter_id WHERE ar.alert_type = %s AND ta.id = %s AND ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id", req.AlertType, req.AdapterId)
}
if req.ClusterId != "" {
sql = fmt.Sprintf("SELECT ar.*,GROUP_CONCAT(tc.`name` ORDER BY tc.`name` ASC SEPARATOR ',') as cluster_name FROM alert_rule ar JOIN t_cluster tc ON ar.cluster_id = tc.id WHERE ar.alert_type = %s AND ar.cluster_id = %s AND ar.deleted_at IS NULL AND tc.deleted_at IS NULL GROUP BY ar.id", req.AlertType, req.ClusterId)
}
l.svcCtx.DbEngin.Raw(sql).Scan(&alertRules)
resp.AlertRules = alertRules
return resp, nil
}

View File

@ -11,6 +11,7 @@ import (
v12 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/json"
"strconv"
)
type CreateAlertRuleLogic struct {
@ -38,6 +39,7 @@ func (l *CreateAlertRuleLogic) CreateAlertRule(req *types.CreateAlertRuleReq) er
// save to db
var alertRule models.AlertRule
tool.Convert(req, &alertRule)
alertRule.ClusterId, _ = strconv.ParseInt(req.CLusterId, 10, 64)
alertRule.Id = tool.GenSnowflakeID()
tx := l.svcCtx.DbEngin.Save(&alertRule)
if tx.Error != nil {

View File

@ -0,0 +1,33 @@
package monitoring
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type TaskNumLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewTaskNumLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskNumLogic {
return &TaskNumLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *TaskNumLogic) TaskNum(req *types.TaskNumReq) (resp *types.TaskNumResp, err error) {
resp = &types.TaskNumResp{}
l.svcCtx.DbEngin.Raw("SELECT COUNT(id) from task_cloud where cluster_id = ? and status = 'running'", req.ClusterId).Scan(resp.Current)
l.svcCtx.DbEngin.Raw("SELECT COUNT(id) from task_cloud where cluster_id = '' and DATE(start_time) = CURDATE()", req.ClusterId).Scan(resp.Today)
l.svcCtx.DbEngin.Raw("SELECT COUNT(id) from task_cloud where cluster_id = ?", req.ClusterId).Scan(resp.History)
l.svcCtx.DbEngin.Raw("SELECT COUNT(id) from task_cloud where cluster_id = ? and status = 'failed'", req.ClusterId).Scan(resp.Failed)
return resp, nil
}

View File

@ -0,0 +1,36 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type ScheduleGetAiJobLogLogLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewScheduleGetAiJobLogLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleGetAiJobLogLogLogic {
return &ScheduleGetAiJobLogLogLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) {
resp = &types.AiJobLogResp{}
log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetTrainingTaskLog(l.ctx, req.TaskId, req.InstanceNum)
if err != nil {
return nil, err
}
resp.Log = log
return resp, nil
}

View File

@ -6,6 +6,7 @@ type AiCollector interface {
GetResourceStats(ctx context.Context) (*ResourceStats, error)
GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error)
GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
}
type ResourceStats struct {

View File

@ -162,6 +162,10 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
return nil, nil
}
func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
return "", nil
}
func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := m.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -337,6 +337,25 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
return algorithms, nil
}
func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
instance, err := strconv.ParseInt(instanceNum, 10, 32)
if err != nil {
return "", err
}
req := &octopus.GetTrainJobLogReq{
Platform: o.platform,
TaskId: taskId,
TaskNum: "task0",
Num: int32(instance),
}
resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
if err != nil {
return "", err
}
return resp.Content, nil
}
func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := o.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -447,6 +447,24 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
return algorithms, nil
}
func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
req := &hpcAC.GetInstanceLogReq{
TaskId: taskId,
InstanceNum: instanceNum,
LineCount: 1000,
StartLineNum: -1,
}
resp, err := s.aCRpc.GetInstanceLog(ctx, req)
if err != nil {
return "", err
}
if resp.Code != "0" {
return "", errors.New(resp.Msg)
}
return resp.Data.Content, nil
}
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := s.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -117,10 +117,6 @@ func NewServiceContext(c config.Config) *ServiceContext {
})
// scheduler
//octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf))
//aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf))
//modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf))
//modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf))
storage := &database.AiStorage{DbEngin: dbEngin}
aiService, err := service.NewAiService(&c, storage)
if err != nil {

View File

@ -5551,6 +5551,17 @@ type AiAlgorithmsResp struct {
Algorithms []string `json:"algorithms"`
}
type AiJobLogReq struct {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
TaskId string `path:"taskId"`
InstanceNum string `path:"instanceNum"`
}
type AiJobLogResp struct {
Log string `json:"log"`
}
type CreateAlertRuleReq struct {
CLusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`
@ -5564,6 +5575,8 @@ type CreateAlertRuleReq struct {
type AlertRulesReq struct {
AlertType string `form:"alertType"`
AdapterId string `form:"adapterId,optional"`
ClusterId string `form:"clusterId,optional"`
}
type AlertRulesResp struct {
@ -5605,3 +5618,23 @@ type AlertListResp struct {
type SyncClusterAlertReq struct {
AlertRecordsMap map[string]interface{} `json:"alertRecordsMap"`
}
type TaskNumReq struct {
ClusterId string `form:"clusterId"`
}
type TaskNumResp struct {
Current int `json:"current"`
Today int `json:"today"`
History int `json:"history"`
Failed int `json:"failed"`
}
type AdapterInfoReq struct {
ClusterId string `form:"clusterId"`
}
type AdapterInfoResp struct {
Name string `json:"name"`
Version string `json:"version"`
}

BIN
deploy/pcm-yaml.zip Normal file

Binary file not shown.

41
docs/pcm_deploy.md Normal file
View File

@ -0,0 +1,41 @@
## 1 安装部署kubekey
通过以下的命令,可以下载 KubeKey 的最新版本。您可以更改命令中的版本号来下载特定的版本。
```
export KKZONE=cn
curl -sfL https://get-kk.kubesphere.io | VERSION=v3.0.7 sh -
```
## 2 安装部署k8s集群
```
./kk create cluster
```
执行可能会提示部分软件未安装直接yum安装即可
![输入图片说明](/imgs/2024-04-28/qF082JVaumRARK1J.png)
然后重新执行创建集群命令,执行成功后可以验证环境
![输入图片说明](/imgs/2024-04-28/FoVNPbwm1pnt839Z.png)
## 3 部署鉴权、pcm-coordinator、前端服务
yaml文件下载链接https://pan.baidu.com/s/1VU1zE2xcFkrz9Hz2MkgDaQ
鉴权:
kubectl apply -f pcm-auth.yaml
C端
kubectl apply -f pcm-core-api.yaml
kubectl apply -f pcm-core-rpc.yaml
前端:
kubectl apply -f pcm-rip.yaml
## 4 配置驱动器、集群信息
新建一个适配器配置成功后可以获取到对应的adapterId
![输入图片说明](/imgs/2024-04-28/Dtu4KC835jSfcf5R.png)
将对应的id填写到对应的P端配置信息中(configmap 内容)
![输入图片说明](/imgs/2024-04-28/zuFWMVKAycNlPXOF.png)
## 5 部署P端服务
P端
kubectl apply -f pcm-hpc.yaml
kubectl apply -f pcm-kubernetes.yaml
## 7.系统使用

4
go.mod
View File

@ -24,9 +24,9 @@ require (
github.com/robfig/cron/v3 v3.0.1
github.com/rs/zerolog v1.28.0
github.com/zeromicro/go-zero v1.6.3
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d

8
go.sum
View File

@ -1078,12 +1078,12 @@ github.com/yuin/gopher-lua v1.1.0/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7
github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs=
github.com/zeromicro/go-zero v1.6.3 h1:OL0NnHD5LdRNDolfcK9vUkJt7K8TcBE3RkzfM8poOVw=
github.com/zeromicro/go-zero v1.6.3/go.mod h1:XZL435ZxVi9MSXXtw2MRQhHgx6OoX3++MRMOE9xU70c=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8 h1:cX6U2gUcp/sIP3TKFv4q/1O8gp10q+M3k5Ql15yaEMI=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240407112649-e479e74b58c8/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb h1:k6mNEWKp+haQUaK2dWs/rI9OKgzJHY1/9KNKuBDN0Vw=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8 h1:jdwYydJxYPlfIS9yZvnNX1w08aJGYWq5ADD5EXLW3+Q=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240407105727-38e45468eaa8/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=