Merge remote-tracking branch 'upstream/master' into upmaster_wq

# Conflicts:
#	api/desc/pcm.api
#	api/internal/handler/routes.go


Former-commit-id: e8916b1c9f9f0f56463a8395bae80bb159c0d065
This commit is contained in:
qiwang 2024-05-09 15:16:23 +08:00
commit a57e0b7ba8
69 changed files with 6457 additions and 530 deletions

View File

@ -111,18 +111,17 @@ type HpcInfo struct {
}
type CloudInfo struct {
Participant int64 `json:"participant,omitempty"`
Id int64 `json:"id,omitempty"`
TaskId int64 `json:"taskId,omitempty"`
ApiVersion string `json:"apiVersion,omitempty"`
Kind string `json:"kind,omitempty"`
Namespace string `json:"namespace,omitempty"`
Name string `json:"name,omitempty"`
Status string `json:"status,omitempty"`
StartTime string `json:"startTime,omitempty"`
RunningTime int64 `json:"runningTime,omitempty"`
Result string `json:"result,omitempty"`
YamlString string `json:"yamlString,omitempty"`
Id uint `json:"id,omitempty,optional"`
TaskId int64 `json:"taskId,omitempty,optional"`
AdapterId uint `json:"adapterId,omitempty,optional"`
ClusterId uint `json:"clusterId,omitempty,optional"`
ClusterName string `json:"clusterName,omitempty,optional"`
Kind string `json:"kind,omitempty,optional"`
Status string `json:"status,omitempty,optional"`
StartTime *time.Time `json:"startTime,omitempty,optional,string"`
YamlString string `json:"yamlString,omitempty,optional"`
Result string `json:"result,omitempty,optional"`
Namespace string `json:"namespace,omitempty,optional"`
}
type AiInfo struct {

View File

@ -1697,6 +1697,44 @@ PayloadCreateTrainJob{
jobId string `json:"jobId,optional"`
}
********************/
/******************Ai Center overview*************************/
CenterOverviewResp {
CenterNum int32 `json:"totalCenters,optional"`
TaskNum int32 `json:"totalTasks,optional"`
CardNum int32 `json:"totalCards,optional"`
PowerInTops float64 `json:"totalPower,optional"`
}
CenterQueueingResp {
Current []*CenterQueue `json:"current,optional"`
History []*CenterQueue `json:"history,optional"`
}
CenterQueue {
Name string `json:"name,optional"`
QueueingNum int32 `json:"num,optional"`
}
CenterListResp {
List []*AiCenter `json:"centerList,optional"`
}
AiCenter {
Name string `json:"name,optional"`
StackName string `json:"stack,optional"`
Version string `json:"version,optional"`
}
CenterTaskListResp {
List []*AiTask `json:"taskList,optional"`
}
AiTask {
Name string `json:"name,optional"`
status string `json:"status,optional"`
TimeElapsed int32 `json:"elapsed,optional"`
}
)
/******************create TrainIngJob end*************************/

View File

@ -59,9 +59,9 @@ type (
Type int64 `json:"type"` // 租户所属(0数算1超算2智算
DeletedFlag int64 `json:"deletedFlag"` // 是否删除
CreatedBy int64 `json:"createdBy"` // 创建人
CreatedTime string `json:"createdTime"` // 创建时间
CreateTime string `json:"createdTime"` // 创建时间
UpdatedBy int64 `json:"updatedBy"` // 更新人
UpdatedTime string `json:"updated_time"` // 更新时间
UpdateTime string `json:"updated_time"` // 更新时间
}
UpdateTenantReq {
@ -115,6 +115,6 @@ type Cloud {
StartTime string `json:"startTime"` // 开始时间
RunningTime int64 `json:"runningTime"` // 运行时长
CreatedBy int64 `json:"createdBy"` // 创建人
CreatedTime string `json:"createdTime"` // 创建时间
CreateTime string `json:"createdTime"` // 创建时间
Result string `json:"result"`
}

View File

@ -159,22 +159,12 @@ type (
type (
GeneralTaskReq {
Name string `json:"name"`
ComputeType string `json:"computeType"`
TemplateId string `json:"templateId"`
AdapterId string `json:"adapterId"`
AdapterIds []string `json:"adapterIds"`
ClusterIds []string `json:"clusterIds"`
Strategy Strategy `json:"strategy"`
Strategy string `json:"strategy"`
StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"`
ReqBody []string `json:"reqBody"`
}
Strategy {
Name string `json:"name"`
StaticWeightList []StaticWeightList `json:"staticWeightList"`
}
StaticWeightList {
ClusterName string `json:"clusterName"`
Weight int `json:"weight"`
Replicas int64 `json:"replicas,string"`
}
)
@ -210,32 +200,71 @@ type (
}
)
type (
commitVmTaskReq {
Name string `json:"name"`
NsID string `json:"nsID"`
Replicas int64 `json:"replicas,optional"`
MatchLabels map[string]string `json:"matchLabels,optional"`
AdapterId string `json:"adapterId,optional"`
ClusterType string `json:"clusterType,optional"`
//Virtual Machine Section
// Name string `json:"name"`
// NsID string `json:"nsID"`
// Replicas int64 `json:"replicas,optional"`
// MatchLabels map[string]string `json:"matchLabels,optional"`
// AdapterId string `json:"adapterId,optional"`
// ClusterType string `json:"clusterType,optional"`
// //Virtual Machine Section
CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
VmOption *VmOption `json:"vmOption,optional"`
}
VmOption {
AdapterId string `json:"adapterId"`
VmClusterIds []string `json:"vmClusterIds"`
Replicas int64 `json:"replicas,optional"`
Name string `json:"name"`
//ResourceType string `json:"resourceType"`
//TaskType string `json:"taskType"`
Strategy string `json:"strategy"`
ClusterToStaticWeight map[string]int32 `json:"clusterToStaticWeight"`
MatchLabels map[string]string `json:"matchLabels,optional"`
StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"`
CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
// Id int64 `json:"id"`
// ParticipantId int64 `json:"participantId"`
// TaskId int64 `json:"taskId"`
// AdapterId int64 `json:"adapterId"`
// ClusterId int64 `json:"clusterId"`
// FlavorRef string `json:"flavorRef"`
// ImageRef string `json:"imageRef"`
// Status string `json:"status"`
// Platform string `json:"platform"`
// Description string `json:"description"`
// AvailabilityZone string `json:"availabilityZone"`
// MinCount int64 `json:"minCount"`
// Uuid string `json:"uuid"`
// StartTime string `json:"startTime"`
// RunningTime string `json:"runningTime"`
// Result string `json:"result"`
// DeletedAt string `json:"deletedAt"`
}
CreateMulDomainServer {
Platform string `json:"platform,optional"`
Name string `json:"name,optional"`
Min_count int64 `json:"min_count,optional"`
ImageRef string `json:"imageRef,optional"`
FlavorRef string `json:"flavorRef,optional"`
Uuid string `json:"uuid,optional"`
name string `json:"name,optional"`
min_count int64 `json:"min_count,optional"`
imageRef string `json:"imageRef,optional"`
flavorRef string `json:"flavorRef,optional"`
uuid string `json:"uuid,optional"`
ClusterId string `json:"clusterId,optional"`
}
commitVmTaskResp {
// VmTask []VmTask `json:"vmTask" copier:"VmTask"`
TaskId int64 `json:"taskId"`
Code int32 `json:"code"`
Msg string `json:"msg"`
}
VmTask {
ScheduleVmResult struct {
ClusterId string `json:"clusterId"`
TaskId string `json:"taskId"`
Strategy string `json:"strategy"`
Replica int32 `json:"replica"`
Msg string `json:"msg"`
}
VmTask{
Id string `json:"id" copier:"Id"`
Links []VmLinks `json:"links" copier:"Links"`
OSDCFDiskConfig string `json:"OS_DCF_diskConfig" copier:"OSDCFDiskConfig"`
@ -246,6 +275,41 @@ type (
Href string `json:"href " copier:"Href"`
Rel string `json:"rel" copier:"Rel"`
}
// commitVmTaskReq {
// Name string `json:"name"`
// NsID string `json:"nsID"`
// Replicas int64 `json:"replicas,optional"`
// MatchLabels map[string]string `json:"matchLabels,optional"`
// AdapterId string `json:"adapterId,optional"`
// ClusterType string `json:"clusterType,optional"`
// //Virtual Machine Section
// CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
// }
// CreateMulDomainServer {
// Platform string `json:"platform,optional"`
// Name string `json:"name,optional"`
// Min_count int64 `json:"min_count,optional"`
// ImageRef string `json:"imageRef,optional"`
// FlavorRef string `json:"flavorRef,optional"`
// Uuid string `json:"uuid,optional"`
// }
// commitVmTaskResp {
// // VmTask []VmTask `json:"vmTask" copier:"VmTask"`
// TaskId int64 `json:"taskId"`
// Code int32 `json:"code"`
// Msg string `json:"msg"`
// }
// VmTask {
// Id string `json:"id" copier:"Id"`
// Links []VmLinks `json:"links" copier:"Links"`
// OSDCFDiskConfig string `json:"OS_DCF_diskConfig" copier:"OSDCFDiskConfig"`
// SecurityGroups []VmSecurity_groups_server `json:"security_groups" copier:"SecurityGroups"`
// AdminPass string `json:"adminPass" copier:"AdminPass"`
// }
// VmLinks {
// Href string `json:"href " copier:"Href"`
// Rel string `json:"rel" copier:"Rel"`
// }
VmSecurity_groups_server {
Name string `json:"name" copier:"Name"`
@ -320,7 +384,7 @@ type (
}
TaskModel {
Id int64 `json:"id,omitempty" db:"id"` // id
Id int64 `json:"id,omitempty,string" db:"id"` // id
Name string `json:"name,omitempty" db:"name"` // 作业名称
Description string `json:"description,omitempty" db:"description"` // 作业描述
Status string `json:"status,omitempty" db:"status"` // 作业状态
@ -336,6 +400,7 @@ type (
NsID string `json:"nsId,omitempty" db:"ns_id"`
TenantId string `json:"tenantId,omitempty" db:"tenant_id"`
CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"`
AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值
}
)
@ -1004,9 +1069,9 @@ type (
Environment string `json:"environment"`
DeletedFlag int64 `json:"deleted_flag"` // 是否删除0-否1-是)
CreatedBy int64 `json:"created_by"` // 创建人
CreatedTime string `json:"created_time"` // 创建时间
CreateTime string `json:"created_time"` // 创建时间
UpdatedBy int64 `json:"updated_by"` // 更新人
UpdatedTime string `json:"updated_time"` // 更新时间
UpdateTime string `json:"updated_time"` // 更新时间
}
CloudInfo {
@ -1155,5 +1220,15 @@ type TaskStatusResp {
Succeeded int `json:"Succeeded"`
Failed int `json:"Failed"`
Running int `json:"Running"`
Pause int `json:"Pause"`
Saved int `json:"Saved"`
}
type TaskDetailsResp {
Name string `json:"name"`
description string `json:"description"`
StartTime string `json:"startTime"`
EndTime string `json:"endTime"`
Strategy int64 `json:"strategy"`
SynergyStatus int64 `json:"synergyStatus"`
ClusterInfos []*ClusterInfo `json:"clusterInfos"`
}

View File

@ -81,3 +81,28 @@ type (
version string `json:"version"`
}
)
type (
scheduleSituationResp{
nodes []NodeRegion `json:"nodes"`
links []Link `json:"links"`
categories []Category `json:"categories"`
}
NodeRegion{
id string `json:"id"`
name string `json:"name"`
category int `json:"category"`
value int `json:"value"`
}
Link{
source string `json:"source"`
target string `json:"target"`
}
Category{
name string `json:"name"`
}
)

View File

@ -142,6 +142,10 @@ service pcm {
@handler homeOverviewHandler
get /core/homeOverview (HomeOverviewReq) returns (HomeOverviewResp)
@doc "task details"
@handler taskDetails
get /core/task/details (FId) returns(TaskDetailsResp)
@doc "Get Public Image"
@handler getPublicImageHandler
get /core/getPublicImage (PublicImageReq) returns (PublicImageResp)
@ -226,7 +230,7 @@ service pcm {
@doc "Create cloud computing common tasks"
@handler commitGeneralTask
post /cloud/task/create (GeneralTaskReq) returns()
post /cloud/task/create (GeneralTaskReq) returns ()
}
//智算二级接口
@ -235,6 +239,22 @@ service pcm {
group: ai
)
service pcm {
@doc "智算中心概览"
@handler getCenterOverviewHandler
get /ai/getCenterOverview returns (CenterOverviewResp)
@doc "智算中心排队状况"
@handler getCenterQueueingHandler
get /ai/getCenterQueueing returns (CenterQueueingResp)
@doc "智算中心列表"
@handler getCenterListHandler
get /ai/getCenterList returns (CenterListResp)
@doc "智算中心任务列表"
@handler getCenterTaskListHandler
get /ai/getCenterTaskList returns (CenterTaskListResp)
@doc "查询数据集列表"
@handler listDataSetHandler
get /ai/listDataSet/:projectId (DataSetReq) returns (DataSetResp)
@ -938,8 +958,14 @@ service pcm {
@handler ScheduleGetAlgorithmsHandler
get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp)
@handler ScheduleGetAiJobLogLogHandler
get /schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum (AiJobLogReq) returns (AiJobLogResp)
@handler ScheduleSubmitHandler
post /schedule/submit (ScheduleReq) returns (ScheduleResp)
@handler ScheduleGetOverviewHandler
post /schedule/getOverview returns (ScheduleOverviewResp)
}
@server(
@ -991,7 +1017,7 @@ service pcm {
@doc "alert rules"
@handler alertRulesHandler
get /monitoring/alert/rule (AlertRulesReq)returns (AlertRulesResp)
get /monitoring/alert/rule (AlertRulesReq) returns (AlertRulesResp)
@doc "cluster resource load"
@handler clustersLoadHandler
@ -1007,5 +1033,14 @@ service pcm {
@doc "Synchronize Cluster alert Information"
@handler syncClusterAlertHandler
post /core/syncClusterAlert (SyncClusterAlertReq)
post /monitoring/syncClusterAlert (SyncClusterAlertReq)
@handler taskNumHandler
get /monitoring/task/num (taskNumReq) returns (taskNumResp)
@handler adapterInfoHandler
get /monitoring/adapter/info (adapterInfoReq) returns (adapterInfoResp)
@handler scheduleSituationHandler
get /monitoring/schedule/situation returns (scheduleSituationResp)
}

View File

@ -24,6 +24,9 @@ type (
Msg string `json:"msg"`
}
ScheduleOverviewResp {
}
AiOption {
TaskName string `json:"taskName"`
AdapterId string `json:"adapterId"`
@ -81,4 +84,20 @@ type (
AiJobLogResp {
Log string `json:"log"`
}
AiTaskDb {
Id string `json:"id,omitempty" db:"id"`
TaskId string `json:"taskId,omitempty" db:"task_id"`
AdapterId string `json:"adapterId,omitempty" db:"adapter_id"`
ClusterId string `json:"clusterId,omitempty" db:"cluster_id"`
Name string `json:"name,omitempty" db:"name"`
Replica string `json:"replica,omitempty" db:"replica"`
ClusterTaskId string `json:"clusterTaskId,omitempty" db:"c_task_id"`
Strategy string `json:"strategy,omitempty" db:"strategy"`
Status string `json:"status,omitempty" db:"status"`
Msg string `json:"msg,omitempty" db:"msg"`
CommitTime string `json:"commitTime,omitempty" db:"commit_time"`
StartTime string `json:"startTime,omitempty" db:"start_time"`
EndTime string `json:"endTime,omitempty" db:"end_time"`
}
)

View File

@ -6,7 +6,7 @@ Timeout: 50000
DB:
DataSource: root:uJpLd6u-J?HC1@(10.206.0.12:3306)/pcm?parseTime=true&loc=Local
# DataSource: root:uJpLd6u-J?HC1@(47.92.88.143:3306)/pcm?parseTime=true&loc=Local
# DataSource: root:uJpLd6u-J?HC1@(47.92.88.143:3306)/pcm?parseTime=true&loc=Local
Redis:
Host: 10.206.0.12:6379
Pass: redisPW123

View File

@ -0,0 +1,21 @@
package ai
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/ai"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func GetCenterListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := ai.NewGetCenterListLogic(r.Context(), svcCtx)
resp, err := l.GetCenterList()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,21 @@
package ai
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/ai"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func GetCenterOverviewHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := ai.NewGetCenterOverviewLogic(r.Context(), svcCtx)
resp, err := l.GetCenterOverview()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,21 @@
package ai
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/ai"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func GetCenterQueueingHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := ai.NewGetCenterQueueingLogic(r.Context(), svcCtx)
resp, err := l.GetCenterQueueing()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,21 @@
package ai
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/ai"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func GetCenterTaskListHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := ai.NewGetCenterTaskListLogic(r.Context(), svcCtx)
resp, err := l.GetCenterTaskList()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,24 @@
package core
import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
func TaskDetailsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.FId
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := core.NewTaskDetailsLogic(r.Context(), svcCtx)
resp, err := l.TaskDetails(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -0,0 +1,17 @@
package monitoring
import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/monitoring"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func ScheduleSituationHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := monitoring.NewScheduleSituationLogic(r.Context(), svcCtx)
resp, err := l.ScheduleSituation()
result.HttpResult(r, w, resp, err)
}
}

View File

@ -170,6 +170,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/core/homeOverview",
Handler: core.HomeOverviewHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/core/task/details",
Handler: core.TaskDetailsHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/core/getPublicImage",
@ -278,6 +283,26 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
server.AddRoutes(
[]rest.Route{
{
Method: http.MethodGet,
Path: "/ai/getCenterOverview",
Handler: ai.GetCenterOverviewHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/ai/getCenterQueueing",
Handler: ai.GetCenterQueueingHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/ai/getCenterList",
Handler: ai.GetCenterListHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/ai/getCenterTaskList",
Handler: ai.GetCenterTaskListHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/ai/listDataSet/:projectId",
@ -1170,6 +1195,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/submit",
Handler: schedule.ScheduleSubmitHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/getOverview",
Handler: schedule.ScheduleGetOverviewHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)
@ -1267,6 +1297,21 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/core/syncClusterAlert",
Handler: monitoring.SyncClusterAlertHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/task/num",
Handler: monitoring.TaskNumHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/adapter/info",
Handler: monitoring.AdapterInfoHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/monitoring/schedule/situation",
Handler: monitoring.ScheduleSituationHandler(serverCtx),
},
},
rest.WithPrefix("/pcm/v1"),
)

View File

@ -0,0 +1,21 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func ScheduleGetOverviewHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := schedule.NewScheduleGetOverviewLogic(r.Context(), svcCtx)
resp, err := l.ScheduleGetOverview()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,43 @@
package ai
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetCenterListLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetCenterListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterListLogic {
return &GetCenterListLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetCenterListLogic) GetCenterList() (resp *types.CenterListResp, err error) {
resp = &types.CenterListResp{}
adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return nil, err
}
for _, adapter := range adapterList {
a := &types.AiCenter{
Name: adapter.Name,
StackName: adapter.Nickname,
Version: adapter.Version,
}
resp.List = append(resp.List, a)
}
return resp, nil
}

View File

@ -0,0 +1,139 @@
package ai
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"strconv"
"sync"
)
type GetCenterOverviewLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetCenterOverviewLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterOverviewLogic {
return &GetCenterOverviewLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverviewResp, err error) {
resp = &types.CenterOverviewResp{}
var mu sync.RWMutex
ch := make(chan struct{})
var centerNum int32
var taskNum int32
var cardNum int32
var totalTops float64
adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return nil, err
}
centerNum = int32(len(adapterList))
resp.CenterNum = centerNum
go l.updateClusterResource(&mu, ch, adapterList)
for _, adapter := range adapterList {
taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
if err != nil {
continue
}
taskNum += int32(len(taskList))
}
resp.TaskNum = taskNum
for _, adapter := range adapterList {
clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, cluster := range clusters.List {
mu.RLock()
clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(cluster.Id)
mu.RUnlock()
if err != nil {
continue
}
cardNum += int32(clusterResource.CardTotal)
totalTops += clusterResource.CardTopsTotal
}
}
resp.CardNum = cardNum
resp.PowerInTops = totalTops
<-ch
return resp, nil
}
func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {
var wg sync.WaitGroup
for _, adapter := range list {
clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, cluster := range clusters.List {
c := cluster
mu.RLock()
clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
mu.RUnlock()
if err != nil {
continue
}
wg.Add(1)
go func() {
stat, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(l.ctx)
if err != nil {
wg.Done()
return
}
clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
if err != nil {
wg.Done()
return
}
var cardTotal int64
var topsTotal float64
for _, card := range stat.CardsAvail {
cardTotal += int64(card.CardNum)
topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
}
mu.Lock()
if (models.TClusterResource{} == *clusterResource) {
err = l.svcCtx.Scheduler.AiStorages.SaveClusterResources(c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
if err != nil {
mu.Unlock()
wg.Done()
return
}
} else {
clusterResource.CardTotal = cardTotal
clusterResource.CardTopsTotal = topsTotal
err := l.svcCtx.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
if err != nil {
mu.Unlock()
wg.Done()
return
}
}
mu.Unlock()
wg.Done()
}()
}
}
wg.Wait()
ch <- struct{}{}
}

View File

@ -0,0 +1,70 @@
package ai
import (
"context"
"sort"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetCenterQueueingLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetCenterQueueingLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterQueueingLogic {
return &GetCenterQueueingLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetCenterQueueingLogic) GetCenterQueueing() (resp *types.CenterQueueingResp, err error) {
resp = &types.CenterQueueingResp{}
adapters, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return nil, err
}
for _, adapter := range adapters {
clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, cluster := range clusters.List {
queues, err := l.svcCtx.Scheduler.AiStorages.GetClusterTaskQueues(adapter.Id, cluster.Id)
if err != nil {
continue
}
//todo sync current task queues
current := &types.CenterQueue{
Name: cluster.Name,
QueueingNum: int32(queues[0].QueueNum),
}
history := &types.CenterQueue{
Name: cluster.Name,
QueueingNum: int32(queues[0].QueueNum),
}
resp.Current = append(resp.Current, current)
resp.History = append(resp.History, history)
}
}
sortQueueingNum(resp.Current)
sortQueueingNum(resp.History)
return resp, nil
}
func sortQueueingNum(q []*types.CenterQueue) {
sort.Slice(q, func(i, j int) bool {
return q[i].QueueingNum > q[j].QueueingNum
})
}

View File

@ -0,0 +1,116 @@
package ai
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"strconv"
"sync"
"time"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetCenterTaskListLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetCenterTaskListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterTaskListLogic {
return &GetCenterTaskListLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskListResp, err error) {
resp = &types.CenterTaskListResp{}
var mu sync.RWMutex
ch := make(chan struct{})
adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return nil, err
}
go l.updateAiTaskStatus(&mu, ch, adapterList)
for _, adapter := range adapterList {
mu.RLock()
taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
mu.RUnlock()
if err != nil {
continue
}
for _, task := range taskList {
var elapsed time.Duration
switch task.Status {
case constants.Completed:
end, err := time.ParseInLocation(constants.Layout, task.EndTime, time.Local)
if err != nil {
elapsed = time.Duration(0)
}
start, err := time.ParseInLocation(constants.Layout, task.StartTime, time.Local)
if err != nil {
elapsed = time.Duration(0)
}
elapsed = end.Sub(start)
case constants.Running:
elapsed = time.Now().Sub(task.CommitTime)
default:
elapsed = 0
}
t := &types.AiTask{
Name: task.Name,
Status: task.Status,
TimeElapsed: int32(elapsed.Seconds()),
}
resp.List = append(resp.List, t)
}
}
<-ch
return resp, nil
}
func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {
var wg sync.WaitGroup
for _, adapter := range list {
taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, task := range taskList {
t := task
if t.Status == constants.Completed {
continue
}
wg.Add(1)
go func() {
trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId)
if err != nil {
wg.Done()
return
}
t.Status = trainingTask.Status
t.StartTime = trainingTask.Start
t.EndTime = trainingTask.End
mu.Lock()
err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
mu.Unlock()
if err != nil {
wg.Done()
return
}
wg.Done()
}()
}
}
wg.Wait()
ch <- struct{}{}
}

View File

@ -4,15 +4,19 @@ import (
"bytes"
"context"
"github.com/pkg/errors"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"io"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
syaml "k8s.io/apimachinery/pkg/runtime/serializer/yaml"
kyaml "k8s.io/apimachinery/pkg/util/yaml"
"sigs.k8s.io/yaml"
"strconv"
"strings"
"time"
@ -37,62 +41,102 @@ func NewCommitGeneralTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext)
}
func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) error {
var yamlStr []string
for _, s := range req.ReqBody {
j2, err := yaml.YAMLToJSON([]byte(s))
if err != nil {
logx.Errorf("Failed to convert yaml to JSON, err: %v", err)
return err
tx := l.svcCtx.DbEngin.Begin()
// 执行回滚或者提交操作
defer func() {
if p := recover(); p != nil {
tx.Rollback()
logx.Error(p)
} else if tx.Error != nil {
logx.Info("rollback, error", tx.Error)
tx.Rollback()
} else {
tx = tx.Commit()
logx.Info("commit success")
}
yamlStr = append(yamlStr, string(j2))
}
result := strings.Join(yamlStr, ",")
//TODO The namespace is fixed to ns-admin for the time being. Later, the namespace is obtained based on the user
taskModel := models.Task{
Status: constants.Saved,
Name: req.Name,
CommitTime: time.Now(),
NsID: "ns-admin",
YamlString: "[" + result + "]",
}
// Save the task data to the database
tx := l.svcCtx.DbEngin.Create(&taskModel)
if tx.Error != nil {
return tx.Error
}
}()
//TODO adapter
adapterId, _ := strconv.ParseUint(req.AdapterIds[0], 10, 64)
var clusters []*models.CloudModel
err := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where adapter_id = ? and id in ?", req.AdapterId, req.ClusterIds).Scan(&clusters).Error
err := tx.Raw("SELECT * FROM `t_cluster` where adapter_id in ? and id in ?", req.AdapterIds, req.ClusterIds).Scan(&clusters).Error
if err != nil {
logx.Errorf("CommitGeneralTask() => sql execution error: %v", err)
return errors.Errorf("the cluster does not match the drive resources. Check the data")
}
taskCloud := cloud.TaskCloudModel{}
//TODO 执行策略返回集群跟 Replica
for _, c := range clusters {
opt := &option.CloudOption{}
utils.Convert(&req, &opt)
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc)
if err != nil {
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
return err
}
rs := (results).([]*schedulers.CloudResult)
var synergyStatus int64
if len(rs) > 1 {
synergyStatus = 1
}
var strategy int64
sqlStr := `select t_dict_item.item_value
from t_dict
left join t_dict_item on t_dict.id = t_dict_item.dict_id
where item_text = ?
and t_dict.dict_code = 'schedule_Strategy'`
//查询调度策略
err = tx.Raw(sqlStr, req.Strategy).Scan(&strategy).Error
taskModel := models.Task{
Id: utils.GenSnowflakeID(),
Status: constants.Saved,
Name: req.Name,
CommitTime: time.Now(),
YamlString: strings.Join(req.ReqBody, "\n---\n"),
AdapterTypeDict: 0,
SynergyStatus: synergyStatus,
Strategy: strategy,
}
var taskClouds []cloud.TaskCloudModel
for _, r := range rs {
for _, s := range req.ReqBody {
sStruct := UnMarshalK8sStruct(s)
sStruct := UnMarshalK8sStruct(s, int64(r.Replica))
unString, _ := sStruct.MarshalJSON()
taskCloud.Id = utils.GenSnowflakeIDUint()
taskCloud.TaskId = uint(taskModel.Id)
taskCloud.AdapterId = c.AdapterId
taskCloud.ClusterId = c.Id
taskCloud.ClusterName = c.Name
taskCloud.Status = "Saved"
clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64)
taskCloud.AdapterId = uint(adapterId)
taskCloud.ClusterId = uint(clusterId)
taskCloud.ClusterName = r.ClusterName
taskCloud.Status = constants.Saved
taskCloud.YamlString = string(unString)
taskCloud.Kind = sStruct.GetKind()
taskCloud.Namespace = sStruct.GetNamespace()
tx = l.svcCtx.DbEngin.Create(&taskCloud)
if tx.Error != nil {
logx.Errorf("CommitGeneralTask() create taskCloud => sql execution error: %v", err)
return tx.Error
taskClouds = append(taskClouds, taskCloud)
}
}
adapterName := ""
tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName)
noticeInfo := clientCore.NoticeInfo{
AdapterId: int64(adapterId),
AdapterName: adapterName,
NoticeType: "create",
TaskName: req.Name,
Incident: "任务创建中",
CreatedTime: time.Now(),
}
db := tx.Table("task").Create(&taskModel)
db = tx.Table("task_cloud").Create(&taskClouds)
db = tx.Table("t_notice").Create(&noticeInfo)
if db.Error != nil {
logx.Errorf("Task creation failure, err: %v", db.Error)
return errors.New("task creation failure")
}
return nil
}
func UnMarshalK8sStruct(yamlString string) *unstructured.Unstructured {
func UnMarshalK8sStruct(yamlString string, replica int64) *unstructured.Unstructured {
unstructuredObj := &unstructured.Unstructured{}
d := kyaml.NewYAMLOrJSONDecoder(bytes.NewBufferString(yamlString), 4096)
var err error
@ -113,6 +157,10 @@ func UnMarshalK8sStruct(yamlString string) *unstructured.Unstructured {
if len(unstructuredObj.GetNamespace()) == 0 {
unstructuredObj.SetNamespace("default")
}
//设置副本数
if unstructuredObj.GetKind() == "Deployment" || unstructuredObj.GetKind() == "StatefulSet" {
unstructured.SetNestedField(unstructuredObj.Object, replica, "spec", "replicas")
}
}
return unstructuredObj
}

View File

@ -1,65 +0,0 @@
package core
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"k8s.io/apimachinery/pkg/util/json"
"time"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type CommitHpcTaskLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
return &CommitHpcTaskLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
// 构建主任务结构体
taskModel := models.Task{
Status: constants.Saved,
Description: req.Description,
Name: req.Name,
CommitTime: time.Now(),
}
// 保存任务数据到数据库
tx := l.svcCtx.DbEngin.Create(&taskModel)
if tx.Error != nil {
return nil, tx.Error
}
hpc := models.Hpc{}
tool.Convert(req, &hpc)
mqInfo := response.TaskInfo{
TaskId: taskModel.Id,
TaskType: "hpc",
MatchLabels: req.MatchLabels,
//Metadata: hpc,
}
req.TaskId = taskModel.Id
// 将任务数据转换成消息体
reqMessage, err := json.Marshal(mqInfo)
if err != nil {
logx.Error(err)
return nil, err
}
publish := l.svcCtx.RedisClient.Publish(context.Background(), mqInfo.TaskType, reqMessage)
if publish.Err() != nil {
return nil, publish.Err()
}
return
}

View File

@ -3,11 +3,13 @@ package core
import (
"context"
"fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"math/rand"
"strconv"
"time"
"github.com/zeromicro/go-zero/core/logx"
@ -29,11 +31,24 @@ func NewCommitVmTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Comm
func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *types.CommitVmTaskResp, err error) {
// todo: add your logic here and delete this line
resp = &types.CommitVmTaskResp{}
//Building the main task structure
opt := &option.VmOption{
AdapterId: req.VmOption.AdapterId,
Replicas: req.VmOption.Replicas,
Strategy: req.VmOption.Strategy,
ClusterToStaticWeight: req.VmOption.StaticWeightMap,
Status: constants.Saved,
MatchLabels: req.VmOption.MatchLabels,
StaticWeightMap: req.VmOption.StaticWeightMap,
Name: req.VmOption.Name,
CommitTime: time.Now(),
}
taskModel := models.Task{
Status: constants.Saved,
Name: req.Name,
Name: req.VmOption.Name,
CommitTime: time.Now(),
Description: "vm task",
}
// Save task data to database
tx := l.svcCtx.DbEngin.Create(&taskModel)
@ -41,38 +56,63 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
return nil, tx.Error
}
//var clusters []*models.VmModel
//err2 := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where adapter_id in ? and id in ?", req.VmOption.AdapterId, req.VmOption.VmClusterIds).Scan(&clusters).Error
//if err2 != nil {
// logx.Errorf("CommitGeneralTask() => sql execution error: %v", err)
// //return errors.Errorf("the cluster does not match the drive resources. Check the data"), nil
//}
taskVm := models.TaskVm{}
//TODO 执行策略返回集群跟 Replica
/*opt := &option.VmOption{}
utils.Convert(&req, &opt)*/
// 2、Initialize scheduler
vmSchdl, err := schedulers.NewVmScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, l.svcCtx.DbEngin, l.svcCtx.PromClient)
if err != nil {
return nil, err
}
// 3、Return scheduling results
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl)
if err != nil {
return nil, err
}
rs := (results).([]*schedulers.VmResult)
for _, r := range rs {
for _, CreateMulServer := range req.CreateMulServer {
if r.Replica > 0 && r.ClusterId == CreateMulServer.ClusterId {
fmt.Println("", req.CreateMulServer)
var clusterIds []int64
l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds)
l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? ", req.VmOption.AdapterId).Scan(&clusterIds)
if len(clusterIds) == 0 || clusterIds == nil {
return nil, nil
}
vmInfo := models.TaskVm{
TaskId: taskModel.Id,
ClusterId: clusterIds[rand.Intn(len(clusterIds))],
Name: taskModel.Name,
Status: "Saved",
StartTime: time.Now().String(),
MinCount: CreateMulServer.Min_count,
ImageRef: CreateMulServer.ImageRef,
FlavorRef: CreateMulServer.FlavorRef,
Uuid: CreateMulServer.Uuid,
Platform: CreateMulServer.Platform,
}
tx = l.svcCtx.DbEngin.Create(&vmInfo)
adapterId, _ := strconv.ParseUint(req.VmOption.AdapterId, 10, 64)
taskVm.AdapterId = int64(adapterId)
clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64)
taskVm.ClusterId = int64(clusterId)
taskVm.Name = req.VmOption.Name
taskVm.TaskId = taskModel.Id
clusterId, _ = strconv.ParseUint(r.ClusterId, 10, 64)
taskVm.ClusterId = int64(clusterId)
taskVm.Status = "Saved"
taskVm.StartTime = time.Now().String()
taskVm.MinCount = CreateMulServer.Min_count
taskVm.ImageRef = CreateMulServer.ImageRef
taskVm.FlavorRef = CreateMulServer.FlavorRef
taskVm.Uuid = CreateMulServer.Uuid
taskVm.Platform = CreateMulServer.Platform
tx = l.svcCtx.DbEngin.Create(&taskVm)
if tx.Error != nil {
return nil, tx.Error
}
resp = &types.CommitVmTaskResp{
Code: 200,
Msg: "success",
TaskId: taskModel.Id,
}
}
}
resp.Code = 200
resp.Msg = "Success"
return resp, nil
}

View File

@ -30,7 +30,7 @@ func (l *CountTaskStatusLogic) CountTaskStatus() (resp *types.TaskStatusResp, er
COUNT(CASE WHEN status = 'Succeeded' THEN 1 END) AS Succeeded,
COUNT(CASE WHEN status = 'Failed' THEN 1 END) AS Failed,
COUNT(CASE WHEN status = 'Running' THEN 1 END) AS Running,
COUNT(CASE WHEN status = 'Pause' THEN 1 END) AS Pause
COUNT(CASE WHEN status = 'Saved' THEN 1 END) AS Saved
FROM task;`
err = l.svcCtx.DbEngin.Raw(sqlStr).Scan(&resp).Error
if err != nil {

View File

@ -3,6 +3,8 @@ package core
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
"time"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@ -28,7 +30,7 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa
limit := req.PageSize
offset := req.PageSize * (req.PageNum - 1)
resp = &types.PageResult{}
var list []types.TaskModel
var list []*types.TaskModel
db := l.svcCtx.DbEngin.Model(&types.TaskModel{}).Table("task")
db = db.Where("deleted_at is null")
@ -48,8 +50,18 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa
if err != nil {
return nil, result.NewDefaultError(err.Error())
}
resp.List = list
for _, model := range list {
if model.EndTime != "" && model.StartTime != "" {
startTime := timeutils.TimeStringToGoTime(model.StartTime)
endTime := timeutils.TimeStringToGoTime(model.EndTime)
model.RunningTime = int64(endTime.Sub(startTime).Seconds())
}
if model.StartTime != "" {
startTime := timeutils.TimeStringToGoTime(model.StartTime)
model.RunningTime = int64(time.Now().Sub(startTime).Seconds())
}
}
resp.List = &list
resp.PageSize = req.PageSize
resp.PageNum = req.PageNum
resp.Total = total

View File

@ -5,6 +5,7 @@ import (
"github.com/jinzhu/copier"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm"
@ -54,7 +55,7 @@ func (l *PullTaskInfoLogic) PullTaskInfo(req *clientCore.PullTaskInfoReq) (*clie
}
}
case 0:
var cloudModelList []models.Cloud
var cloudModelList []cloud.TaskCloudModel
err := findModelList(req.AdapterId, l.svcCtx.DbEngin, &cloudModelList)
if err != nil {
return nil, err

View File

@ -2,14 +2,15 @@ package core
import (
"context"
"github.com/pkg/errors"
"github.com/zeromicro/go-zero/core/logx"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gorm.io/gorm"
"strings"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"time"
)
type PushTaskInfoLogic struct {
@ -33,9 +34,14 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie
switch kind {
case 0:
for _, cloudInfo := range req.CloudInfoList {
l.svcCtx.DbEngin.Exec("update cloud set status = ?,start_time = ?,result = ? where participant_id = ? and id = ?",
cloudInfo.Status, cloudInfo.StartTime, cloudInfo.Result, req.AdapterId, cloudInfo.Id)
syncTask(l.svcCtx.DbEngin, cloudInfo.TaskId)
var taskId uint
result := l.svcCtx.DbEngin.Table("task_cloud").Select("task_id").Where("task_id = ?", cloudInfo.TaskId).Find(&taskId)
if errors.Is(result.Error, gorm.ErrRecordNotFound) {
return nil, errors.New("Record does not exist")
}
l.svcCtx.DbEngin.Exec("update task_cloud set status = ?,start_time = ?,result = ? where task_id = ?",
cloudInfo.Status, cloudInfo.StartTime, cloudInfo.Result, cloudInfo.TaskId)
syncTask(l.svcCtx.DbEngin, int64(taskId))
}
case 2:
for _, hpcInfo := range req.HpcInfoList {
@ -63,7 +69,7 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie
func syncTask(gorm *gorm.DB, taskId int64) {
var allStatus string
tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join hpc h on t.id = h.task_id left join cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus)
tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus)
if tx.Error != nil {
logx.Error(tx.Error)
}
@ -71,7 +77,6 @@ func syncTask(gorm *gorm.DB, taskId int64) {
statusArray := strings.Split(allStatus, ",")
if len(removeRepeatedElement(statusArray)) == 1 {
updateTask(gorm, taskId, statusArray[0])
}
// 子任务包含失败状态 主任务则失败
if strings.Contains(allStatus, constants.Failed) {
@ -85,10 +90,14 @@ func syncTask(gorm *gorm.DB, taskId int64) {
}
func updateTask(gorm *gorm.DB, taskId int64, status string) {
now := time.Now()
var task models.Task
gorm.Where("id = ? ", taskId).Find(&task)
if task.Status != status {
task.Status = status
if status == constants.Running {
task.StartTime = &now
}
gorm.Updates(&task)
}
}

View File

@ -0,0 +1,54 @@
package core
import (
"context"
"github.com/pkg/errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm"
"github.com/zeromicro/go-zero/core/logx"
)
type TaskDetailsLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewTaskDetailsLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskDetailsLogic {
return &TaskDetailsLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *TaskDetailsLogic) TaskDetails(req *types.FId) (resp *types.TaskDetailsResp, err error) {
resp = &types.TaskDetailsResp{}
task := &models.Task{}
if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) {
return nil, errors.New("记录不存在")
}
clusterIds := make([]int64, 0)
var cList []*types.ClusterInfo
switch task.AdapterTypeDict {
case 0:
l.svcCtx.DbEngin.Table("task_cloud").Select("cluster_id").Where("task_id", task.Id).Scan(&clusterIds)
case 1:
l.svcCtx.DbEngin.Table("task_ai").Select("cluster_id").Where("task_id", task.Id).Scan(&clusterIds)
case 2:
l.svcCtx.DbEngin.Table("task_hpc").Select("cluster_id").Where("task_id", task.Id).Scan(&clusterIds)
case 3:
l.svcCtx.DbEngin.Table("task_vm").Select("cluster_id").Where("task_id", task.Id).Find(&clusterIds)
}
err = l.svcCtx.DbEngin.Table("t_cluster").Where("id in ?", clusterIds).Scan(&cList).Error
if err != nil {
return resp, err
}
utils.Convert(&task, &resp)
resp.ClusterInfos = cList
return
}

View File

@ -93,13 +93,12 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe
pStatus = "Normal"
}
}
resp.Tasks = append(resp.Tasks, types.Task{
Id: task.Id,
Name: task.Name,
Status: task.Status,
StartTime: task.StartTime,
EndTime: task.EndTime,
StartTime: task.StartTime.Format("2006-01-02 15:04:05"),
EndTime: task.EndTime.Format("2006-01-02 15:04:05"),
ParticipantId: pInfo.Id,
ParticipantName: pInfo.Name,
ParticipantStatus: pStatus,

View File

@ -32,11 +32,15 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
// 构建主任务结构体
taskModel := models.Task{
Status: constants.Saved,
Description: req.Description,
Name: req.Name,
Description: req.Description,
Status: constants.Saved,
Strategy: 0,
SynergyStatus: 0,
CommitTime: time.Now(),
AdapterTypeDict: 2,
}
// 保存任务数据到数据库
tx := l.svcCtx.DbEngin.Create(&taskModel)
if tx.Error != nil {
@ -49,7 +53,9 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
env, _ := json.Marshal(req.Environment)
if len(clusterIds) == 0 || clusterIds == nil {
return nil, nil
resp.Code = 400
resp.Msg = "no cluster found"
return resp, nil
}
hpcInfo := models.TaskHpc{

View File

@ -0,0 +1,82 @@
package monitoring
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"strings"
"github.com/zeromicro/go-zero/core/logx"
)
type ScheduleSituationLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewScheduleSituationLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleSituationLogic {
return &ScheduleSituationLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ScheduleSituationLogic) ScheduleSituation() (resp *types.ScheduleSituationResp, err error) {
resp = &types.ScheduleSituationResp{}
// node region
tx := l.svcCtx.DbEngin.Raw("SELECT c.id, c.name, tdi.id AS category, count(DISTINCT ta.id)+count(DISTINCT tc.id)+COUNT(DISTINCT th.id)+COUNT(tv.id) as value FROM t_cluster c LEFT JOIN t_dict_item tdi ON c.region_dict = tdi.id left JOIN task_ai ta ON ta.cluster_id = c.id left JOIN task_cloud tc ON tc.cluster_id = c.id left JOIN task_hpc th ON th.cluster_id = c.id left JOIN task_vm tv ON tv.cluster_id = c.id WHERE tc.deleted_at IS NULL GROUP BY c.id").Scan(&resp.Nodes)
if tx.Error != nil {
return nil, tx.Error
}
// hpc
var hpcLinks []string
tx = l.svcCtx.DbEngin.Raw("SELECT GROUP_CONCAT(cluster_id SEPARATOR ',') as cluster_ids FROM task_hpc WHERE deleted_at IS NULL GROUP BY task_id HAVING COUNT(*) > 1;").Scan(&hpcLinks)
if tx.Error != nil {
return nil, tx.Error
}
LinksHandler(hpcLinks, resp)
// cloud
var cloudLinks []string
tx = l.svcCtx.DbEngin.Raw("SELECT GROUP_CONCAT(cluster_id SEPARATOR ',') as cluster_ids FROM task_cloud WHERE deleted_at IS NULL GROUP BY task_id HAVING COUNT(*) > 1;").Scan(&cloudLinks)
if tx.Error != nil {
return nil, tx.Error
}
LinksHandler(cloudLinks, resp)
// ai
var aiLinks []string
tx = l.svcCtx.DbEngin.Raw("SELECT GROUP_CONCAT(cluster_id SEPARATOR ',') as cluster_ids FROM task_ai WHERE deleted_at IS NULL GROUP BY task_id HAVING COUNT(*) > 1;").Scan(&aiLinks)
if tx.Error != nil {
return nil, tx.Error
}
LinksHandler(aiLinks, resp)
// vm
var vmLinks []string
tx = l.svcCtx.DbEngin.Raw("SELECT GROUP_CONCAT(cluster_id SEPARATOR ',') as cluster_ids FROM task_vm WHERE deleted_at IS NULL GROUP BY task_id HAVING COUNT(*) > 1;").Scan(&vmLinks)
if tx.Error != nil {
return nil, tx.Error
}
LinksHandler(vmLinks, resp)
// categories
tx = l.svcCtx.DbEngin.Raw("select tdi.item_text as name from t_dict_item tdi,t_dict td where td.dict_code = 'cluster_region_dict' and tdi.dict_id = td.id").Scan(&resp.Categories)
if tx.Error != nil {
return nil, tx.Error
}
return resp, nil
}
func LinksHandler(sources []string, resp *types.ScheduleSituationResp) {
for _, source := range sources {
links := strings.Split(source, ",")
for i := 1; i < len(links); i++ {
if links[i] != links[i-1] {
resp.Links = append(resp.Links, types.Link{Source: links[i], Target: links[i-1]})
}
}
}
}

View File

@ -26,7 +26,11 @@ func NewScheduleGetAiJobLogLogLogic(ctx context.Context, svcCtx *svc.ServiceCont
func (l *ScheduleGetAiJobLogLogLogic) ScheduleGetAiJobLogLog(req *types.AiJobLogReq) (resp *types.AiJobLogResp, err error) {
resp = &types.AiJobLogResp{}
log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetTrainingTaskLog(l.ctx, req.TaskId, req.InstanceNum)
id, err := l.svcCtx.Scheduler.AiStorages.GetAiTaskIdByClusterIdAndTaskId(req.ClusterId, req.TaskId)
if err != nil {
return nil, err
}
log, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetTrainingTaskLog(l.ctx, id, req.InstanceNum)
if err != nil {
return nil, err
}

View File

@ -0,0 +1,30 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type ScheduleGetOverviewLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewScheduleGetOverviewLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleGetOverviewLogic {
return &ScheduleGetOverviewLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ScheduleGetOverviewLogic) ScheduleGetOverview() (resp *types.ScheduleOverviewResp, err error) {
// todo: add your logic here and delete this line
return
}

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"github.com/zeromicro/go-zero/core/logx"
)
@ -28,7 +29,9 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
resp = &types.ScheduleResp{}
opt := &option.AiOption{
AdapterId: req.AiOption.AdapterId,
TaskName: req.AiOption.TaskName,
ResourceType: req.AiOption.ResourceType,
Replica: 1,
Tops: req.AiOption.Tops,
TaskType: req.AiOption.TaskType,
DatasetsName: req.AiOption.Datasets,
@ -52,6 +55,17 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
switch opt.GetOptionType() {
case option.AI:
rs := (results).([]*schedulers.AiResult)
var synergystatus int64
if len(rs) > 1 {
synergystatus = 1
}
strategyCode, err := l.svcCtx.Scheduler.AiStorages.GetStrategyCode(req.AiOption.Strategy)
id, err := l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName, strategyCode, synergystatus)
if err != nil {
return nil, err
}
for _, r := range rs {
scheResult := &types.ScheduleResult{}
scheResult.ClusterId = r.ClusterId
@ -59,12 +73,13 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
scheResult.Strategy = r.Strategy
scheResult.Replica = r.Replica
scheResult.Msg = r.Msg
resp.Results = append(resp.Results, scheResult)
}
err = l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName)
err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Saved, r.Msg)
if err != nil {
return nil, err
}
resp.Results = append(resp.Results, scheResult)
}
}
return resp, nil

View File

@ -16,8 +16,6 @@ package mqs
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
@ -38,28 +36,28 @@ func NewCloudMq(ctx context.Context, svcCtx *svc.ServiceContext) *CloudMq {
func (l *CloudMq) Consume(val string) error {
// 接受消息, 根据标签筛选过滤
cloudScheduler := schedulers.NewCloudScheduler()
schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
if err != nil {
return err
}
//检测是否指定了集群列表
schdl.SpecifyClusters()
//检测是否指定了nsID
schdl.SpecifyNsID()
//通过标签匹配筛选出集群范围
schdl.MatchLabels()
//todo 屏蔽原调度算法,因为监控数据暂未上报,临时采用随机调度
schdl.TempAssign()
// 存储数据
err = schdl.SaveToDb()
if err != nil {
return err
}
//cloudScheduler := schedulers.NewCloudScheduler()
//schdl, err := scheduler.NewScheduler(cloudScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
//if err != nil {
// return err
//}
//
////检测是否指定了集群列表
//schdl.SpecifyClusters()
//
////检测是否指定了nsID
//schdl.SpecifyNsID()
//
////通过标签匹配筛选出集群范围
//schdl.MatchLabels()
//
////todo 屏蔽原调度算法,因为监控数据暂未上报,临时采用随机调度
//schdl.TempAssign()
//
//// 存储数据
//err = schdl.SaveToDb()
//if err != nil {
// return err
//}
return nil
}

View File

@ -2,8 +2,6 @@ package mqs
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
@ -24,28 +22,28 @@ func NewVmMq(ctx context.Context, svcCtx *svc.ServiceContext) *VmMq {
func (l *VmMq) Consume(val string) error {
// 接受消息, 根据标签筛选过滤
vmScheduler := schedulers.NewVmScheduler()
schdl, err := scheduler.NewScheduler(vmScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
if err != nil {
return err
}
//检测是否指定了集群列表
schdl.SpecifyClusters()
//检测是否指定了nsID
schdl.SpecifyNsID()
//通过标签匹配筛选出集群范围
schdl.MatchLabels()
//todo 屏蔽原调度算法,因为监控数据暂未上报,临时采用随机调度
schdl.TempAssign()
// 存储数据
err = schdl.SaveToDb()
if err != nil {
return err
}
//vmScheduler := schedulers.NewVmScheduler()
//schdl, err := scheduler.NewScheduler(vmScheduler, val, l.svcCtx.DbEngin, l.svcCtx.ParticipantRpc)
//if err != nil {
// return err
//}
//
////检测是否指定了集群列表
//schdl.SpecifyClusters()
//
////检测是否指定了nsID
//schdl.SpecifyNsID()
//
////通过标签匹配筛选出集群范围
//schdl.MatchLabels()
//
////todo 屏蔽原调度算法,因为监控数据暂未上报,临时采用随机调度
//schdl.TempAssign()
//
//// 存储数据
//err = schdl.SaveToDb()
//if err != nil {
// return err
//}
return nil
}

View File

@ -2,10 +2,12 @@ package database
import (
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gorm.io/gorm"
"strconv"
"time"
)
@ -48,22 +50,183 @@ func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) {
return ids, nil
}
func (s *AiStorage) SaveTask(name string) error {
func (s *AiStorage) GetAdaptersByType(adapterType string) ([]*types.AdapterInfo, error) {
var list []*types.AdapterInfo
db := s.DbEngin.Model(&types.AdapterInfo{}).Table("t_adapter")
db = db.Where("type = ?", adapterType)
err := db.Order("create_time desc").Find(&list).Error
if err != nil {
return nil, err
}
return list, nil
}
func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, error) {
var resp []*models.TaskAi
tx := s.DbEngin.Raw("select * from task_ai where `adapter_id` = ? ", adapterId).Scan(&resp)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return nil, tx.Error
}
return resp, nil
}
func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64) (int64, error) {
// 构建主任务结构体
taskModel := models.Task{
Status: constants.Saved,
Description: "ai task",
Name: name,
SynergyStatus: synergyStatus,
Strategy: strategyCode,
AdapterTypeDict: 1,
CommitTime: time.Now(),
}
// 保存任务数据到数据库
tx := s.DbEngin.Create(&taskModel)
if tx.Error != nil {
return 0, tx.Error
}
return taskModel.Id, nil
}
func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, clusterId string, jobId string, status string, msg string) error {
// 构建主任务结构体
aId, err := strconv.ParseInt(option.AdapterId, 10, 64)
if err != nil {
return err
}
cId, err := strconv.ParseInt(clusterId, 10, 64)
if err != nil {
return err
}
aiTaskModel := models.TaskAi{
TaskId: taskId,
AdapterId: aId,
ClusterId: cId,
Name: option.TaskName,
Replica: option.Replica,
JobId: jobId,
TaskType: option.TaskType,
Strategy: option.StrategyName,
Status: status,
Msg: msg,
CommitTime: time.Now(),
}
// 保存任务数据到数据库
tx := s.DbEngin.Create(&aiTaskModel)
if tx.Error != nil {
return tx.Error
}
return nil
}
func (s *AiStorage) UpdateTask() error {
func (s *AiStorage) SaveClusterTaskQueue(adapterId string, clusterId string, queueNum int64) error {
aId, err := strconv.ParseInt(adapterId, 10, 64)
if err != nil {
return err
}
cId, err := strconv.ParseInt(clusterId, 10, 64)
if err != nil {
return err
}
taskQueue := models.TClusterTaskQueue{
AdapterId: aId,
ClusterId: cId,
QueueNum: queueNum,
}
tx := s.DbEngin.Create(&taskQueue)
if tx.Error != nil {
return tx.Error
}
return nil
}
func (s *AiStorage) GetClusterTaskQueues(adapterId string, clusterId string) ([]*models.TClusterTaskQueue, error) {
var taskQueues []*models.TClusterTaskQueue
tx := s.DbEngin.Raw("select * from t_cluster_task_queue where `adapter_id` = ? and `cluster_id` = ?", adapterId, clusterId).Scan(&taskQueues)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return nil, tx.Error
}
return taskQueues, nil
}
func (s *AiStorage) GetAiTaskIdByClusterIdAndTaskId(clusterId string, taskId string) (string, error) {
var aiTask models.TaskAi
tx := s.DbEngin.Raw("select * from task_ai where `cluster_id` = ? and `task_id` = ?", clusterId, taskId).Scan(&aiTask)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return "", tx.Error
}
return aiTask.JobId, nil
}
func (s *AiStorage) GetClusterResourcesById(clusterId string) (*models.TClusterResource, error) {
var clusterResource models.TClusterResource
tx := s.DbEngin.Raw("select * from t_cluster_resource where `cluster_id` = ?", clusterId).Scan(&clusterResource)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return nil, tx.Error
}
return &clusterResource, nil
}
func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, clusterType int64, cpuAvail float64, cpuTotal float64,
memAvail float64, memTotal float64, diskAvail float64, diskTotal float64, gpuAvail float64, gpuTotal float64, cardTotal int64, topsTotal float64) error {
cId, err := strconv.ParseInt(clusterId, 10, 64)
if err != nil {
return err
}
clusterResource := models.TClusterResource{
ClusterId: cId,
ClusterName: clusterName,
ClusterType: clusterType,
CpuAvail: cpuAvail,
CpuTotal: cpuTotal,
MemAvail: memAvail,
MemTotal: memTotal,
DiskAvail: diskAvail,
DiskTotal: diskTotal,
GpuAvail: gpuAvail,
GpuTotal: gpuTotal,
CardTotal: cardTotal,
CardTopsTotal: topsTotal,
}
tx := s.DbEngin.Create(&clusterResource)
if tx.Error != nil {
return tx.Error
}
return nil
}
func (s *AiStorage) UpdateClusterResources(clusterResource *models.TClusterResource) error {
tx := s.DbEngin.Updates(clusterResource)
if tx.Error != nil {
return tx.Error
}
return nil
}
func (s *AiStorage) UpdateAiTask(task *models.TaskAi) error {
tx := s.DbEngin.Updates(task)
if tx.Error != nil {
return tx.Error
}
return nil
}
func (s *AiStorage) GetStrategyCode(name string) (int64, error) {
var strategy int64
sqlStr := `select t_dict_item.item_value
from t_dict
left join t_dict_item on t_dict.id = t_dict_item.dict_id
where item_text = ?
and t_dict.dict_code = 'schedule_Strategy'`
//查询调度策略
err := s.DbEngin.Raw(sqlStr, name).Scan(&strategy).Error
if err != nil {
return strategy, nil
}
return strategy, nil
}

View File

@ -129,42 +129,19 @@ func (s *Scheduler) TempAssign() error {
}
func (s *Scheduler) AssignAndSchedule(ss SubSchedule) (interface{}, error) {
//// 已指定 ParticipantId
//if s.task.ParticipantId != 0 {
// return nil
//}
//// 标签匹配以及后未找到ParticipantIds
//if len(s.participantIds) == 0 {
// return errors.New("未找到匹配的ParticipantIds")
//}
//
//// 指定或者标签匹配的结果只有一个集群,给任务信息指定
//if len(s.participantIds) == 1 {
// s.task.ParticipantId = s.participantIds[0]
// //replicas := s.task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
// //result := make(map[int64]string)
// //result[s.participantIds[0]] = strconv.FormatFloat(replicas, 'f', 2, 64)
// //s.result = result
//
// return nil
//}
//choose strategy
strategy, err := ss.PickOptimalStrategy()
if err != nil {
return nil, err
}
//schedule
clusters, err := strategy.Schedule()
if err != nil {
return nil, err
}
//集群数量不满足,指定到标签匹配后第一个集群
//if len(providerList) < 2 {
// s.task.ParticipantId = s.participantIds[0]
// return nil
//}
//assign tasks to clusters
resp, err := ss.AssignTask(clusters)
if err != nil {
return nil, err

View File

@ -26,6 +26,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gitlink.org.cn/JointCloud/pcm-octopus/octopus"
@ -168,32 +169,52 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
errs = append(errs, e)
}
if len(errs) == len(clusters) {
return nil, errors.New("submit task failed")
for s := range ch {
results = append(results, s)
}
if len(errs) != 0 {
var msg string
var synergystatus int64
if len(clusters) > 1 {
synergystatus = 1
}
strategyCode, err := as.AiStorages.GetStrategyCode(as.option.StrategyName)
taskId, err := as.AiStorages.SaveTask(as.option.TaskName, strategyCode, synergystatus)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
var errmsg string
for _, err := range errs {
e := (err).(struct {
err error
clusterId string
})
msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, "", constants.Failed, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
}
for s := range ch {
if s.Msg != "" {
msg += fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, "", constants.Failed, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
} else {
msg += fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId)
msg := fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId)
errmsg += msg
err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, s.TaskId, constants.Succeeded, msg)
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}
}
return nil, errors.New(msg)
}
for s := range ch {
// TODO: database operation
results = append(results, s)
return nil, errors.New(errmsg)
}
return results, nil

View File

@ -15,106 +15,176 @@
package schedulers
import (
"bytes"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database"
"context"
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"io"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
syaml "k8s.io/apimachinery/pkg/runtime/serializer/yaml"
kyaml "k8s.io/apimachinery/pkg/util/yaml"
"gorm.io/gorm"
"math"
"time"
)
type CloudScheduler struct {
storage database.Storage
yamlString string
task *response.TaskInfo
*scheduler.Scheduler
option *option.CloudOption
ctx context.Context
dbEngin *gorm.DB
promClient tracker.Prometheus
svcCtx *svc.ServiceContext
}
func NewCloudScheduler() *CloudScheduler {
return &CloudScheduler{}
type CloudResult struct {
TaskId string
ClusterId string
ClusterName string
Strategy string
Replica int32
Msg string
}
func (cs *CloudScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
//获取所有计算中心
//调度算法
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{})
func NewCloudScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.CloudOption, dbEngin *gorm.DB, promClient tracker.Prometheus) (*CloudScheduler, error) {
return &CloudScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option, dbEngin: dbEngin, promClient: promClient}, nil
}
func (as *CloudScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
c := cloud.TaskCloudModel{
AdapterId: uint(participantId),
TaskId: uint(task.TaskId),
Status: constants.Saved,
YamlString: as.yamlString,
}
utils.Convert(task.Metadata, &c)
return c, nil
}
func (as *CloudScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
if len(as.option.ClusterIds) == 1 {
return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil
}
resources, err := as.findClustersWithResources()
if err != nil {
return nil, err
}
if len(resources) == 0 {
return nil, errors.New("no cluster has resources")
}
if len(resources) == 1 {
var cluster strategy.AssignedCluster
cluster.ClusterId = resources[0].ClusterId
cluster.Replicas = 1
return &strategy.SingleAssignment{Cluster: &cluster}, nil
}
params := &param.Params{Resources: resources}
switch as.option.Strategy {
case strategy.REPLICATION:
var clusterIds []string
for _, resource := range resources {
clusterIds = append(clusterIds, resource.ClusterId)
}
strategy := strategy.NewReplicationStrategy(clusterIds, as.option.Replica)
return strategy, nil
case strategy.RESOURCES_PRICING:
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: as.option.Replica})
return strategy, nil
case strategy.DYNAMIC_RESOURCES:
strategy := strategy.NewDynamicResourcesStrategy(params.Resources, as.option, 1)
return strategy, nil
case strategy.STATIC_WEIGHT:
//todo resources should match cluster StaticWeightMap
strategy := strategy.NewStaticWeightStrategy(as.option.StaticWeightMap, as.option.Replica)
return strategy, nil
}
return nil, errors.New("no strategy has been chosen")
}
func (cs *CloudScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
cloud := cs.UnMarshalK8sStruct(resource, task.TaskId, task.NsID)
cloud.Id = utils.GenSnowflakeID()
cloud.NsID = task.NsID
func (as *CloudScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interface{}, error) {
if clusters == nil {
return nil, errors.New("clusters is nil")
}
cloud.ParticipantId = participantId
return cloud, nil
for i := len(clusters) - 1; i >= 0; i-- {
if clusters[i].Replicas == 0 {
clusters = append(clusters[:i], clusters[i+1:]...)
}
}
if len(clusters) == 0 {
return nil, errors.New("clusters is nil")
}
var results []*CloudResult
for _, cluster := range clusters {
cName := ""
as.dbEngin.Table("t_cluster").Select("name").Where("id=?", cluster.ClusterId).Find(&cName)
cr := CloudResult{
ClusterId: cluster.ClusterId,
ClusterName: cName,
Replica: cluster.Replicas,
}
cr.ClusterId = cluster.ClusterId
cr.Replica = cluster.Replicas
cr.ClusterName = cName
results = append(results, &cr)
}
return results, nil
}
func (cs *CloudScheduler) UnMarshalK8sStruct(yamlString string, taskId int64, nsID string) models.Cloud {
var cloud models.Cloud
d := kyaml.NewYAMLOrJSONDecoder(bytes.NewBufferString(yamlString), 4096)
var err error
for {
var rawObj runtime.RawExtension
err = d.Decode(&rawObj)
if err == io.EOF {
break
func (as *CloudScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
resp := []*collector.ResourceStats{}
//查询集群资源信息
var rMetrics []tracker.Metric
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation"}
var clusterNames []string
as.dbEngin.Table("t_cluster").Select("name").Where("id in ?", as.option.ClusterIds).Find(&clusterNames)
for _, c := range clusterNames {
rMetrics = as.promClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: c})
r := collector.ResourceStats{}
var cid string
as.dbEngin.Table("t_cluster").Select("id").Where("name = ?", c).Find(&cid)
r.ClusterId = cid
r.Name = c
for _, metric := range rMetrics {
if metric.MetricName == "cluster_cpu_total" {
r.CpuCoreTotal = int64(metric.MetricData.MetricValues[0].Sample.Value())
}
if err != nil {
if metric.MetricName == "cluster_cpu_avail" {
cpuAvail := metric.MetricData.MetricValues[0].Sample.Value()
r.CpuCoreAvail = int64(math.Round(cpuAvail))
}
obj := &unstructured.Unstructured{}
syaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme).Decode(rawObj.Raw, nil, obj)
if err != nil {
if metric.MetricName == "cluster_memory_total" {
r.MemTotal = metric.MetricData.MetricValues[0].Sample.Value()
}
unstructuredMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj)
if err != nil {
if metric.MetricName == "cluster_memory_avail" {
r.MemAvail = metric.MetricData.MetricValues[0].Sample.Value()
}
unstructureObj := &unstructured.Unstructured{Object: unstructuredMap}
if len(nsID) != 0 {
unstructureObj.SetNamespace(nsID)
if metric.MetricName == "cluster_disk_total" {
r.DiskTotal = metric.MetricData.MetricValues[0].Sample.Value()
}
cloud = models.Cloud{
TaskId: taskId,
ApiVersion: unstructureObj.GetAPIVersion(),
Name: unstructureObj.GetName(),
Kind: unstructureObj.GetKind(),
Namespace: unstructureObj.GetNamespace(),
Status: "Saved",
if metric.MetricName == "cluster_disk_avail" {
r.DiskAvail = metric.MetricData.MetricValues[0].Sample.Value()
}
// 命名空间为空 设置默认值
if len(unstructureObj.GetNamespace()) == 0 {
cloud.Namespace = "default"
}
//unstructureObj转成string
unString, _ := unstructureObj.MarshalJSON()
cloud.YamlString = string(unString)
resp = append(resp, &r)
}
return cloud
}
func (cs *CloudScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider, error) {
proParams, err := cs.storage.GetProviderParams()
if err != nil {
return nil, nil, nil
}
var providerList []*providerPricing.Provider
for _, p := range proParams {
provider := providerPricing.NewProvider(p.Participant_id, p.Cpu_avail, p.Mem_avail, p.Disk_avail, 0.0, 0.0, 0.0)
providerList = append(providerList, provider)
}
//replicas := task.Metadata.(map[string]interface{})["spec"].(map[string]interface{})["replicas"].(float64)
//t := algorithm.NewTask(0, int(replicas), 2, 75120000, 301214500, 1200, 2, 6, 2000)
return nil, providerList, nil
}
func (cs *CloudScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interface{}, error) {
return nil, nil
return resp, nil
}

View File

@ -4,6 +4,7 @@ type AiOption struct {
AdapterId string
ClusterIds []string
TaskName string
Replica int64
ResourceType string // cpu/gpu/compute card
CpuCoreNum int64
TaskType string // pytorch/tensorflow/mindspore

View File

@ -1,7 +1,13 @@
package option
type CloudOption struct {
task interface{}
Name string `json:"name"`
AdapterIds []string `json:"adapterIds"`
ClusterIds []string `json:"clusterIds"`
Strategy string `json:"strategy"`
StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"`
ReqBody []string `json:"reqBody"`
Replica int32 `json:"replicas,string"`
}
func (c CloudOption) GetOptionType() string {

View File

@ -4,6 +4,7 @@ const (
AI = "ai"
CLOUD = "cloud"
HPC = "hpc"
VM = "vm"
)
type Option interface {

View File

@ -0,0 +1,49 @@
package option
import "time"
type VmOption struct {
AdapterId string
ClusterIds []string
TaskName string
ResourceType string // cpu/gpu/compute card
TaskType string // pytorch/tensorflow/mindspore
Strategy string
ClusterToStaticWeight map[string]int32
CommitTime time.Time
NsID string
Replicas int64
MatchLabels map[string]string
StaticWeightMap map[string]int32
CreateMulServer []CreateMulDomainServer
Id int64
ParticipantId int64
TaskId int64
Name string
ClusterId int64
FlavorRef string
ImageRef string
Status string
Platform string
Description string
AvailabilityZone string
MinCount int64
Uuid string
StartTime string
RunningTime string
Result string
DeletedAt string
}
type CreateMulDomainServer struct {
Platform string
Name string
Min_count int64
ImageRef string
FlavorRef string
Uuid string
}
func (a VmOption) GetOptionType() string {
return VM
}

View File

@ -1,29 +1,96 @@
package schedulers
import (
"context"
"github.com/pkg/errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gorm.io/gorm"
)
type VmScheduler struct {
yamlString string
storage database.Storage
task *response.TaskInfo
*scheduler.Scheduler
option *option.VmOption
ctx context.Context
promClient tracker.Prometheus
dbEngin *gorm.DB
}
func NewVmScheduler() *VmScheduler {
return &VmScheduler{}
type VmResult struct {
TaskId string
ClusterId string
ClusterName string
Strategy string
Replica int32
Msg string
}
func NewVmScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.VmOption, dbEngin *gorm.DB, promClient tracker.Prometheus) (*VmScheduler, error) {
return &VmScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option, dbEngin: dbEngin, promClient: promClient}, nil
}
/*func NewCloudScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.CloudOption, dbEngin *gorm.DB, promClient tracker.Prometheus) (*CloudScheduler, error) {
return &CloudScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option, dbEngin: dbEngin, promClient: promClient}, nil
}*/
func (vm *VmScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
//获取所有计算中心
//调度算法
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{})
if len(vm.option.ClusterIds) == 1 {
// TODO database operation Find
return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: vm.option.ClusterIds[0], Replicas: 1}}, nil
}
//resources, err := vm.findClustersWithResources()
/* if err != nil {
return nil, err
}*/
/* if len(resources) == 0 {
return nil, errors.New("no cluster has resources")
}*/
//
//if len(resources) == 1 {
// var cluster strategy.AssignedCluster
// cluster.ClusterId = resources[0].ClusterId
// cluster.Replicas = 1
// return &strategy.SingleAssignment{Cluster: &cluster}, nil
//}
//params := &param.Params{Resources: resources}
switch vm.option.Strategy {
/* case strategy.REPLICATION:
var clusterIds []string
for _, resource := range resources {
clusterIds = append(clusterIds, resource.ClusterId)
}
strategy := strategy.NewReplicationStrategy(clusterIds, 1)
return strategy, nil
case strategy.RESOURCES_PRICING:
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
return strategy, nil
case strategy.DYNAMIC_RESOURCES:
strategy := strategy.NewDynamicResourcesStrategy(params.Resources, vm.option, 1)
return strategy, nil*/
case strategy.STATIC_WEIGHT:
//todo resources should match cluster StaticWeightMap
strategy := strategy.NewStaticWeightStrategy(vm.option.ClusterToStaticWeight, 1)
return strategy, nil
}
/*strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{})
return strategy, nil*/
return nil, errors.New("no strategy has been chosen")
}
func (v *VmScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
@ -41,12 +108,6 @@ func (v *VmScheduler) GetNewStructForDb(task *response.TaskInfo, resource string
vm.ParticipantId = participantId*/
}
/*
func (vm *VmScheduler) UnMarshalVmStruct(yamlString string, taskId int64, nsID string) models.vm {
var vm models.Vm
vm := kyaml.NewYAMLOrJSONDecoder(bytes.NewBufferString(yamlString), 4096)
}
*/
func (vm *VmScheduler) genTaskAndProviders() (*providerPricing.Task, []*providerPricing.Provider, error) {
proParams, err := vm.storage.GetProviderParams()
if err != nil {
@ -64,7 +125,38 @@ func (vm *VmScheduler) genTaskAndProviders() (*providerPricing.Task, []*provider
return nil, providerList, nil
}
func (v VmScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interface{}, error) {
func (as *VmScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interface{}, error) {
//TODO implement me
panic("implement me")
if clusters == nil {
return nil, errors.New("clusters is nil")
}
for i := len(clusters) - 1; i >= 0; i-- {
if clusters[i].Replicas == 0 {
clusters = append(clusters[:i], clusters[i+1:]...)
}
}
if len(clusters) == 0 {
return nil, errors.New("clusters is nil")
}
var results []*VmResult
for _, cluster := range clusters {
cName := ""
as.dbEngin.Table("t_cluster").Select("name").Where("id=?", cluster.ClusterId).Find(&cName)
cr := VmResult{
ClusterId: cluster.ClusterId,
ClusterName: cName,
Replica: cluster.Replicas,
}
cr.ClusterId = cluster.ClusterId
cr.Replica = cluster.Replicas
cr.ClusterName = cName
results = append(results, &cr)
}
return results, nil
}

View File

@ -7,6 +7,9 @@ type AiCollector interface {
GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error)
GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
GetTrainingTask(ctx context.Context, taskId string) (*Task, error)
DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error)
UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error
}
type ResourceStats struct {
@ -19,6 +22,7 @@ type ResourceStats struct {
DiskAvail float64
DiskTotal float64
GpuAvail int64
GpuTotal int64
CardsAvail []*Card
CpuCoreHours float64
Balance float64
@ -43,3 +47,10 @@ type Algorithm struct {
Platform string
TaskType string
}
type Task struct {
Id string
Start string
End string
Status string
}

View File

@ -162,10 +162,22 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
return nil, nil
}
func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
}
func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
return nil
}
func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
return "", nil
}
func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
return nil, nil
}
func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := m.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -19,12 +19,14 @@ import (
"errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gitlink.org.cn/JointCloud/pcm-octopus/octopus"
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
"math"
"strconv"
"strings"
"time"
)
type OctopusLink struct {
@ -337,6 +339,14 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
return algorithms, nil
}
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
}
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
return nil
}
func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
instance, err := strconv.ParseInt(instanceNum, 10, 32)
if err != nil {
@ -356,6 +366,35 @@ func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, ins
return resp.Content, nil
}
func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
resp, err := o.QueryTask(ctx, taskId)
if err != nil {
return nil, err
}
jobresp := (resp).(*octopus.GetTrainJobResp)
if !jobresp.Success {
return nil, errors.New(jobresp.Error.Message)
}
var task collector.Task
task.Id = jobresp.Payload.TrainJob.Id
task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
switch jobresp.Payload.TrainJob.Status {
case "succeeded":
task.Status = constants.Completed
case "failed":
task.Status = constants.Failed
case "running":
task.Status = constants.Running
case "stopped":
task.Status = constants.Stopped
default:
task.Status = "undefined"
}
return &task, nil
}
func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := o.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -447,6 +447,14 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
return algorithms, nil
}
func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil
}
func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error {
return nil
}
func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
req := &hpcAC.GetInstanceLogReq{
TaskId: taskId,
@ -465,6 +473,24 @@ func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, inst
return resp.Data.Content, nil
}
func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
resp, err := s.QueryTask(ctx, taskId)
if err != nil {
return nil, err
}
jobresp := (resp).(*hpcAC.GetPytorchTaskResp)
if jobresp.Code != "0" {
return nil, errors.New(jobresp.Msg)
}
var task collector.Task
task.Id = jobresp.Data.Id
task.Start = jobresp.Data.StartTime
task.End = jobresp.Data.EndTime
task.Status = jobresp.Data.Status
return &task, nil
}
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
err := s.GenerateSubmitParams(ctx, option)
if err != nil {

View File

@ -142,22 +142,12 @@ type Region struct {
type GeneralTaskReq struct {
Name string `json:"name"`
ComputeType string `json:"computeType"`
TemplateId string `json:"templateId"`
AdapterId string `json:"adapterId"`
AdapterIds []string `json:"adapterIds"`
ClusterIds []string `json:"clusterIds"`
Strategy Strategy `json:"strategy"`
Strategy string `json:"strategy"`
StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"`
ReqBody []string `json:"reqBody"`
}
type Strategy struct {
Name string `json:"name"`
StaticWeightList []StaticWeightList `json:"staticWeightList"`
}
type StaticWeightList struct {
ClusterName string `json:"clusterName"`
Weight int `json:"weight"`
Replicas int64 `json:"replicas,string"`
}
type DeleteTaskReq struct {
@ -192,12 +182,19 @@ type TaskYaml struct {
}
type CommitVmTaskReq struct {
Name string `json:"name"`
NsID string `json:"nsID"`
CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
VmOption *VmOption `json:"vmOption,optional"`
}
type VmOption struct {
AdapterId string `json:"adapterId"`
VmClusterIds []string `json:"vmClusterIds"`
Replicas int64 `json:"replicas,optional"`
Name string `json:"name"`
Strategy string `json:"strategy"`
ClusterToStaticWeight map[string]int32 `json:"clusterToStaticWeight"`
MatchLabels map[string]string `json:"matchLabels,optional"`
AdapterId string `json:"adapterId,optional"`
ClusterType string `json:"clusterType,optional"`
StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"`
CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
}
@ -208,14 +205,22 @@ type CreateMulDomainServer struct {
ImageRef string `json:"imageRef,optional"`
FlavorRef string `json:"flavorRef,optional"`
Uuid string `json:"uuid,optional"`
ClusterId string `json:"clusterId,optional"`
}
type CommitVmTaskResp struct {
TaskId int64 `json:"taskId"`
Code int32 `json:"code"`
Msg string `json:"msg"`
}
type ScheduleVmResult struct {
ClusterId string `json:"clusterId"`
TaskId string `json:"taskId"`
Strategy string `json:"strategy"`
Replica int32 `json:"replica"`
Msg string `json:"msg"`
}
type VmTask struct {
Id string `json:"id" copier:"Id"`
Links []VmLinks `json:"links" copier:"Links"`
@ -298,7 +303,7 @@ type PageTaskReq struct {
}
type TaskModel struct {
Id int64 `json:"id,omitempty" db:"id"` // id
Id int64 `json:"id,omitempty,string" db:"id"` // id
Name string `json:"name,omitempty" db:"name"` // 作业名称
Description string `json:"description,omitempty" db:"description"` // 作业描述
Status string `json:"status,omitempty" db:"status"` // 作业状态
@ -314,6 +319,7 @@ type TaskModel struct {
NsID string `json:"nsId,omitempty" db:"ns_id"`
TenantId string `json:"tenantId,omitempty" db:"tenant_id"`
CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"`
AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值
}
type TaskDetailReq struct {
@ -970,9 +976,9 @@ type HpcInfo struct {
Environment string `json:"environment"`
DeletedFlag int64 `json:"deleted_flag"` // 是否删除0-否1-是)
CreatedBy int64 `json:"created_by"` // 创建人
CreatedTime string `json:"created_time"` // 创建时间
CreateTime string `json:"created_time"` // 创建时间
UpdatedBy int64 `json:"updated_by"` // 更新人
UpdatedTime string `json:"updated_time"` // 更新时间
UpdateTime string `json:"updated_time"` // 更新时间
}
type CloudInfo struct {
@ -1114,7 +1120,17 @@ type TaskStatusResp struct {
Succeeded int `json:"Succeeded"`
Failed int `json:"Failed"`
Running int `json:"Running"`
Pause int `json:"Pause"`
Saved int `json:"Saved"`
}
type TaskDetailsResp struct {
Name string `json:"name"`
Description string `json:"description"`
StartTime string `json:"startTime"`
EndTime string `json:"endTime"`
Strategy int64 `json:"strategy"`
SynergyStatus int64 `json:"synergyStatus"`
ClusterInfos []*ClusterInfo `json:"clusterInfos"`
}
type CommitHpcTaskReq struct {
@ -2765,6 +2781,43 @@ type Nfs struct {
ReadOnly bool `json:"readOnly,optional"`
}
type CenterOverviewResp struct {
CenterNum int32 `json:"totalCenters,optional"`
TaskNum int32 `json:"totalTasks,optional"`
CardNum int32 `json:"totalCards,optional"`
PowerInTops float64 `json:"totalPower,optional"`
}
type CenterQueueingResp struct {
Current []*CenterQueue `json:"current,optional"`
History []*CenterQueue `json:"history,optional"`
}
type CenterQueue struct {
Name string `json:"name,optional"`
QueueingNum int32 `json:"num,optional"`
}
type CenterListResp struct {
List []*AiCenter `json:"centerList,optional"`
}
type AiCenter struct {
Name string `json:"name,optional"`
StackName string `json:"stack,optional"`
Version string `json:"version,optional"`
}
type CenterTaskListResp struct {
List []*AiTask `json:"taskList,optional"`
}
type AiTask struct {
Name string `json:"name,optional"`
Status string `json:"status,optional"`
TimeElapsed int32 `json:"elapsed,optional"`
}
type StorageScreenReq struct {
}
@ -5347,9 +5400,9 @@ type TenantInfo struct {
Type int64 `json:"type"` // 租户所属(0数算1超算2智算
DeletedFlag int64 `json:"deletedFlag"` // 是否删除
CreatedBy int64 `json:"createdBy"` // 创建人
CreatedTime string `json:"createdTime"` // 创建时间
CreateTime string `json:"createdTime"` // 创建时间
UpdatedBy int64 `json:"updatedBy"` // 更新人
UpdatedTime string `json:"updated_time"` // 更新时间
UpdateTime string `json:"updated_time"` // 更新时间
}
type UpdateTenantReq struct {
@ -5403,7 +5456,7 @@ type Cloud struct {
StartTime string `json:"startTime"` // 开始时间
RunningTime int64 `json:"runningTime"` // 运行时长
CreatedBy int64 `json:"createdBy"` // 创建人
CreatedTime string `json:"createdTime"` // 创建时间
CreateTime string `json:"createdTime"` // 创建时间
Result string `json:"result"`
}
@ -5546,6 +5599,9 @@ type ScheduleResult struct {
Msg string `json:"msg"`
}
type ScheduleOverviewResp struct {
}
type AiOption struct {
TaskName string `json:"taskName"`
AdapterId string `json:"adapterId"`
@ -5604,6 +5660,22 @@ type AiJobLogResp struct {
Log string `json:"log"`
}
type AiTaskDb struct {
Id string `json:"id,omitempty" db:"id"`
TaskId string `json:"taskId,omitempty" db:"task_id"`
AdapterId string `json:"adapterId,omitempty" db:"adapter_id"`
ClusterId string `json:"clusterId,omitempty" db:"cluster_id"`
Name string `json:"name,omitempty" db:"name"`
Replica string `json:"replica,omitempty" db:"replica"`
ClusterTaskId string `json:"clusterTaskId,omitempty" db:"c_task_id"`
Strategy string `json:"strategy,omitempty" db:"strategy"`
Status string `json:"status,omitempty" db:"status"`
Msg string `json:"msg,omitempty" db:"msg"`
CommitTime string `json:"commitTime,omitempty" db:"commit_time"`
StartTime string `json:"startTime,omitempty" db:"start_time"`
EndTime string `json:"endTime,omitempty" db:"end_time"`
}
type CreateAlertRuleReq struct {
CLusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`
@ -5680,3 +5752,25 @@ type AdapterInfoResp struct {
Name string `json:"name"`
Version string `json:"version"`
}
type ScheduleSituationResp struct {
Nodes []NodeRegion `json:"nodes"`
Links []Link `json:"links"`
Categories []Category `json:"categories"`
}
type NodeRegion struct {
Id string `json:"id"`
Name string `json:"name"`
Category int `json:"category"`
Value int `json:"value"`
}
type Link struct {
Source string `json:"source"`
Target string `json:"target"`
}
type Category struct {
Name string `json:"name"`
}

1159
deploy/pcm-auth.sql Normal file

File diff suppressed because one or more lines are too long

Binary file not shown.

2882
deploy/pcm.sql Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,41 +1,116 @@
## 1 安装部署kubekey
通过以下的命令,可以下载 KubeKey 的最新版本。您可以更改命令中的版本号来下载特定的版本。
```
```shell
export KKZONE=cn
curl -sfL https://get-kk.kubesphere.io | VERSION=v3.0.7 sh -
```
## 2 mysql部署及数据导入
#### 卸载已有的mariadb
`yum remove -y mariadb-server mariadb mariadb-libs`
#### 下载对应系统版本的mysql包
wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-8.0.36-1.el7.x86_64.rpm-bundle.tar
##### 解压
`tar -xvf mysql-8.0.36-1.el7.x86_64.rpm-bundle.tar`
##### 安装
```shell
rpm -ivh mysql-community-libs-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-libs-compat-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-client-plugins-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-client-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-common-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-debuginfo-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-devel-8.0.36-1.el7.x86_64.rpm
rpm -ivh mysql-community-server-8.0.36-1.el7.x86_64.rpm
```
##### 启动服务
`systemctl start mysqld`
##### 查看初始密码
`grep 'temporary password' /var/log/mysqld.log`
使用mysql -u root -p 登录
##### 修改密码
`ALTER USER 'root'@'localhost' IDENTIFIED BY 'Nudt!123';`
##### 配置外部访问
```sql
use mysql;
update user set host = '%' where user = 'root';
flush privileges;
```
##### 创建数据库
```sql
create database pcm;
create database pcm_auth;
```
##### 关闭防火墙
`systemctl stop firewalld`
##### 下载脚本
`wget -O pcm_auth.sql https://www.gitlink.org.cn/attachments/entries/get_file?download_url=https://www.gitlink.org.cn/api/JointCloud/pcm-coordinator/raw/deploy%2Fpcm-auth.sql?ref=master`
`wget -O pcm.sql https://www.gitlink.org.cn/attachments/entries/get_file?download_url=https://www.gitlink.org.cn/api/JointCloud/pcm-coordinator/raw/deploy%2Fpcm.sql?ref=master`
## 2 安装部署k8s集群
##### 执行sql脚本导入数据
`mysql -u root -p pcm < pcm.sql`
`mysql -u root -p pcm_auth < pcm_auth.sql`
## 3 安装部署k8s集群
```
./kk create cluster
export KKZONE=cn
sudo ./kk create cluster
```
执行可能会提示部分软件未安装直接yum安装即可
![输入图片说明](/imgs/2024-04-28/qF082JVaumRARK1J.png)
然后重新执行创建集群命令,执行成功后可以验证环境
![输入图片说明](/imgs/2024-04-28/FoVNPbwm1pnt839Z.png)
eg:
`sudo yum install -y conntrack`
`sudo yum install -y socat`
![](/api/attachments/3f8b9884-03b3-4e84-b408-d2ec451a533b)
然后重新执行创建集群命令执行成功后可以执行kubectl get pod 验证环境
![](/api/attachments/2e282429-d3ae-4019-8280-d6409da50b80)
## 3 部署鉴权、pcm-coordinator、前端服务
### 3.1 yaml文件下载
pcm所有服务的yaml文件包下载地址在[这里](https://www.gitlink.org.cn/attachments/entries/get_file?download_url=https://www.gitlink.org.cn/api/JointCloud/pcm-coordinator/raw/deploy%2Fpcm-yaml.zip?ref=master "这里")
或者在服务器上直接执行
```shell
wget -O yaml.zip https://www.gitlink.org.cn/attachments/entries/get_file?download_url=https://www.gitlink.org.cn/api/JointCloud/pcm-coordinator/raw/deploy%2Fpcm-yaml.zip?ref=master
```
下载完成解压
```shell
unzip yaml.zip
```
### 3.2 yaml执行完成服务、负载、配置文件的部署
#### 修改地址
需要修改配置文件中的数据库地址为mysql服务安装的地址
yaml文件下载链接https://pan.baidu.com/s/1VU1zE2xcFkrz9Hz2MkgDaQ
#### 一次性部署所有的文件
```shell
kubectl apply -f .
```
#### 或者单模块部署
##### 鉴权:
`kubectl apply -f pcm-auth.yaml`
##### C端
`kubectl apply -f pcm-core-api.yaml`
`kubectl apply -f pcm-core-rpc.yaml`
##### 前端:
`kubectl apply -f pcm-rip.yaml`
鉴权:
kubectl apply -f pcm-auth.yaml
C端
kubectl apply -f pcm-core-api.yaml
kubectl apply -f pcm-core-rpc.yaml
前端:
kubectl apply -f pcm-rip.yaml
部署情况可以通过以下命令查看
`kubectl get pod`
![](/api/attachments/644de412-1155-4e07-a90d-367f63260a81)
## 4 配置驱动器、集群信息
此时前端服务可以通过服务器ip的31149端口访问到
默认账号密码为admin/Nudt@123
新建一个适配器配置成功后可以获取到对应的adapterId
![输入图片说明](/imgs/2024-04-28/Dtu4KC835jSfcf5R.png)
![](/api/attachments/ad8e33d9-7155-4030-a813-227bb019c6e0)
将对应的id填写到对应的P端配置信息中(configmap 内容)
![输入图片说明](/imgs/2024-04-28/zuFWMVKAycNlPXOF.png)
![](/api/attachments/f0d8ee8d-f94f-40c7-8785-58ce09c89ba0)
## 5 部署P端服务
P端
### HPC服务端:
kubectl apply -f pcm-hpc.yaml
### kubernetes适配器:
kubectl apply -f pcm-kubernetes.yaml
## 7.系统使用

2
go.mod
View File

@ -24,7 +24,7 @@ require (
github.com/robfig/cron/v3 v3.0.1
github.com/rs/zerolog v1.28.0
github.com/zeromicro/go-zero v1.6.3
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203

4
go.sum
View File

@ -1078,8 +1078,8 @@ github.com/yuin/gopher-lua v1.1.0/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7
github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs=
github.com/zeromicro/go-zero v1.6.3 h1:OL0NnHD5LdRNDolfcK9vUkJt7K8TcBE3RkzfM8poOVw=
github.com/zeromicro/go-zero v1.6.3/go.mod h1:XZL435ZxVi9MSXXtw2MRQhHgx6OoX3++MRMOE9xU70c=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb h1:k6mNEWKp+haQUaK2dWs/rI9OKgzJHY1/9KNKuBDN0Vw=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240420083915-58d6e2958aeb/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece h1:W3yBnvAVV8dlRNQKYD6Mf8ySRrYsP0tPk7JjvqZzNHQ=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8=

View File

@ -26,4 +26,6 @@ const (
WaitRestart = "WaitRestart"
WaitPause = "WaitPause"
WaitStart = "WaitStart"
Pending = "Pending"
Stopped = "Stopped"
)

3
pkg/constants/time.go Normal file
View File

@ -0,0 +1,3 @@
package constants
const Layout = "2006-01-02 15:04:05"

View File

@ -8,7 +8,7 @@ import (
type BaseModel struct {
DeletedAt gorm.DeletedAt `gorm:"index;comment:删除时间" json:"-"` // 删除时间
CreatedBy uint `gorm:"created_by;comment:创建人" json:"createdBy"` //创建人
CreatedTime time.Time `gorm:"comment:创建时间" json:"-"` // 创建时间
CreateTime time.Time `gorm:"autoCreateTime:nano;comment:创建时间" json:"-"` // 创建时间
UpdatedBy uint `gorm:"updated_by;comment:更新人" json:"UpdatedBy"` //创建人
UpdatedTime time.Time `gorm:"comment:更新时间" json:"-"` // 更新时间
UpdateTime time.Time `gorm:"autoUpdateTime:nano;;comment:更新时间" json:"-"` // 更新时间
}

View File

@ -13,11 +13,10 @@ type TaskCloudModel struct {
ClusterName string `json:"clusterName" gorm:"not null;comment:集群名称"`
Kind string `json:"kind" gorm:"comment:种类"`
Status string `json:"status" gorm:"comment:状态"`
StartTime time.Time `json:"startTime" gorm:"comment:开始时间"`
StartTime *time.Time `json:"startTime,string" gorm:"comment:开始时间"`
YamlString string `json:"yamlString" gorm:"not null;comment:入参"`
Result string `json:"result" gorm:"comment:运行结果"`
Namespace string `json:"namespace" gorm:"comment:命名空间"`
Replica int `json:"replica" gorm:"not null;comment:副本数"`
base.BaseModel
}

View File

@ -37,9 +37,9 @@ type File struct {
Status string `gorm:"column:status" json:"Status"` //type:string comment:hash version:2023-05-06 09:58
DeletedFlag *int `gorm:"column:deleted_flag" json:"DeletedFlag"` //type:*int comment:是否删除 version:2023-05-06 09:58
CreatedBy *int `gorm:"column:created_by" json:"CreatedBy"` //type:*int comment:创建人 version:2023-05-06 09:58
CreatedTime *time.Time `gorm:"column:created_time" json:"CreatedTime"` //type:*time.Time comment:创建时间 version:2023-05-06 09:58
CreatedTime *time.Time `gorm:"column:created_time" json:"CreateTime"` //type:*time.Time comment:创建时间 version:2023-05-06 09:58
UpdatedBy *int `gorm:"column:updated_by" json:"UpdatedBy"` //type:*int comment:更新人 version:2023-05-06 09:58
UpdatedTime *time.Time `gorm:"column:updated_time" json:"UpdatedTime"` //type:*time.Time comment:更新时间 version:2023-05-06 09:58
UpdatedTime *time.Time `gorm:"column:updated_time" json:"UpdateTime"` //type:*time.Time comment:更新时间 version:2023-05-06 09:58
}
// TableName 表名:data_set

24
pkg/models/taskaimodel.go Normal file
View File

@ -0,0 +1,24 @@
package models
import "github.com/zeromicro/go-zero/core/stores/sqlx"
var _ TaskAiModel = (*customTaskAiModel)(nil)
type (
// TaskAiModel is an interface to be customized, add more methods here,
// and implement the added methods in customTaskAiModel.
TaskAiModel interface {
taskAiModel
}
customTaskAiModel struct {
*defaultTaskAiModel
}
)
// NewTaskAiModel returns a model for the database table.
func NewTaskAiModel(conn sqlx.SqlConn) TaskAiModel {
return &customTaskAiModel{
defaultTaskAiModel: newTaskAiModel(conn),
}
}

View File

@ -0,0 +1,104 @@
// Code generated by goctl. DO NOT EDIT.
package models
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"github.com/zeromicro/go-zero/core/stores/builder"
"github.com/zeromicro/go-zero/core/stores/sqlc"
"github.com/zeromicro/go-zero/core/stores/sqlx"
"github.com/zeromicro/go-zero/core/stringx"
)
var (
taskAiFieldNames = builder.RawFieldNames(&TaskAi{})
taskAiRows = strings.Join(taskAiFieldNames, ",")
taskAiRowsExpectAutoSet = strings.Join(stringx.Remove(taskAiFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
taskAiRowsWithPlaceHolder = strings.Join(stringx.Remove(taskAiFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
)
type (
taskAiModel interface {
Insert(ctx context.Context, data *TaskAi) (sql.Result, error)
FindOne(ctx context.Context, id int64) (*TaskAi, error)
Update(ctx context.Context, data *TaskAi) error
Delete(ctx context.Context, id int64) error
}
defaultTaskAiModel struct {
conn sqlx.SqlConn
table string
}
TaskAi struct {
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id
AdapterId int64 `db:"adapter_id"` // 设配器id
ClusterId int64 `db:"cluster_id"` // 集群id
Name string `db:"name"` // 任务名
Replica int64 `db:"replica"` // 执行数
JobId string `db:"job_id"` // 集群返回任务id
Strategy string `db:"strategy"` // 主任务使用策略
Status string `db:"status"` // 任务状态
Msg string `db:"msg"` // 集群返回任务信息
CommitTime time.Time `db:"commit_time"` // 提交时间
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束时间
TaskType string `db:"task_type"`
}
)
func newTaskAiModel(conn sqlx.SqlConn) *defaultTaskAiModel {
return &defaultTaskAiModel{
conn: conn,
table: "`task_ai`",
}
}
func (m *defaultTaskAiModel) withSession(session sqlx.Session) *defaultTaskAiModel {
return &defaultTaskAiModel{
conn: sqlx.NewSqlConnFromSession(session),
table: "`task_ai`",
}
}
func (m *defaultTaskAiModel) Delete(ctx context.Context, id int64) error {
query := fmt.Sprintf("delete from %s where `id` = ?", m.table)
_, err := m.conn.ExecCtx(ctx, query, id)
return err
}
func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, error) {
query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", taskAiRows, m.table)
var resp TaskAi
err := m.conn.QueryRowCtx(ctx, &resp, query, id)
switch err {
case nil:
return &resp, nil
case sqlc.ErrNotFound:
return nil, ErrNotFound
default:
return nil, err
}
}
func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType)
return ret, err
}
func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error {
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.ClusterId, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.Id)
return err
}
func (m *defaultTaskAiModel) tableName() string {
return m.table
}

View File

@ -42,13 +42,14 @@ type (
Strategy int64 `db:"strategy"` // 策略
SynergyStatus int64 `db:"synergy_status"` // 协同状态0-未协同、1-已协同)
CommitTime time.Time `db:"commit_time"` // 提交时间
StartTime string `db:"start_time"` // 开始时间
EndTime string `db:"end_time"` // 结束运行时间
StartTime *time.Time `db:"start_time"` // 开始时间
EndTime *time.Time `db:"end_time"` // 结束运行时间
RunningTime int64 `db:"running_time"` // 已运行时间(单位秒)
YamlString string `db:"yaml_string"`
Result string `db:"result"` // 作业结果
DeletedAt gorm.DeletedAt `gorm:"index"`
NsID string `db:"ns_id"`
AdapterTypeDict int `db:"adapter_type_dict"` //任务类型(对应字典表的值)
}
)

View File

@ -46,6 +46,8 @@ type (
DiskTotal float64 `db:"disk_total"`
GpuAvail float64 `db:"gpu_avail"`
GpuTotal float64 `db:"gpu_total"`
CardTotal int64 `db:"card_total"` // 算力卡数量
CardTopsTotal float64 `db:"card_tops_total"` // 算力总量tops
}
)
@ -56,6 +58,13 @@ func newTClusterResourceModel(conn sqlx.SqlConn) *defaultTClusterResourceModel {
}
}
func (m *defaultTClusterResourceModel) withSession(session sqlx.Session) *defaultTClusterResourceModel {
return &defaultTClusterResourceModel{
conn: sqlx.NewSqlConnFromSession(session),
table: "`t_cluster_resource`",
}
}
func (m *defaultTClusterResourceModel) Delete(ctx context.Context, clusterId int64) error {
query := fmt.Sprintf("delete from %s where `cluster_id` = ?", m.table)
_, err := m.conn.ExecCtx(ctx, query, clusterId)
@ -77,14 +86,14 @@ func (m *defaultTClusterResourceModel) FindOne(ctx context.Context, clusterId in
}
func (m *defaultTClusterResourceModel) Insert(ctx context.Context, data *TClusterResource) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, tClusterResourceRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.ClusterId, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal)
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, tClusterResourceRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.ClusterId, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal, data.CardTotal, data.CardTopsTotal)
return ret, err
}
func (m *defaultTClusterResourceModel) Update(ctx context.Context, data *TClusterResource) error {
query := fmt.Sprintf("update %s set %s where `cluster_id` = ?", m.table, tClusterResourceRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal, data.ClusterId)
_, err := m.conn.ExecCtx(ctx, query, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal, data.CardTotal, data.CardTopsTotal, data.ClusterId)
return err
}

View File

@ -0,0 +1,24 @@
package models
import "github.com/zeromicro/go-zero/core/stores/sqlx"
var _ TClusterTaskQueueModel = (*customTClusterTaskQueueModel)(nil)
type (
// TClusterTaskQueueModel is an interface to be customized, add more methods here,
// and implement the added methods in customTClusterTaskQueueModel.
TClusterTaskQueueModel interface {
tClusterTaskQueueModel
}
customTClusterTaskQueueModel struct {
*defaultTClusterTaskQueueModel
}
)
// NewTClusterTaskQueueModel returns a model for the database table.
func NewTClusterTaskQueueModel(conn sqlx.SqlConn) TClusterTaskQueueModel {
return &customTClusterTaskQueueModel{
defaultTClusterTaskQueueModel: newTClusterTaskQueueModel(conn),
}
}

View File

@ -0,0 +1,95 @@
// Code generated by goctl. DO NOT EDIT.
package models
import (
"context"
"database/sql"
"fmt"
"strings"
"time"
"github.com/zeromicro/go-zero/core/stores/builder"
"github.com/zeromicro/go-zero/core/stores/sqlc"
"github.com/zeromicro/go-zero/core/stores/sqlx"
"github.com/zeromicro/go-zero/core/stringx"
)
var (
tClusterTaskQueueFieldNames = builder.RawFieldNames(&TClusterTaskQueue{})
tClusterTaskQueueRows = strings.Join(tClusterTaskQueueFieldNames, ",")
tClusterTaskQueueRowsExpectAutoSet = strings.Join(stringx.Remove(tClusterTaskQueueFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
tClusterTaskQueueRowsWithPlaceHolder = strings.Join(stringx.Remove(tClusterTaskQueueFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
)
type (
tClusterTaskQueueModel interface {
Insert(ctx context.Context, data *TClusterTaskQueue) (sql.Result, error)
FindOne(ctx context.Context, id int64) (*TClusterTaskQueue, error)
Update(ctx context.Context, data *TClusterTaskQueue) error
Delete(ctx context.Context, id int64) error
}
defaultTClusterTaskQueueModel struct {
conn sqlx.SqlConn
table string
}
TClusterTaskQueue struct {
Id int64 `db:"id"` // id
AdapterId int64 `db:"adapter_id"` // 适配器id
ClusterId int64 `db:"cluster_id"` // 集群id
QueueNum int64 `db:"queue_num"` // 任务排队数量
Date time.Time `db:"date"`
}
)
func newTClusterTaskQueueModel(conn sqlx.SqlConn) *defaultTClusterTaskQueueModel {
return &defaultTClusterTaskQueueModel{
conn: conn,
table: "`t_cluster_task_queue`",
}
}
func (m *defaultTClusterTaskQueueModel) withSession(session sqlx.Session) *defaultTClusterTaskQueueModel {
return &defaultTClusterTaskQueueModel{
conn: sqlx.NewSqlConnFromSession(session),
table: "`t_cluster_task_queue`",
}
}
func (m *defaultTClusterTaskQueueModel) Delete(ctx context.Context, id int64) error {
query := fmt.Sprintf("delete from %s where `id` = ?", m.table)
_, err := m.conn.ExecCtx(ctx, query, id)
return err
}
func (m *defaultTClusterTaskQueueModel) FindOne(ctx context.Context, id int64) (*TClusterTaskQueue, error) {
query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", tClusterTaskQueueRows, m.table)
var resp TClusterTaskQueue
err := m.conn.QueryRowCtx(ctx, &resp, query, id)
switch err {
case nil:
return &resp, nil
case sqlc.ErrNotFound:
return nil, ErrNotFound
default:
return nil, err
}
}
func (m *defaultTClusterTaskQueueModel) Insert(ctx context.Context, data *TClusterTaskQueue) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?)", m.table, tClusterTaskQueueRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.AdapterId, data.ClusterId, data.QueueNum, data.Date)
return ret, err
}
func (m *defaultTClusterTaskQueueModel) Update(ctx context.Context, data *TClusterTaskQueue) error {
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, tClusterTaskQueueRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.AdapterId, data.ClusterId, data.QueueNum, data.Date, data.Id)
return err
}
func (m *defaultTClusterTaskQueueModel) tableName() string {
return m.table
}

View File

@ -41,3 +41,8 @@ func GenSnowflakeID() int64 {
func GenSnowflakeIDStr() string {
return node.Generate().String()
}
// GenSnowflakeIDStr 工作id
func GenSnowflakeIDUint() uint {
return uint(node.Generate().Int64())
}

View File

@ -19,7 +19,9 @@ import (
)
var timeTemplates = []string{
"2006-01-02T15:04:05Z07:00", //RFC3339
"2006-01-02 15:04:05", //常规类型
"2006/01/02T15:04:05Z07:00", //RFC3339
"2006/01/02 15:04:05",
"2006-01-02",
"2006/01/02",