Merge branch 'refs/heads/upstream'

Former-commit-id: aabbc1719d3a845983eefacd0614322f32bd2246
This commit is contained in:
jagger 2024-04-19 15:45:01 +08:00
commit e195bb4d89
50 changed files with 2020 additions and 1190 deletions

View File

@ -6,6 +6,7 @@ type Options struct {
} }
type Client interface { type Client interface {
Task(TaskOptions) (Task, error) Task(TaskOptions) (Task, error)
Notice(NoticeOptions) (Notice, error)
} }
func NewClient(options Options) (Client, error) { func NewClient(options Options) (Client, error) {

View File

@ -19,6 +19,11 @@ func (c *client) Task(options TaskOptions) (Task, error) {
return task, nil return task, nil
} }
func (c *client) Notice(options NoticeOptions) (Notice, error) {
notice, _ := newNotice(c, &options)
return notice, nil
}
func newClient(options Options) (Client, error) { func newClient(options Options) (Client, error) {
//init dbEngine //init dbEngine
dbEngin, _ := gorm.Open(mysql.Open(options.DataSource), &gorm.Config{ dbEngin, _ := gorm.Open(mysql.Open(options.DataSource), &gorm.Config{

9
api/client/notice.go Normal file
View File

@ -0,0 +1,9 @@
package client
type NoticeOptions struct {
pushNoticeReq PushNoticeReq
}
type Notice interface {
PushNotice(pushNoticeReq PushNoticeReq) (*PushNoticeResp, error)
}

46
api/client/notice_impl.go Normal file
View File

@ -0,0 +1,46 @@
package client
import (
"io/ioutil"
"k8s.io/apimachinery/pkg/util/json"
"log"
"net/http"
"strings"
"sync"
)
type notice struct {
sync.RWMutex
client *client
options *NoticeOptions
log log.Logger
}
func newNotice(client *client, options *NoticeOptions) (*notice, error) {
notice := &notice{
RWMutex: sync.RWMutex{},
client: client,
options: options,
log: log.Logger{},
}
return notice, nil
}
func (n *notice) PushNotice(pushNoticeReq PushNoticeReq) (*PushNoticeResp, error) {
url := n.client.url + "/pcm/v1/core/pushNotice"
method := "GET"
jsonStr, _ := json.Marshal(pushNoticeReq)
payload := strings.NewReader(string(jsonStr))
client := &http.Client{}
req, _ := http.NewRequest(method, url, payload)
req.Header.Add("Content-Type", "application/json")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
var resp PushNoticeResp
json.Unmarshal(body, &resp)
return &resp, nil
}

View File

@ -9,5 +9,5 @@ type TaskOptions struct {
type Task interface { type Task interface {
PullTaskInfo(pullTaskInfoReq PullTaskInfoReq) (*PullTaskInfoResp, error) PullTaskInfo(pullTaskInfoReq PullTaskInfoReq) (*PullTaskInfoResp, error)
PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, error) PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp, error)
PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) error PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) (*PushResourceInfoResp, error)
} }

View File

@ -50,8 +50,8 @@ func (t *task) PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp,
url := t.client.url + "/pcm/v1/core/pushTaskInfo" url := t.client.url + "/pcm/v1/core/pushTaskInfo"
method := "POST" method := "POST"
infoReq := PullTaskInfoReq{AdapterId: pushTaskInfoReq.AdapterId} //infoReq := PullTaskInfoReq{AdapterId: pushTaskInfoReq.AdapterId}
jsonStr, _ := json.Marshal(infoReq) jsonStr, _ := json.Marshal(pushTaskInfoReq)
payload := strings.NewReader(string(jsonStr)) payload := strings.NewReader(string(jsonStr))
client := &http.Client{} client := &http.Client{}
@ -66,7 +66,22 @@ func (t *task) PushTaskInfo(pushTaskInfoReq PushTaskInfoReq) (*PushTaskInfoResp,
return &resp, nil return &resp, nil
} }
func (t *task) PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) error { func (t *task) PushResourceInfo(pushResourceInfoReq PushResourceInfoReq) (*PushResourceInfoResp, error) {
//TODO implement me
panic("implement me") url := t.client.url + "/pcm/v1/core/pushResourceInfo"
method := "POST"
//infoReq := PushResourceInfoReq{AdapterId: pushResourceInfoReq.AdapterId}
jsonStr, _ := json.Marshal(pushResourceInfoReq)
payload := strings.NewReader(string(jsonStr))
client := &http.Client{}
req, _ := http.NewRequest(method, url, payload)
req.Header.Add("Content-Type", "application/json")
res, _ := client.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
var resp PushResourceInfoResp
json.Unmarshal(body, &resp)
return &resp, nil
} }

View File

@ -25,12 +25,46 @@ type PushTaskInfoReq struct {
} }
type PushTaskInfoResp struct { type PushTaskInfoResp struct {
Code int64 Code int64 `json:"code"`
Msg string Msg string `json:"msg"`
} }
type PushResourceInfoReq struct { type PushResourceInfoReq struct {
AdapterId int64 `json:"adapterId"` AdapterId int64 `json:"adapterId"`
ResourceStats []ResourceStats `json:"resourceStats"`
}
type PushResourceInfoResp struct {
Code int64 `json:"code"`
Msg string `json:"msg"`
}
type NoticeInfo struct {
AdapterId int64 `json:"adapterId"`
AdapterName string `json:"adapterName"`
ClusterId int64 `json:"clusterId"`
ClusterName string `json:"clusterName"`
NoticeType string `json:"noticeType"`
TaskName string `json:"taskName"`
Incident string `json:"incident"`
CreatedTime time.Time `json:"createdTime"`
}
type ListNoticeReq struct {
}
type ListNoticeResp struct {
Code int64 `json:"code"`
Msg string `json:"msg"`
Data []NoticeInfo `json:"data"`
}
type PushNoticeReq struct {
NoticeInfo NoticeInfo `json:"noticeInfo"`
}
type PushNoticeResp struct {
Code int64 `json:"code"`
Msg string `json:"msg"`
} }
type HpcInfo struct { type HpcInfo struct {
@ -119,5 +153,30 @@ type VmInfo struct {
BlockUuid string `json:"block_uuid,omitempty"` BlockUuid string `json:"block_uuid,omitempty"`
SourceType string `json:"source_type,omitempty"` SourceType string `json:"source_type,omitempty"`
DeleteOnTermination bool `json:"delete_on_termination,omitempty"` DeleteOnTermination bool `json:"delete_on_termination,omitempty"`
State string `json:"state,omitempty"` Status string `json:"Status,omitempty"`
StartTime string `json:"startTime,omitempty"`
}
type ResourceStats struct {
ClusterId int64 `json:"clusterId"`
Name string `json:"name"`
CpuCoreAvail int64 `json:"cpuCoreAvail"`
CpuCoreTotal int64 `json:"cpuCoreTotal"`
MemAvail float64 `json:"memAvail"`
MemTotal float64 `json:"memTotal"`
DiskAvail float64 `json:"diskAvail"`
DiskTotal float64 `json:"diskTotal"`
GpuAvail int64 `json:"gpuAvail"`
CardsAvail []*Card `json:"cardsAvail"`
CpuCoreHours float64 `json:"cpuCoreHours"`
Balance float64 `json:"balance"`
}
type Card struct {
Platform string `json:"platform"`
Type string `json:"type"`
Name string `json:"name"`
TOpsAtFp16 float64 `json:"TOpsAtFp16"`
CardHours float64 `json:"cardHours"`
CardNum int32 `json:"cardNum"`
} }

File diff suppressed because it is too large Load Diff

View File

@ -1,126 +0,0 @@
syntax = "v1"
info(
title: "type title here"
desc: "type desc here"
author: "type author here"
email: "type email here"
version: "type version here"
)
type PullTaskInfoReq {
AdapterId int64 `form:"adapterId"`
}
type PullTaskInfoResp struct {
HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"`
CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"`
AiInfoList []*AiInfo `json:"AiInfoList,omitempty"`
VmInfoList []*VmInfo `json:"VmInfoList,omitempty"`
}
type HpcInfo struct {
Id int64 `json:"id"` // id
TaskId int64 `json:"task_id"` // 任务id
JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id)
AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id
ClusterId int64 `json:"cluster_id"` // 执行任务的集群id
ClusterType string `json:"cluster_type"` // 执行任务的集群类型
Name string `json:"name"` // 名称
Status string `json:"status"` // 状态
CmdScript string `json:"cmd_script"`
StartTime string `json:"start_time"` // 开始时间
RunningTime int64 `json:"running_time"` // 运行时间
DerivedEs string `json:"derived_es"`
Cluster string `json:"cluster"`
BlockId int64 `json:"block_id"`
AllocNodes int64 `json:"alloc_nodes"`
AllocCpu int64 `json:"alloc_cpu"`
CardCount int64 `json:"card_count"` // 卡数
Version string `json:"version"`
Account string `json:"account"`
WorkDir string `json:"work_dir"` // 工作路径
AssocId int64 `json:"assoc_id"`
ExitCode int64 `json:"exit_code"`
WallTime string `json:"wall_time"` // 最大运行时间
Result string `json:"result"` // 运行结果
DeletedAt string `json:"deleted_at"` // 删除时间
YamlString string `json:"yaml_string"`
AppType string `json:"app_type"` // 应用类型
AppName string `json:"app_name"` // 应用名称
Queue string `json:"queue"` // 队列名称
SubmitType string `json:"submit_type"` // cmd命令行模式
NNode string `json:"n_node"` // 节点个数当指定该参数时GAP_NODE_STRING必须为""
StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j
StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j
StdInput string `json:"std_input"`
Environment string `json:"environment"`
DeletedFlag int64 `json:"deleted_flag"` // 是否删除0-否1-是)
CreatedBy int64 `json:"created_by"` // 创建人
CreatedTime string `json:"created_time"` // 创建时间
UpdatedBy int64 `json:"updated_by"` // 更新人
UpdatedTime string `json:"updated_time"` // 更新时间
}
type CloudInfo struct {
Participant int64 `json:"participant,omitempty"`
Id int64 `json:"id,omitempty"`
TaskId int64 `json:"taskId,omitempty"`
ApiVersion string `json:"apiVersion,omitempty"`
Kind string `json:"kind,omitempty"`
Namespace string `json:"namespace,omitempty"`
Name string `json:"name,omitempty"`
Status string `json:"status,omitempty"`
StartTime string `json:"startTime,omitempty"`
RunningTime int64 `json:"runningTime,omitempty"`
Result string `json:"result,omitempty"`
YamlString string `json:"yamlString,omitempty"`
}
type AiInfo struct {
ParticipantId int64 `json:"participantId,omitempty"`
TaskId int64 `json:"taskId,omitempty"`
ProjectId string `json:"project_id,omitempty"`
Name string `json:"name,omitempty"`
Status string `json:"status,omitempty"`
StartTime string `json:"startTime,omitempty"`
RunningTime int64 `json:"runningTime,omitempty"`
Result string `json:"result,omitempty"`
JobId string `json:"jobId,omitempty"`
CreateTime string `json:"createTime,omitempty"`
ImageUrl string `json:"imageUrl,omitempty"`
Command string `json:"command,omitempty"`
FlavorId string `json:"flavorId,omitempty"`
SubscriptionId string `json:"subscriptionId,omitempty"`
ItemVersionId string `json:"itemVersionId,omitempty"`
}
type VmInfo struct {
ParticipantId int64 `json:"participantId,omitempty"`
TaskId int64 `json:"taskId,omitempty"`
Name string `json:"name,omitempty"`
FlavorRef string `json:"flavor_ref,omitempty"`
ImageRef string `json:"image_ref,omitempty"`
NetworkUuid string `json:"network_uuid,omitempty"`
BlockUuid string `json:"block_uuid,omitempty"`
SourceType string `json:"source_type,omitempty"`
DeleteOnTermination bool `json:"delete_on_termination,omitempty"`
State string `json:"state,omitempty"`
}
type PushTaskInfoReq struct {
AdapterId int64 `json:"adapterId"`
HpcInfoList []*HpcInfo `json:"hpcInfoList"`
CloudInfoList []*CloudInfo `json:"cloudInfoList"`
AiInfoList []*AiInfo `json:"aiInfoList"`
VmInfoList []*VmInfo `json:"vmInfoList"`
}
type PushTaskInfoResp struct {
Code int64 `json:"code"`
Msg string `json:"msg"`
}
type PushResourceInfoReq struct {
AdapterId int64 `json:"adapterId"`
}

View File

@ -9,7 +9,6 @@ import (
"cloud/pcm-cloud.api" "cloud/pcm-cloud.api"
"storelink/pcm-storelink.api" "storelink/pcm-storelink.api"
"schedule/pcm-schedule.api" "schedule/pcm-schedule.api"
"participant/pcm-participant.api"
"monitoring/pcm-monitoring.api" "monitoring/pcm-monitoring.api"
) )
@ -111,14 +110,26 @@ service pcm {
@handler metricsHandler @handler metricsHandler
get /core/metrics get /core/metrics
@doc "provided to participant to pull task info from core" @doc "provide for adapter to pull task info from core"
@handler pullTaskInfoHandler @handler pullTaskInfoHandler
get /core/pullTaskInfo (PullTaskInfoReq) returns (PullTaskInfoResp) get /core/pullTaskInfo (PullTaskInfoReq) returns (PullTaskInfoResp)
@doc "provided to participant to push task info to core" @doc "provide for adapter to push task info to core"
@handler pushTaskInfoHandler @handler pushTaskInfoHandler
post /core/pushTaskInfo (PushTaskInfoReq) returns (PushTaskInfoResp) post /core/pushTaskInfo (PushTaskInfoReq) returns (PushTaskInfoResp)
@doc "provide for adapter to push resource info to core"
@handler pushResourceInfoHandler
post /core/pushResourceInfo (PushResourceInfoReq) returns (PushResourceInfoResp)
@doc "provide for adapter to push notice info to core"
@handler pushNoticeHandler
post /core/pushNotice (PushNoticeReq) returns (PushNoticeResp)
@doc "list notice"
@handler listNoticeHandler
get /core/listNotice (ListNoticeReq) returns (ListNoticeResp)
@doc "paging queries the task list" @doc "paging queries the task list"
@handler pageListTaskHandler @handler pageListTaskHandler
get /core/task/list (pageTaskReq) returns(PageResult) get /core/task/list (pageTaskReq) returns(PageResult)
@ -146,6 +157,10 @@ service pcm {
@handler jobHandler @handler jobHandler
get /hpc/job (hpcJobReq) returns (hpcJobResp) get /hpc/job (hpcJobReq) returns (hpcJobResp)
@doc "超算资源总览"
@handler resourceHandler
get /hpc/resource (hpcResourceReq) returns (hpcResourceResp)
@doc "超算查询资产列表" @doc "超算查询资产列表"
@handler queueAssetsHandler @handler queueAssetsHandler
get /hpc/queueAssets returns (QueueAssetsResp) get /hpc/queueAssets returns (QueueAssetsResp)
@ -895,13 +910,13 @@ service pcm {
get /schedule/ai/getTaskTypes returns (AiTaskTypesResp) get /schedule/ai/getTaskTypes returns (AiTaskTypesResp)
@handler ScheduleGetDatasetsHandler @handler ScheduleGetDatasetsHandler
get /schedule/ai/getDatasets returns (AiDatasetsResp) get /schedule/ai/getDatasets/:adapterId (AiDatasetsReq) returns (AiDatasetsResp)
@handler ScheduleGetStrategyHandler @handler ScheduleGetStrategyHandler
get /schedule/ai/getStrategies returns (AiStrategyResp) get /schedule/ai/getStrategies returns (AiStrategyResp)
@handler ScheduleGetAlgorithmsHandler @handler ScheduleGetAlgorithmsHandler
get /schedule/ai/getAlgorithms/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp) get /schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset (AiAlgorithmsReq) returns (AiAlgorithmsResp)
@handler ScheduleSubmitHandler @handler ScheduleSubmitHandler
post /schedule/submit (ScheduleReq) returns (ScheduleResp) post /schedule/submit (ScheduleReq) returns (ScheduleResp)

View File

@ -19,13 +19,15 @@ type (
ScheduleResult { ScheduleResult {
ClusterId string `json:"clusterId"` ClusterId string `json:"clusterId"`
TaskId string `json:"taskId"` TaskId string `json:"taskId"`
Strategy string `json:"strategy"`
Replica int32 `json:"replica"` Replica int32 `json:"replica"`
Msg string `json:"msg"` Msg string `json:"msg"`
} }
AiOption { AiOption {
TaskName string `json:"taskName"` TaskName string `json:"taskName"`
AiClusterId string `json:"aiClusterId,optional"` AdapterId string `json:"adapterId"`
AiClusterIds []string `json:"aiClusterIds"`
ResourceType string `json:"resourceType"` ResourceType string `json:"resourceType"`
Tops float64 `json:"Tops,optional"` Tops float64 `json:"Tops,optional"`
TaskType string `json:"taskType"` TaskType string `json:"taskType"`
@ -46,6 +48,10 @@ type (
TaskTypes []string `json:"taskTypes"` TaskTypes []string `json:"taskTypes"`
} }
AiDatasetsReq {
AdapterId string `path:"adapterId"`
}
AiDatasetsResp { AiDatasetsResp {
Datasets []string `json:"datasets"` Datasets []string `json:"datasets"`
} }
@ -55,6 +61,7 @@ type (
} }
AiAlgorithmsReq { AiAlgorithmsReq {
AdapterId string `path:"adapterId"`
ResourceType string `path:"resourceType"` ResourceType string `path:"resourceType"`
TaskType string `path:"taskType"` TaskType string `path:"taskType"`
Dataset string `path:"dataset"` Dataset string `path:"dataset"`

View File

@ -0,0 +1,28 @@
package core
import (
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func ListNoticeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req clientCore.ListNoticeReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := core.NewListNoticeLogic(r.Context(), svcCtx)
resp, err := l.ListNotice(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,28 @@
package core
import (
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func PushNoticeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req clientCore.PushNoticeReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := core.NewPushNoticeLogic(r.Context(), svcCtx)
resp, err := l.PushNotice(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,28 @@
package core
import (
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
func PushResourceInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req clientCore.PushResourceInfoReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := core.NewPushResourceInfoLogic(r.Context(), svcCtx)
resp, err := l.PushResourceInfo(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,28 @@
package hpc
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/hpc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func ResourceHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.HpcResourceReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := hpc.NewResourceLogic(r.Context(), svcCtx)
resp, err := l.Resource(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -140,6 +140,21 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/core/pushTaskInfo", Path: "/core/pushTaskInfo",
Handler: core.PushTaskInfoHandler(serverCtx), Handler: core.PushTaskInfoHandler(serverCtx),
}, },
{
Method: http.MethodPost,
Path: "/core/pushResourceInfo",
Handler: core.PushResourceInfoHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/core/pushNotice",
Handler: core.PushNoticeHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/core/listNotice",
Handler: core.ListNoticeHandler(serverCtx),
},
{ {
Method: http.MethodGet, Method: http.MethodGet,
Path: "/core/task/list", Path: "/core/task/list",
@ -171,6 +186,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/hpc/job", Path: "/hpc/job",
Handler: hpc.JobHandler(serverCtx), Handler: hpc.JobHandler(serverCtx),
}, },
{
Method: http.MethodGet,
Path: "/hpc/resource",
Handler: hpc.ResourceHandler(serverCtx),
},
{ {
Method: http.MethodGet, Method: http.MethodGet,
Path: "/hpc/queueAssets", Path: "/hpc/queueAssets",
@ -1107,7 +1127,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
}, },
{ {
Method: http.MethodGet, Method: http.MethodGet,
Path: "/schedule/ai/getDatasets", Path: "/schedule/ai/getDatasets/:adapterId",
Handler: schedule.ScheduleGetDatasetsHandler(serverCtx), Handler: schedule.ScheduleGetDatasetsHandler(serverCtx),
}, },
{ {
@ -1117,7 +1137,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
}, },
{ {
Method: http.MethodGet, Method: http.MethodGet,
Path: "/schedule/ai/getAlgorithms/:resourceType/:taskType/:dataset", Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset",
Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx),
}, },
{ {

View File

@ -1,16 +1,24 @@
package schedule package schedule
import ( import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http" "net/http"
) )
func ScheduleGetDatasetsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { func ScheduleGetDatasetsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) {
var req types.AiDatasetsReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := schedule.NewScheduleGetDatasetsLogic(r.Context(), svcCtx) l := schedule.NewScheduleGetDatasetsLogic(r.Context(), svcCtx)
resp, err := l.ScheduleGetDatasets() resp, err := l.ScheduleGetDatasets(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)
} }
} }

View File

@ -2,13 +2,12 @@ package core
import ( import (
"context" "context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/mqs" "fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "math/rand"
"time" "time"
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
@ -35,7 +34,6 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
Status: constants.Saved, Status: constants.Saved,
Name: req.Name, Name: req.Name,
CommitTime: time.Now(), CommitTime: time.Now(),
NsID: req.NsID,
} }
// Save task data to database // Save task data to database
tx := l.svcCtx.DbEngin.Create(&taskModel) tx := l.svcCtx.DbEngin.Create(&taskModel)
@ -43,28 +41,38 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
return nil, tx.Error return nil, tx.Error
} }
var clusterIds []int64 for _, CreateMulServer := range req.CreateMulServer {
l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) fmt.Println("", req.CreateMulServer)
var clusterIds []int64
l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds)
if len(clusterIds) == 0 || clusterIds == nil { if len(clusterIds) == 0 || clusterIds == nil {
return nil, nil return nil, nil
}
vmInfo := models.TaskVm{
TaskId: taskModel.Id,
ClusterId: clusterIds[rand.Intn(len(clusterIds))],
Name: taskModel.Name,
Status: "Saved",
StartTime: time.Now().String(),
MinCount: CreateMulServer.Min_count,
ImageRef: CreateMulServer.ImageRef,
FlavorRef: CreateMulServer.FlavorRef,
Uuid: CreateMulServer.Uuid,
Platform: CreateMulServer.Platform,
}
tx = l.svcCtx.DbEngin.Create(&vmInfo)
if tx.Error != nil {
return nil, tx.Error
}
resp = &types.CommitVmTaskResp{
Code: 200,
Msg: "success",
TaskId: taskModel.Id,
}
} }
vm := models.Vm{}
tool.Convert(req, &vm)
mqInfo := response.TaskInfo{
TaskId: taskModel.Id,
TaskType: "vm",
MatchLabels: req.MatchLabels,
NsID: req.NsID,
}
//req.TaskId = taskModel.Id
mqs.InsQueue.Beta.Add(&mqInfo)
tx = l.svcCtx.DbEngin.Create(&mqInfo)
resp = &types.CommitVmTaskResp{
Code: 200,
Msg: "success",
TaskId: taskModel.Id,
}
return resp, nil return resp, nil
} }

View File

@ -0,0 +1,36 @@
package core
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
type ListNoticeLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewListNoticeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ListNoticeLogic {
return &ListNoticeLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ListNoticeLogic) ListNotice(req *clientCore.ListNoticeReq) (*clientCore.ListNoticeResp, error) {
var notices []clientCore.NoticeInfo
var resp clientCore.ListNoticeResp
l.svcCtx.DbEngin.Raw("select * from t_notice order by created_time desc").Scan(&notices)
for _, notice := range notices {
resp.Data = append(resp.Data, notice)
}
resp.Code = 200
resp.Msg = "success"
return &resp, nil
}

View File

@ -67,6 +67,13 @@ func (l *PullTaskInfoLogic) PullTaskInfo(req *clientCore.PullTaskInfoReq) (*clie
return nil, err return nil, err
} }
utils.Convert(aiModelList, &resp.AiInfoList) utils.Convert(aiModelList, &resp.AiInfoList)
case 3:
var vmModelList []models.TaskVm
err := findModelList(req.AdapterId, l.svcCtx.DbEngin, &vmModelList)
if err != nil {
return nil, err
}
utils.Convert(vmModelList, &resp.VmInfoList)
} }
return &resp, nil return &resp, nil
} }

View File

@ -0,0 +1,31 @@
package core
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
type PushNoticeLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewPushNoticeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *PushNoticeLogic {
return &PushNoticeLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *PushNoticeLogic) PushNotice(req *clientCore.PushNoticeReq) (resp *clientCore.PushNoticeResp, err error) {
result := l.svcCtx.DbEngin.Table("t_notice").Create(&req.NoticeInfo)
if result.Error != nil {
return nil, result.Error
}
return
}

View File

@ -0,0 +1,28 @@
package core
import (
"context"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
)
type PushResourceInfoLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewPushResourceInfoLogic(ctx context.Context, svcCtx *svc.ServiceContext) *PushResourceInfoLogic {
return &PushResourceInfoLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *PushResourceInfoLogic) PushResourceInfo(req *clientCore.PushResourceInfoReq) (resp *clientCore.PushResourceInfoResp, err error) {
return
}

View File

@ -49,6 +49,12 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie
aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name)
syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) syncTask(l.svcCtx.DbEngin, aiInfo.TaskId)
} }
case 3:
for _, vmInfo := range req.VmInfoList {
l.svcCtx.DbEngin.Exec("update task_vm set status = ?,start_time = ? where participant_id = ? and task_id = ? and name = ?",
vmInfo.Status, vmInfo.StartTime, req.AdapterId, vmInfo.TaskId, vmInfo.Name)
syncTask(l.svcCtx.DbEngin, vmInfo.TaskId)
}
} }
return &resp, nil return &resp, nil

View File

@ -0,0 +1,48 @@
package hpc
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type ResourceLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewResourceLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ResourceLogic {
return &ResourceLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *ResourceLogic) Resource(req *types.HpcResourceReq) (resp *types.HpcResourceResp, err error) {
l.svcCtx.DbEngin.Raw("SELECT th.NAME as job_name,t.description as job_desc,t.commit_time as submit_time,th.STATUS as job_status,ta.name as adapter_name,tc.name as cluster_name,tc.label as cluster_type FROM task_hpc th LEFT JOIN task t ON t.id = th.task_id JOIN t_cluster tc on th.cluster_id = tc.id JOIN t_adapter ta on tc.adapter_id = ta.id")
hpcResource := types.HPCResource{
GPUCardsTotal: 0,
CPUCoresTotal: 0,
RAMTotal: 0,
GPUCardsUsed: 0,
CPUCoresUsed: 0,
RAMUsed: 0,
GPURate: 0,
CPURate: 0,
RAMRate: 0,
}
resp = &types.HpcResourceResp{
Code: 200,
Msg: "success",
HPCResource: hpcResource,
}
return resp, nil
}

View File

@ -26,7 +26,7 @@ func NewScheduleGetAlgorithmsLogic(ctx context.Context, svcCtx *svc.ServiceConte
func (l *ScheduleGetAlgorithmsLogic) ScheduleGetAlgorithms(req *types.AiAlgorithmsReq) (resp *types.AiAlgorithmsResp, err error) { func (l *ScheduleGetAlgorithmsLogic) ScheduleGetAlgorithms(req *types.AiAlgorithmsReq) (resp *types.AiAlgorithmsResp, err error) {
resp = &types.AiAlgorithmsResp{} resp = &types.AiAlgorithmsResp{}
algorithms, err := storeLink.GetAlgorithms(l.ctx, l.svcCtx.Scheduler.ResourceCollector, req.ResourceType, req.TaskType, req.Dataset) algorithms, err := storeLink.GetAlgorithms(l.ctx, l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId], req.ResourceType, req.TaskType, req.Dataset)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -3,6 +3,7 @@ package schedule
import ( import (
"context" "context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@ -23,9 +24,9 @@ func NewScheduleGetDatasetsLogic(ctx context.Context, svcCtx *svc.ServiceContext
} }
} }
func (l *ScheduleGetDatasetsLogic) ScheduleGetDatasets() (resp *types.AiDatasetsResp, err error) { func (l *ScheduleGetDatasetsLogic) ScheduleGetDatasets(req *types.AiDatasetsReq) (resp *types.AiDatasetsResp, err error) {
resp = &types.AiDatasetsResp{} resp = &types.AiDatasetsResp{}
names, err := storeLink.GetDatasetsNames(l.ctx, l.svcCtx.Scheduler.ResourceCollector) names, err := storeLink.GetDatasetsNames(l.ctx, l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId])
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -27,6 +27,7 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc
func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) { func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) {
resp = &types.ScheduleResp{} resp = &types.ScheduleResp{}
opt := &option.AiOption{ opt := &option.AiOption{
AdapterId: req.AiOption.AdapterId,
ResourceType: req.AiOption.ResourceType, ResourceType: req.AiOption.ResourceType,
Tops: req.AiOption.Tops, Tops: req.AiOption.Tops,
TaskType: req.AiOption.TaskType, TaskType: req.AiOption.TaskType,
@ -55,6 +56,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
scheResult := &types.ScheduleResult{} scheResult := &types.ScheduleResult{}
scheResult.ClusterId = r.ClusterId scheResult.ClusterId = r.ClusterId
scheResult.TaskId = r.TaskId scheResult.TaskId = r.TaskId
scheResult.Strategy = r.Strategy
scheResult.Replica = r.Replica scheResult.Replica = r.Replica
scheResult.Msg = r.Msg scheResult.Msg = r.Msg
resp.Results = append(resp.Results, scheResult) resp.Results = append(resp.Results, scheResult)

View File

@ -5,9 +5,8 @@ import (
) )
type Weight struct { type Weight struct {
Id int64 Id string
Weight int32 Weight int32
Name string
Replica int32 Replica int32
} }

View File

@ -33,6 +33,21 @@ func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, e
return &resp, nil return &resp, nil
} }
func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) {
var list []types.AdapterInfo
var ids []string
db := s.DbEngin.Model(&types.AdapterInfo{}).Table("t_adapter")
db = db.Where("type = ?", adapterType)
err := db.Order("create_time desc").Find(&list).Error
if err != nil {
return nil, err
}
for _, info := range list {
ids = append(ids, info.Id)
}
return ids, nil
}
func (s *AiStorage) SaveTask(name string) error { func (s *AiStorage) SaveTask(name string) error {
// 构建主任务结构体 // 构建主任务结构体
taskModel := models.Task{ taskModel := models.Task{

View File

@ -20,8 +20,7 @@ import (
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/rpc/client/participantservice" "gitlink.org.cn/JointCloud/pcm-coordinator/rpc/client/participantservice"
@ -32,16 +31,15 @@ import (
) )
type Scheduler struct { type Scheduler struct {
task *response.TaskInfo task *response.TaskInfo
participantIds []int64 participantIds []int64
subSchedule SubSchedule subSchedule SubSchedule
dbEngin *gorm.DB dbEngin *gorm.DB
result []string //pID:子任务yamlstring 键值对 result []string //pID:子任务yamlstring 键值对
participantRpc participantservice.ParticipantService participantRpc participantservice.ParticipantService
ResourceCollector *map[string]collector.AiCollector AiStorages *database.AiStorage
AiStorages *database.AiStorage AiService *service.AiService
AiExecutor *map[string]executor.AiExecutor mu sync.RWMutex
mu sync.RWMutex
} }
type SubSchedule interface { type SubSchedule interface {
@ -59,8 +57,8 @@ func NewScheduler(subSchedule SubSchedule, val string, dbEngin *gorm.DB, partici
return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil return &Scheduler{task: task, subSchedule: subSchedule, dbEngin: dbEngin, participantRpc: participantRpc}, nil
} }
func NewSchdlr(resourceCollector *map[string]collector.AiCollector, storages *database.AiStorage, aiExecutor *map[string]executor.AiExecutor) *Scheduler { func NewSchdlr(aiService *service.AiService, storages *database.AiStorage) *Scheduler {
return &Scheduler{ResourceCollector: resourceCollector, AiStorages: storages, AiExecutor: aiExecutor} return &Scheduler{AiService: aiService, AiStorages: storages}
} }
func (s *Scheduler) SpecifyClusters() { func (s *Scheduler) SpecifyClusters() {

View File

@ -18,6 +18,7 @@ import (
"context" "context"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
@ -28,7 +29,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"gitlink.org.cn/JointCloud/pcm-octopus/octopus" "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
"strconv"
"sync" "sync"
) )
@ -43,6 +43,7 @@ type AiScheduler struct {
type AiResult struct { type AiResult struct {
TaskId string TaskId string
ClusterId string ClusterId string
Strategy string
Replica int32 Replica int32
Msg string Msg string
} }
@ -63,9 +64,8 @@ func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource strin
} }
func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) { func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
if as.option.AiClusterId != "" { if len(as.option.ClusterIds) == 1 {
// TODO database operation Find return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil
return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ParticipantId: 0, Name: "", Replicas: 1}}, nil
} }
resources, err := as.findClustersWithResources() resources, err := as.findClustersWithResources()
@ -79,8 +79,7 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
if len(resources) == 1 { if len(resources) == 1 {
var cluster strategy.AssignedCluster var cluster strategy.AssignedCluster
cluster.ParticipantId = resources[0].ParticipantId cluster.ClusterId = resources[0].ClusterId
cluster.Name = resources[0].Name
cluster.Replicas = 1 cluster.Replicas = 1
return &strategy.SingleAssignment{Cluster: &cluster}, nil return &strategy.SingleAssignment{Cluster: &cluster}, nil
} }
@ -89,7 +88,11 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
switch as.option.StrategyName { switch as.option.StrategyName {
case strategy.REPLICATION: case strategy.REPLICATION:
strategy := strategy.NewReplicationStrategy(&param.ReplicationParams{Params: params, Replicas: 1}) var clusterIds []string
for _, resource := range resources {
clusterIds = append(clusterIds, resource.ClusterId)
}
strategy := strategy.NewReplicationStrategy(clusterIds, 1)
return strategy, nil return strategy, nil
case strategy.RESOURCES_PRICING: case strategy.RESOURCES_PRICING:
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1}) strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
@ -111,32 +114,47 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
return nil, errors.New("clusters is nil") return nil, errors.New("clusters is nil")
} }
for i := len(clusters) - 1; i >= 0; i-- {
if clusters[i].Replicas == 0 {
clusters = append(clusters[:i], clusters[i+1:]...)
}
}
if len(clusters) == 0 {
return nil, errors.New("clusters is nil")
}
var wg sync.WaitGroup var wg sync.WaitGroup
var results []*AiResult var results []*AiResult
var errs []error var errs []interface{}
var ch = make(chan *AiResult, len(clusters)) var ch = make(chan *AiResult, len(clusters))
var errCh = make(chan error, len(clusters)) var errCh = make(chan interface{}, len(clusters))
executorMap := *as.AiExecutor executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
for _, cluster := range clusters { for _, cluster := range clusters {
c := cluster c := cluster
if cluster.Replicas == 0 {
continue
}
wg.Add(1) wg.Add(1)
go func() { go func() {
opt, _ := cloneAiOption(as.option) opt, _ := cloneAiOption(as.option)
resp, err := executorMap[c.Name].Execute(as.ctx, opt) resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt)
if err != nil { if err != nil {
errCh <- err e := struct {
err error
clusterId string
}{
err: err,
clusterId: c.ClusterId,
}
errCh <- e
wg.Done() wg.Done()
return return
} }
result, _ := convertType(resp) result, _ := convertType(resp)
result.Replica = c.Replicas result.Replica = c.Replicas
result.ClusterId = strconv.FormatInt(c.ParticipantId, 10) result.ClusterId = c.ClusterId
result.Strategy = as.option.StrategyName
ch <- result ch <- result
wg.Done() wg.Done()
@ -150,10 +168,29 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
errs = append(errs, e) errs = append(errs, e)
} }
if len(errs) != 0 { if len(errs) == len(clusters) {
return nil, errors.New("submit task failed") return nil, errors.New("submit task failed")
} }
if len(errs) != 0 {
var msg string
for _, err := range errs {
e := (err).(struct {
err error
clusterId string
})
msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
}
for s := range ch {
if s.Msg != "" {
msg += fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
} else {
msg += fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId)
}
}
return nil, errors.New(msg)
}
for s := range ch { for s := range ch {
// TODO: database operation // TODO: database operation
results = append(results, s) results = append(results, s)
@ -164,19 +201,28 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) { func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
var wg sync.WaitGroup var wg sync.WaitGroup
var ch = make(chan *collector.ResourceStats, len(*as.ResourceCollector)) var clustersNum = len(as.AiService.AiCollectorAdapterMap[as.option.AdapterId])
var errCh = make(chan error, len(*as.ResourceCollector)) var ch = make(chan *collector.ResourceStats, clustersNum)
var errCh = make(chan interface{}, clustersNum)
var resourceSpecs []*collector.ResourceStats var resourceSpecs []*collector.ResourceStats
var errs []error var errs []interface{}
for _, resourceCollector := range *as.ResourceCollector { for s, resourceCollector := range as.AiService.AiCollectorAdapterMap[as.option.AdapterId] {
wg.Add(1) wg.Add(1)
rc := resourceCollector rc := resourceCollector
id := s
go func() { go func() {
spec, err := rc.GetResourceStats(as.ctx) spec, err := rc.GetResourceStats(as.ctx)
if err != nil { if err != nil {
errCh <- err e := struct {
err error
clusterId string
}{
err: err,
clusterId: id,
}
errCh <- e
wg.Done() wg.Done()
return return
} }
@ -196,13 +242,22 @@ func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats,
errs = append(errs, e) errs = append(errs, e)
} }
if len(errs) != 0 { if len(errs) == clustersNum {
return nil, errors.New("get resources failed") return nil, errors.New("get resources failed")
} }
if len(resourceSpecs) == 0 { if len(errs) != 0 {
return nil, errors.New("no resource found") var msg string
for _, err := range errs {
e := (err).(struct {
err error
clusterId string
})
msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
}
return nil, errors.New(msg)
} }
return resourceSpecs, nil return resourceSpecs, nil
} }

View File

@ -1,7 +1,8 @@
package option package option
type AiOption struct { type AiOption struct {
AiClusterId string // shuguangAi /octopus ClusterId AdapterId string
ClusterIds []string
TaskName string TaskName string
ResourceType string // cpu/gpu/compute card ResourceType string // cpu/gpu/compute card
CpuCoreNum int64 CpuCoreNum int64

View File

@ -1,11 +1,14 @@
package service package service
import ( import (
"github.com/zeromicro/go-zero/zrpc"
"gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/config"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/executor"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice" "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/imagesservice"
"gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice" "gitlink.org.cn/jcce-pcm/pcm-participant-modelarts/client/modelartsservice"
@ -18,30 +21,60 @@ const (
SHUGUANGAI = "shuguangAi" SHUGUANGAI = "shuguangAi"
) )
func InitAiClusterMap(octopusRpc octopusclient.Octopus, modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, aCRpc hpcacclient.HpcAC, storages *database.AiStorage) (*map[string]executor.AiExecutor, *map[string]collector.AiCollector) { type AiService struct {
clusters, _ := storages.GetClustersByAdapterId("1777144940459986944") AiExecutorAdapterMap map[string]map[string]executor.AiExecutor
AiCollectorAdapterMap map[string]map[string]collector.AiCollector
}
func NewAiService(conf *config.Config, storages *database.AiStorage) (*AiService, error) {
var aiType = "1"
adapterIds, err := storages.GetAdapterIdsByType(aiType)
if err != nil {
return nil, err
}
aiService := &AiService{
AiExecutorAdapterMap: make(map[string]map[string]executor.AiExecutor),
AiCollectorAdapterMap: make(map[string]map[string]collector.AiCollector),
}
for _, id := range adapterIds {
clusters, err := storages.GetClustersByAdapterId(id)
if err != nil {
return nil, err
}
exeClusterMap, colClusterMap := InitAiClusterMap(conf, clusters.List)
aiService.AiExecutorAdapterMap[id] = exeClusterMap
aiService.AiCollectorAdapterMap[id] = colClusterMap
}
return aiService, nil
}
func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector) {
executorMap := make(map[string]executor.AiExecutor) executorMap := make(map[string]executor.AiExecutor)
collectorMap := make(map[string]collector.AiCollector) collectorMap := make(map[string]collector.AiCollector)
for _, c := range clusters.List { for _, c := range clusters {
switch c.Name { switch c.Name {
case OCTOPUS: case OCTOPUS:
id, _ := strconv.ParseInt(c.Id, 10, 64) id, _ := strconv.ParseInt(c.Id, 10, 64)
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id) octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
collectorMap[c.Nickname] = octopus collectorMap[c.Id] = octopus
executorMap[c.Nickname] = octopus executorMap[c.Id] = octopus
case MODELARTS: case MODELARTS:
id, _ := strconv.ParseInt(c.Id, 10, 64) id, _ := strconv.ParseInt(c.Id, 10, 64)
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Nickname, id) modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Nickname, id)
collectorMap[c.Nickname] = modelarts collectorMap[c.Id] = modelarts
executorMap[c.Nickname] = modelarts executorMap[c.Id] = modelarts
case SHUGUANGAI: case SHUGUANGAI:
id, _ := strconv.ParseInt(c.Id, 10, 64) id, _ := strconv.ParseInt(c.Id, 10, 64)
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id) sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
collectorMap[c.Nickname] = sgai collectorMap[c.Id] = sgai
executorMap[c.Nickname] = sgai executorMap[c.Id] = sgai
} }
} }
return &executorMap, &collectorMap return executorMap, collectorMap
} }

View File

@ -9,18 +9,18 @@ type AiCollector interface {
} }
type ResourceStats struct { type ResourceStats struct {
ParticipantId int64 ClusterId string
Name string Name string
CpuCoreAvail int64 CpuCoreAvail int64
CpuCoreTotal int64 CpuCoreTotal int64
MemAvail float64 MemAvail float64
MemTotal float64 MemTotal float64
DiskAvail float64 DiskAvail float64
DiskTotal float64 DiskTotal float64
GpuAvail int64 GpuAvail int64
CardsAvail []*Card CardsAvail []*Card
CpuCoreHours float64 CpuCoreHours float64
Balance float64 Balance float64
} }
type Card struct { type Card struct {

View File

@ -33,15 +33,14 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
for _, res := range ps.resources { for _, res := range ps.resources {
if opt.ResourceType == "cpu" { if opt.ResourceType == "cpu" {
if res.CpuCoreHours <= 0 { if res.CpuCoreHours <= 0 {
cluster := &AssignedCluster{ParticipantId: res.ParticipantId, Name: res.Name, Replicas: ps.replicas} cluster := &AssignedCluster{ClusterId: res.ClusterId, Replicas: ps.replicas}
results = append(results, cluster) results = append(results, cluster)
return results, nil return results, nil
} }
if res.CpuCoreHours > maxCpuCoreHoursAvailable { if res.CpuCoreHours > maxCpuCoreHoursAvailable {
maxCpuCoreHoursAvailable = res.CpuCoreHours maxCpuCoreHoursAvailable = res.CpuCoreHours
assignedCluster.Name = res.Name assignedCluster.ClusterId = res.ClusterId
assignedCluster.ParticipantId = res.ParticipantId
assignedCluster.Replicas = ps.replicas assignedCluster.Replicas = ps.replicas
} }
} }
@ -56,8 +55,7 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
} }
if maxCurrentCardHours > maxCardHoursAvailable { if maxCurrentCardHours > maxCardHoursAvailable {
maxCardHoursAvailable = maxCurrentCardHours maxCardHoursAvailable = maxCurrentCardHours
assignedCluster.Name = res.Name assignedCluster.ClusterId = res.ClusterId
assignedCluster.ParticipantId = res.ParticipantId
assignedCluster.Replicas = ps.replicas assignedCluster.Replicas = ps.replicas
} }
} }

View File

@ -1,23 +0,0 @@
package param
import "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity"
type ReplicationParams struct {
Replicas int32
*Params
}
func (r *ReplicationParams) GetReplicas() int32 {
return r.Replicas
}
func (r *ReplicationParams) GetParticipants() []*entity.Participant {
var participants []*entity.Participant
for _, resource := range r.Resources {
participants = append(participants, &entity.Participant{
Participant_id: resource.ParticipantId,
Name: resource.Name,
})
}
return participants
}

View File

@ -2,6 +2,7 @@ package param
import ( import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing"
"strconv"
) )
type ResourcePricingParams struct { type ResourcePricingParams struct {
@ -21,8 +22,9 @@ func (r *ResourcePricingParams) GetTask() *providerPricing.Task {
func (r *ResourcePricingParams) GetProviders() []*providerPricing.Provider { func (r *ResourcePricingParams) GetProviders() []*providerPricing.Provider {
var providerList []*providerPricing.Provider var providerList []*providerPricing.Provider
for _, resource := range r.Resources { for _, resource := range r.Resources {
id, _ := strconv.ParseInt(resource.ClusterId, 10, 64)
provider := providerPricing.NewProvider( provider := providerPricing.NewProvider(
resource.ParticipantId, id,
float64(resource.CpuCoreAvail), float64(resource.CpuCoreAvail),
resource.MemAvail, resource.MemAvail,
resource.DiskAvail, 0.0, 0.0, 0.0) resource.DiskAvail, 0.0, 0.0, 0.0)

View File

@ -2,33 +2,31 @@ package strategy
import ( import (
"errors" "errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
) )
type ReplicationStrategy struct { type ReplicationStrategy struct {
replicas int32 replicas int32
participants []*entity.Participant clusterIds []string
} }
func NewReplicationStrategy(params *param.ReplicationParams) *ReplicationStrategy { func NewReplicationStrategy(clusterIds []string, replicas int32) *ReplicationStrategy {
return &ReplicationStrategy{replicas: params.GetReplicas(), return &ReplicationStrategy{clusterIds: clusterIds,
participants: params.GetParticipants(), replicas: replicas,
} }
} }
func (ps *ReplicationStrategy) Schedule() ([]*AssignedCluster, error) { func (r *ReplicationStrategy) Schedule() ([]*AssignedCluster, error) {
if ps.replicas < 1 { if r.replicas < 1 {
return nil, errors.New("replicas must be greater than 0") return nil, errors.New("replicas must be greater than 0")
} }
if ps.participants == nil { if len(r.clusterIds) == 0 {
return nil, errors.New("participantId must be set") return nil, errors.New("clusterIds must be set")
} }
var results []*AssignedCluster var results []*AssignedCluster
for _, p := range ps.participants { for _, c := range r.clusterIds {
cluster := &AssignedCluster{ParticipantId: p.Participant_id, Name: p.Name, Replicas: ps.replicas} cluster := &AssignedCluster{ClusterId: c, Replicas: r.replicas}
results = append(results, cluster) results = append(results, cluster)
} }
return results, nil return results, nil

View File

@ -18,6 +18,7 @@ import (
"errors" "errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/algorithm/providerPricing"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
"strconv"
) )
type PricingStrategy struct { type PricingStrategy struct {
@ -154,7 +155,7 @@ func (ps *PricingStrategy) Schedule() ([]*AssignedCluster, error) {
if e == 0 { if e == 0 {
continue continue
} }
cluster := &AssignedCluster{ParticipantId: ps.ProviderList[i].Pid, Replicas: int32(e)} cluster := &AssignedCluster{ClusterId: strconv.FormatInt(ps.ProviderList[i].Pid, 10), Replicas: int32(e)}
results = append(results, cluster) results = append(results, cluster)
} }

View File

@ -29,7 +29,7 @@ func (s *StaticWeightStrategy) Schedule() ([]*AssignedCluster, error) {
weights := make([]*weightDistributing.Weight, 0) weights := make([]*weightDistributing.Weight, 0)
for k, v := range s.staticWeightMap { for k, v := range s.staticWeightMap {
weight := &weightDistributing.Weight{ weight := &weightDistributing.Weight{
Name: k, Id: k,
Weight: v, Weight: v,
} }
weights = append(weights, weight) weights = append(weights, weight)
@ -39,7 +39,7 @@ func (s *StaticWeightStrategy) Schedule() ([]*AssignedCluster, error) {
var results []*AssignedCluster var results []*AssignedCluster
for _, weight := range weights { for _, weight := range weights {
cluster := &AssignedCluster{ParticipantId: weight.Id, Name: weight.Name, Replicas: weight.Replica} cluster := &AssignedCluster{ClusterId: weight.Id, Replicas: weight.Replica}
results = append(results, cluster) results = append(results, cluster)
} }

View File

@ -18,9 +18,8 @@ type Strategy interface {
} }
type AssignedCluster struct { type AssignedCluster struct {
ParticipantId int64 ClusterId string
Name string Replicas int32
Replicas int32
} }
func GetStrategyNames() []string { func GetStrategyNames() []string {

View File

@ -5,7 +5,6 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/entity"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param"
"testing" "testing"
) )
@ -17,15 +16,15 @@ func TestReplication(t *testing.T) {
} }
rsc := []*collector.ResourceStats{ rsc := []*collector.ResourceStats{
{ {
ParticipantId: 1, ClusterId: "1",
Name: "test1", Name: "test1",
}, },
{ {
ParticipantId: 1, ClusterId: "2",
Name: "test2"}, Name: "test2"},
{ {
ParticipantId: 1, ClusterId: "3",
Name: "test3"}, Name: "test3"},
} }
tests := []struct { tests := []struct {
name string name string
@ -47,8 +46,11 @@ func TestReplication(t *testing.T) {
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
params := &param.Params{Resources: rsc} var clusterIds []string
repl := strategy.NewReplicationStrategy(&param.ReplicationParams{Params: params, Replicas: tt.replica}) for _, stats := range rsc {
clusterIds = append(clusterIds, stats.ClusterId)
}
repl := strategy.NewReplicationStrategy(clusterIds, 0)
schedule, err := repl.Schedule() schedule, err := repl.Schedule()
if err != nil { if err != nil {
return return

View File

@ -283,11 +283,11 @@ func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.Resource
} }
resourceStats := &collector.ResourceStats{ resourceStats := &collector.ResourceStats{
ParticipantId: o.participantId, ClusterId: strconv.FormatInt(o.participantId, 10),
Name: o.platform, Name: o.platform,
Balance: balance, Balance: balance,
CardsAvail: cards, CardsAvail: cards,
CpuCoreHours: cpuHours, CpuCoreHours: cpuHours,
} }
return resourceStats, nil return resourceStats, nil

View File

@ -26,6 +26,8 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"strconv" "strconv"
"strings" "strings"
"sync"
"time"
) )
const ( const (
@ -266,96 +268,144 @@ func (s *ShuguangAi) QuerySpecs(ctx context.Context) (interface{}, error) {
} }
func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) { func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
//balance var wg sync.WaitGroup
userReq := &hpcAC.GetUserInfoReq{} wg.Add(4)
userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq) var cBalance = make(chan float64)
if err != nil { var cMemTotal = make(chan float64)
return nil, err var cTotalCpu = make(chan int64)
}
balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
//resource limit resourceStats := &collector.ResourceStats{
limitReq := &hpcAC.QueueReq{} ClusterId: strconv.FormatInt(s.participantId, 10),
limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq) Name: s.platform,
if err != nil {
return nil, err
} }
totalCpu := limitResp.Data.AccountMaxCpu
totalDcu := limitResp.Data.AccountMaxDcu
//disk
//diskReq := &hpcAC.ParaStorQuotaReq{}
//diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
//if err != nil {
// return nil, err
//}
//
//totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3)
//availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3)
//memory
nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil)
if err != nil {
return nil, err
}
memSize := common.RoundFloat(float64(nodeResp.Data.MemorySize)*KB*KB, 3) // MB to BYTES
//resources being occupied
memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil)
if err != nil {
return nil, err
}
var CpuCoreAvail int64
var MemAvail float64
if len(memberJobResp.Data) != 0 {
CpuCoreAvail = totalCpu
MemAvail = memSize
} else {
var cpuCoreUsed int64
var memUsed float64
for _, datum := range memberJobResp.Data {
cpuCoreUsed += datum.CpuCore
}
memUsed = float64(cpuCoreUsed * 2 * KB * KB * KB) // 2 GB per cpu core
if cpuCoreUsed > totalCpu {
CpuCoreAvail = 0
} else {
CpuCoreAvail = totalCpu - cpuCoreUsed
}
if memUsed > memSize {
MemAvail = 0
} else {
MemAvail = memSize - memUsed
}
}
//usable hours
var cards []*collector.Card
cardHours := common.RoundFloat(balance/DCUPRICEPERHOUR, 3)
cpuHours := common.RoundFloat(balance/CPUCOREPRICEPERHOUR, 3)
dcu := &collector.Card{ dcu := &collector.Card{
Platform: SHUGUANGAI, Platform: SHUGUANGAI,
Type: CARD, Type: CARD,
Name: DCU, Name: DCU,
TOpsAtFp16: DCU_TOPS, TOpsAtFp16: DCU_TOPS,
CardHours: cardHours,
CardNum: int32(totalDcu),
} }
//balance
go func() {
userReq := &hpcAC.GetUserInfoReq{}
userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq)
if err != nil {
return
}
balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
resourceStats.Balance = balance
cBalance <- balance
}()
//resource limit
go func() {
limitReq := &hpcAC.QueueReq{}
limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq)
if err != nil {
wg.Done()
return
}
totalCpu := limitResp.Data.AccountMaxCpu
totalDcu := limitResp.Data.AccountMaxDcu
dcu.CardNum = int32(totalDcu)
resourceStats.CpuCoreTotal = totalCpu
cTotalCpu <- totalCpu
wg.Done()
}()
//disk
go func() {
diskReq := &hpcAC.ParaStorQuotaReq{}
diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
if err != nil {
wg.Done()
return
}
totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3)
availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3)
resourceStats.DiskTotal = totalDisk
resourceStats.DiskAvail = availDisk
wg.Done()
}()
//memory
go func() {
nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil)
if err != nil {
wg.Done()
return
}
memSize := common.RoundFloat(float64(nodeResp.Data.MemorySize)*KB*KB, 3) // MB to BYTES
resourceStats.MemTotal = memSize
cMemTotal <- memSize
wg.Done()
}()
//resources being occupied
go func() {
memSize := <-cMemTotal
totalCpu := <-cTotalCpu
memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil)
if err != nil {
wg.Done()
return
}
var cpuCoreAvail int64
var memAvail float64
if len(memberJobResp.Data) != 0 {
cpuCoreAvail = totalCpu
memAvail = memSize
} else {
var cpuCoreUsed int64
var memUsed float64
for _, datum := range memberJobResp.Data {
cpuCoreUsed += datum.CpuCore
}
memUsed = float64(cpuCoreUsed * 2 * KB * KB * KB) // 2 GB per cpu core
if cpuCoreUsed > totalCpu {
cpuCoreAvail = 0
} else {
cpuCoreAvail = totalCpu - cpuCoreUsed
}
if memUsed > memSize {
memAvail = 0
} else {
memAvail = memSize - memUsed
}
}
resourceStats.CpuCoreAvail = cpuCoreAvail
resourceStats.MemAvail = memAvail
wg.Done()
}()
//usable hours
var balance float64
select {
case v := <-cBalance:
balance = v
case <-time.After(2 * time.Second):
return nil, errors.New("get balance rpc call failed")
}
var cards []*collector.Card
cardHours := common.RoundFloat(balance/DCUPRICEPERHOUR, 3)
cpuHours := common.RoundFloat(balance/CPUCOREPRICEPERHOUR, 3)
dcu.CardHours = cardHours
resourceStats.CpuCoreHours = cpuHours
wg.Wait()
cards = append(cards, dcu) cards = append(cards, dcu)
resourceStats := &collector.ResourceStats{ resourceStats.CardsAvail = cards
ParticipantId: s.participantId,
Name: s.platform,
Balance: balance,
CpuCoreTotal: totalCpu,
CpuCoreAvail: CpuCoreAvail,
//DiskTotal: totalDisk,
//DiskAvail: availDisk,
MemTotal: memSize,
MemAvail: MemAvail,
CpuCoreHours: cpuHours,
CardsAvail: cards,
}
return resourceStats, nil return resourceStats, nil
} }

View File

@ -16,6 +16,7 @@ package storeLink
import ( import (
"context" "context"
"fmt"
"github.com/pkg/errors" "github.com/pkg/errors"
"gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
"gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
@ -127,21 +128,29 @@ func GetResourceTypes() []string {
return resourceTypes return resourceTypes
} }
func GetDatasetsNames(ctx context.Context, collectorMap *map[string]collector.AiCollector) ([]string, error) { func GetDatasetsNames(ctx context.Context, collectorMap map[string]collector.AiCollector) ([]string, error) {
var wg sync.WaitGroup var wg sync.WaitGroup
var errCh = make(chan error, len(*collectorMap)) var errCh = make(chan interface{}, len(collectorMap))
var errs []error var errs []interface{}
var names []string var names []string
var mu sync.Mutex var mu sync.Mutex
colMap := *collectorMap colMap := collectorMap
for _, col := range colMap { for s, col := range colMap {
wg.Add(1) wg.Add(1)
c := col c := col
id := s
go func() { go func() {
var ns []string var ns []string
specs, err := c.GetDatasetsSpecs(ctx) specs, err := c.GetDatasetsSpecs(ctx)
if err != nil { if err != nil {
errCh <- err e := struct {
err error
clusterId string
}{
err: err,
clusterId: id,
}
errCh <- e
wg.Done() wg.Done()
return return
} }
@ -167,34 +176,54 @@ func GetDatasetsNames(ctx context.Context, collectorMap *map[string]collector.Ai
wg.Wait() wg.Wait()
close(errCh) close(errCh)
if len(errs) == len(colMap) {
return nil, errors.New("get DatasetsNames failed")
}
for e := range errCh { for e := range errCh {
errs = append(errs, e) errs = append(errs, e)
} }
if len(errs) != 0 { if len(errs) != 0 {
return nil, errors.New("get DatasetsNames failed") var msg string
for _, err := range errs {
e := (err).(struct {
err error
clusterId string
})
msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
}
return nil, errors.New(msg)
} }
names = common.RemoveDuplicates(names) names = common.RemoveDuplicates(names)
return names, nil return names, nil
} }
func GetAlgorithms(ctx context.Context, collectorMap *map[string]collector.AiCollector, resourceType string, taskType string, dataset string) ([]string, error) { func GetAlgorithms(ctx context.Context, collectorMap map[string]collector.AiCollector, resourceType string, taskType string, dataset string) ([]string, error) {
var names []string var names []string
var wg sync.WaitGroup var wg sync.WaitGroup
var errCh = make(chan error, len(*collectorMap)) var errCh = make(chan interface{}, len(collectorMap))
var errs []error var errs []interface{}
var mu sync.Mutex var mu sync.Mutex
colMap := *collectorMap colMap := collectorMap
for _, col := range colMap { for s, col := range colMap {
wg.Add(1) wg.Add(1)
c := col c := col
id := s
go func() { go func() {
var ns []string var ns []string
algorithms, err := c.GetAlgorithms(ctx) algorithms, err := c.GetAlgorithms(ctx)
if err != nil { if err != nil {
errCh <- err e := struct {
err error
clusterId string
}{
err: err,
clusterId: id,
}
errCh <- e
wg.Done() wg.Done()
return return
} }
@ -240,10 +269,22 @@ func GetAlgorithms(ctx context.Context, collectorMap *map[string]collector.AiCol
errs = append(errs, e) errs = append(errs, e)
} }
if len(errs) != 0 { if len(errs) == len(colMap) {
return nil, errors.New("get Algorithms failed") return nil, errors.New("get Algorithms failed")
} }
if len(errs) != 0 {
var msg string
for _, err := range errs {
e := (err).(struct {
err error
clusterId string
})
msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
}
return nil, errors.New(msg)
}
names = common.RemoveDuplicates(names) names = common.RemoveDuplicates(names)
return names, nil return names, nil
} }

View File

@ -116,24 +116,28 @@ func NewServiceContext(c config.Config) *ServiceContext {
}) })
// scheduler // scheduler
octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)) //octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf))
aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf)) //aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf))
modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf)) //modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf))
modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf)) //modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf))
storage := &database.AiStorage{DbEngin: dbEngin} storage := &database.AiStorage{DbEngin: dbEngin}
aiExecutor, resourceCollector := service.InitAiClusterMap(octopusRpc, modelArtsRpc, modelArtsImgRpc, aCRpc, storage) aiService, err := service.NewAiService(&c, storage)
scheduler := scheduler.NewSchdlr(resourceCollector, storage, aiExecutor) if err != nil {
logx.Error(err.Error())
return nil
}
scheduler := scheduler.NewSchdlr(aiService, storage)
return &ServiceContext{ return &ServiceContext{
Cron: cron.New(cron.WithSeconds()), Cron: cron.New(cron.WithSeconds()),
DbEngin: dbEngin, DbEngin: dbEngin,
Config: c, Config: c,
RedisClient: redisClient, RedisClient: redisClient,
ModelArtsRpc: modelArtsRpc, ModelArtsRpc: modelartsservice.NewModelArtsService(zrpc.MustNewClient(c.ModelArtsRpcConf)),
ModelArtsImgRpc: modelArtsImgRpc, ModelArtsImgRpc: imagesservice.NewImagesService(zrpc.MustNewClient(c.ModelArtsImgRpcConf)),
CephRpc: cephclient.NewCeph(zrpc.MustNewClient(c.CephRpcConf)), CephRpc: cephclient.NewCeph(zrpc.MustNewClient(c.CephRpcConf)),
ACRpc: aCRpc, ACRpc: hpcacclient.NewHpcAC(zrpc.MustNewClient(c.ACRpcConf)),
OctopusRpc: octopusRpc, OctopusRpc: octopusclient.NewOctopus(zrpc.MustNewClient(c.OctopusRpcConf)),
OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)), OpenstackRpc: openstackclient.NewOpenstack(zrpc.MustNewClient(c.OpenstackRpcConf)),
K8sRpc: kubernetesclient.NewKubernetes(zrpc.MustNewClient(c.K8sNativeConf)), K8sRpc: kubernetesclient.NewKubernetes(zrpc.MustNewClient(c.K8sNativeConf)),
MonitorClient: make(map[int64]tracker.Prometheus), MonitorClient: make(map[int64]tracker.Prometheus),

View File

@ -131,40 +131,22 @@ type TaskYaml struct {
} }
type CommitVmTaskReq struct { type CommitVmTaskReq struct {
Name string `json:"name"` Name string `json:"name"`
NsID string `json:"nsID"` NsID string `json:"nsID"`
Replicas int64 `json:"replicas,optional"` Replicas int64 `json:"replicas,optional"`
MatchLabels map[string]string `json:"matchLabels,optional"` MatchLabels map[string]string `json:"matchLabels,optional"`
Servers []ServerCommit `json:"servers,optional"` AdapterId string `json:"adapterId,optional"`
Platform string `json:"platform,optional"` ClusterType string `json:"clusterType,optional"`
AdapterId string `json:"adapterId,optional"` CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"`
ClusterType string `json:"clusterType,optional"`
} }
type ServerCommit struct { type CreateMulDomainServer struct {
AllCardRunTime string `json:"allCardRunTime"` Platform string `json:"platform,optional"`
FlavorRef string `json:"flavorRef,optional"` Name string `json:"name,optional"`
Name string `json:"name,optional"` Min_count int64 `json:"min_count,optional"`
ImageRef string `json:"imageRef,optional"` ImageRef string `json:"imageRef,optional"`
AccessIPv4 string `json:"accessIPv4,optional"` FlavorRef string `json:"flavorRef,optional"`
AccessIPv6 string `json:"accessIPv6,optional"` Uuid string `json:"uuid,optional"`
AdminPass string `json:"adminPass,optional"`
Availability_zone string `json:"availability_zone,optional"`
Key_name string `json:"key_name,optional"`
Hostname string `json:"hostname,optional"`
Host string `json:"host,optional"`
Networks []Networks `json:"networks,optional"`
}
type Networks struct {
Uuid string `json:"uuid,optional"`
Port string `json:"port,optional"`
Fixed_ip string `json:"fixed_ip,optional"`
Tag string `json:"tag,optional"`
}
type Block_device_mapping_v2Commit struct {
Uuid string `json:"uuid,optional"`
} }
type CommitVmTaskResp struct { type CommitVmTaskResp struct {
@ -5309,13 +5291,15 @@ type ScheduleResp struct {
type ScheduleResult struct { type ScheduleResult struct {
ClusterId string `json:"clusterId"` ClusterId string `json:"clusterId"`
TaskId string `json:"taskId"` TaskId string `json:"taskId"`
Strategy string `json:"strategy"`
Replica int32 `json:"replica"` Replica int32 `json:"replica"`
Msg string `json:"msg"` Msg string `json:"msg"`
} }
type AiOption struct { type AiOption struct {
TaskName string `json:"taskName"` TaskName string `json:"taskName"`
AiClusterId string `json:"aiClusterId,optional"` AdapterId string `json:"adapterId"`
AiClusterIds []string `json:"aiClusterIds"`
ResourceType string `json:"resourceType"` ResourceType string `json:"resourceType"`
Tops float64 `json:"Tops,optional"` Tops float64 `json:"Tops,optional"`
TaskType string `json:"taskType"` TaskType string `json:"taskType"`
@ -5336,6 +5320,10 @@ type AiTaskTypesResp struct {
TaskTypes []string `json:"taskTypes"` TaskTypes []string `json:"taskTypes"`
} }
type AiDatasetsReq struct {
AdapterId string `path:"adapterId"`
}
type AiDatasetsResp struct { type AiDatasetsResp struct {
Datasets []string `json:"datasets"` Datasets []string `json:"datasets"`
} }
@ -5345,6 +5333,7 @@ type AiStrategyResp struct {
} }
type AiAlgorithmsReq struct { type AiAlgorithmsReq struct {
AdapterId string `path:"adapterId"`
ResourceType string `path:"resourceType"` ResourceType string `path:"resourceType"`
TaskType string `path:"taskType"` TaskType string `path:"taskType"`
Dataset string `path:"dataset"` Dataset string `path:"dataset"`
@ -5451,7 +5440,10 @@ type VmInfo struct {
BlockUuid string `json:"block_uuid,omitempty"` BlockUuid string `json:"block_uuid,omitempty"`
SourceType string `json:"source_type,omitempty"` SourceType string `json:"source_type,omitempty"`
DeleteOnTermination bool `json:"delete_on_termination,omitempty"` DeleteOnTermination bool `json:"delete_on_termination,omitempty"`
State string `json:"state,omitempty"` Status string `json:"status,omitempty"`
MinCount string `json:"min_count,omitempty"`
Platform string `json:"platform,omitempty"`
Uuid string `json:"uuid,omitempty"`
} }
type PushTaskInfoReq struct { type PushTaskInfoReq struct {
@ -5468,7 +5460,37 @@ type PushTaskInfoResp struct {
} }
type PushResourceInfoReq struct { type PushResourceInfoReq struct {
AdapterId int64 `json:"adapterId"` AdapterId int64 `json:"adapterId"`
ResourceStats []ResourceStats `json:"resourceStats"`
}
type PushResourceInfoResp struct {
Code int64 `json:"code"`
Msg string `json:"msg"`
}
type ResourceStats struct {
ClusterId int64 `json:"clusterId"`
Name string `json:"name"`
CpuCoreAvail int64 `json:"cpuCoreAvail"`
CpuCoreTotal int64 `json:"cpuCoreTotal"`
MemAvail float64 `json:"memAvail"`
MemTotal float64 `json:"memTotal"`
DiskAvail float64 `json:"diskAvail"`
DiskTotal float64 `json:"diskTotal"`
GpuAvail int64 `json:"gpuAvail"`
CardsAvail []*Card `json:"cardsAvail"`
CpuCoreHours float64 `json:"cpuCoreHours"`
Balance float64 `json:"balance"`
}
type Card struct {
Platform string `json:"platform"`
Type string `json:"type"`
Name string `json:"name"`
TOpsAtFp16 float64 `json:"TOpsAtFp16"`
CardHours float64 `json:"cardHours"`
CardNum int32 `json:"cardNum"`
} }
type CreateAlertRuleReq struct { type CreateAlertRuleReq struct {

2
go.mod
View File

@ -2,6 +2,8 @@ module gitlink.org.cn/JointCloud/pcm-coordinator
go 1.21 go 1.21
retract v0.1.20-0.20240319015239-6ae13da05255
require ( require (
github.com/JCCE-nudt/zero-contrib/zrpc/registry/nacos v0.0.0-20230419021610-13bbc83fbc3c github.com/JCCE-nudt/zero-contrib/zrpc/registry/nacos v0.0.0-20230419021610-13bbc83fbc3c
github.com/Masterminds/squirrel v1.5.4 github.com/Masterminds/squirrel v1.5.4

24
pkg/models/taskvmmodel.go Normal file
View File

@ -0,0 +1,24 @@
package models
import "github.com/zeromicro/go-zero/core/stores/sqlx"
// Compile-time check that customTaskVmModel satisfies TaskVmModel.
var _ TaskVmModel = (*customTaskVmModel)(nil)

type (
	// TaskVmModel is an interface to be customized, add more methods here,
	// and implement the added methods in customTaskVmModel.
	TaskVmModel interface {
		taskVmModel
	}

	// customTaskVmModel embeds the generated defaultTaskVmModel so that
	// custom methods can be added without touching generated code.
	customTaskVmModel struct {
		*defaultTaskVmModel
	}
)
// NewTaskVmModel returns a model for the task_vm database table,
// wrapping the generated default model so custom methods can be added.
func NewTaskVmModel(conn sqlx.SqlConn) TaskVmModel {
	base := newTaskVmModel(conn)
	return &customTaskVmModel{defaultTaskVmModel: base}
}

View File

@ -0,0 +1,107 @@
// Code generated by goctl. DO NOT EDIT.
package models
import (
"context"
"database/sql"
"fmt"
"strings"
"github.com/zeromicro/go-zero/core/stores/builder"
"github.com/zeromicro/go-zero/core/stores/sqlc"
"github.com/zeromicro/go-zero/core/stores/sqlx"
"github.com/zeromicro/go-zero/core/stringx"
)
var (
	// taskVmFieldNames holds the backtick-quoted column names derived from TaskVm's db tags.
	taskVmFieldNames = builder.RawFieldNames(&TaskVm{})
	// taskVmRows is the comma-joined column list used in SELECT statements.
	taskVmRows = strings.Join(taskVmFieldNames, ",")
	// taskVmRowsExpectAutoSet excludes auto-managed columns (id and timestamp columns) for INSERT.
	taskVmRowsExpectAutoSet = strings.Join(stringx.Remove(taskVmFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
	// taskVmRowsWithPlaceHolder renders "col=?,col=?,...,col=?" for UPDATE statements.
	taskVmRowsWithPlaceHolder = strings.Join(stringx.Remove(taskVmFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
)
type (
	// taskVmModel is the generated CRUD interface for the task_vm table.
	taskVmModel interface {
		Insert(ctx context.Context, data *TaskVm) (sql.Result, error)
		FindOne(ctx context.Context, id int64) (*TaskVm, error)
		Update(ctx context.Context, data *TaskVm) error
		Delete(ctx context.Context, id int64) error
	}

	// defaultTaskVmModel is the generated implementation backed by a sqlx connection.
	defaultTaskVmModel struct {
		conn  sqlx.SqlConn
		table string
	}

	// TaskVm maps one row of the task_vm table.
	TaskVm struct {
		Id               int64  `db:"id"`             // id
		ParticipantId    int64  `db:"participant_id"` // participant (P side) id
		TaskId           int64  `db:"task_id"`        // task id
		Name             string `db:"name"`           // virtual machine name
		AdapterId        int64  `db:"adapter_id"`     // id of the adapter executing the task
		ClusterId        int64  `db:"cluster_id"`     // id of the cluster executing the task
		FlavorRef        string `db:"flavor_ref"`     // flavor reference
		ImageRef         string `db:"image_ref"`      // image reference
		Status           string `db:"status"`         // status
		Platform         string `db:"platform"`       // platform
		Description      string `db:"description"`    // description
		AvailabilityZone string `db:"availability_zone"`
		MinCount         int64  `db:"min_count"`    // instance count
		Uuid             string `db:"uuid"`         // network uuid
		StartTime        string `db:"start_time"`   // start time
		RunningTime      string `db:"running_time"` // running time
		Result           string `db:"result"`       // run result
		DeletedAt        string `db:"deleted_at"`   // deletion time
	}
)
// newTaskVmModel builds the generated default model bound to the task_vm table.
func newTaskVmModel(conn sqlx.SqlConn) *defaultTaskVmModel {
	return &defaultTaskVmModel{
		conn:  conn,
		table: "`task_vm`",
	}
}
// withSession returns a copy of the model that executes statements on the
// given session (e.g. inside a transaction) instead of the base connection.
func (m *defaultTaskVmModel) withSession(session sqlx.Session) *defaultTaskVmModel {
	return &defaultTaskVmModel{
		conn:  sqlx.NewSqlConnFromSession(session),
		table: "`task_vm`",
	}
}
// Delete removes the task_vm row with the given primary key id.
// Deleting a non-existent id is not an error (zero rows affected).
func (m *defaultTaskVmModel) Delete(ctx context.Context, id int64) error {
	query := fmt.Sprintf("delete from %s where `id` = ?", m.table)
	_, err := m.conn.ExecCtx(ctx, query, id)
	return err
}
// FindOne fetches the task_vm row with the given primary key id.
// It returns ErrNotFound when no row matches, and any other query error as-is.
func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, error) {
	query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", taskVmRows, m.table)
	var resp TaskVm
	err := m.conn.QueryRowCtx(ctx, &resp, query, id)
	switch err {
	case nil:
		return &resp, nil
	case sqlc.ErrNotFound:
		// Translate the driver-level sentinel into the models package sentinel.
		return nil, ErrNotFound
	default:
		return nil, err
	}
}
// Insert adds a new task_vm row. The id column is auto-set by the database;
// the 17 placeholders match the 17 columns in taskVmRowsExpectAutoSet, in
// struct field order.
func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) {
	query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet)
	ret, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt)
	return ret, err
}
// Update rewrites every non-auto column of the row identified by data.Id;
// argument order must match taskVmRowsWithPlaceHolder, with the id last.
func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error {
	query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder)
	_, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id)
	return err
}
// tableName reports the backtick-quoted table name this model operates on.
func (m *defaultTaskVmModel) tableName() string {
	return m.table
}