Merge pull request 'added deployinstance logics' (#266) from tzwang/pcm-coordinator:master into master

Former-commit-id: 9352e4cc0c1dfd7497868fd3a03676be05981e4b
This commit is contained in:
tzwang 2024-07-22 15:48:03 +08:00
commit 11ed4137b0
9 changed files with 311 additions and 50 deletions

4
go.mod
View File

@ -18,9 +18,9 @@ require (
github.com/prometheus/common v0.54.0
github.com/robfig/cron/v3 v3.0.1
github.com/zeromicro/go-zero v1.6.5
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d

8
go.sum
View File

@ -471,12 +471,12 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
github.com/zeromicro/go-zero v1.6.5 h1:JgsBa25/knnEL7+KQksbwktudIkNQvaAin0nisVgnSA=
github.com/zeromicro/go-zero v1.6.5/go.mod h1:XjbssEVEzFKueAh0Fie5kNf+cRqFlQQk46fY9WgEGaM=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 h1:Wc9M/vq+9Iw49KZb6mgHj85sysGHjVY+QlHJeZKlx4w=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 h1:bHJGq5P+8w4fP62PZhIiq/fvOhvDPRtkM4pcmU8OZ1w=
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe h1:teAWL7sJszDb1ZA7uptrzPSwJ1OIV840Q1/nrrDsx7E=
gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 h1:WxPrFSO6LjDCr+k7nmNFlPst8CtoTHQ2iSjv+D2rNnM=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=

View File

@ -11,7 +11,10 @@ const (
// ICluster groups the inference-related operations a cluster backend provides:
// looking up inference URLs and listing, starting, stopping and fetching
// deploy instances.
type ICluster interface {
// GetInferUrl returns inference URL entries for the given option.
GetInferUrl(ctx context.Context, option *option.InferOption) ([]*InferUrl, error)
GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*DeployInstance, error)
// GetInferDeployInstanceList returns the deploy instances reported by the cluster.
GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error)
// StartInferDeployInstance asks the cluster to start the instance with the
// given id; the bool result reports success.
StartInferDeployInstance(ctx context.Context, id string) bool
// StopInferDeployInstance asks the cluster to stop the instance with the
// given id; the bool result reports success.
StopInferDeployInstance(ctx context.Context, id string) bool
// GetInferDeployInstance returns a single deploy instance by id.
GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error)
}
type IInference interface {
@ -29,4 +32,12 @@ type InferUrl struct {
}
// DeployInstance describes one inference deploy instance as reported by a
// cluster backend. Backends fill in only the fields they can provide.
type DeployInstance struct {
// InstanceName is the backend's name for the instance.
InstanceName string
// InstanceId is the backend's identifier for the instance.
InstanceId string
// ModelName is the name of the deployed model.
ModelName string
// ModelType is the type of the deployed model.
ModelType string
// InferCard names the accelerator card used for inference (e.g. "DCU").
InferCard string
// ClusterName is the platform/cluster the instance belongs to.
ClusterName string
// Status is the instance status string as returned by the backend.
Status string
// CreatedTime is the creation time string as returned by the backend.
CreatedTime string
}

View File

@ -399,6 +399,18 @@ func (m *ModelArtsLink) GetInferUrl(ctx context.Context, option *option.InferOpt
return imageUrls, nil
}
func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
// GetInferDeployInstanceList is currently a stub for ModelArts: it performs no
// query and always returns a nil list with a nil error.
func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
return nil, nil
}
// StartInferDeployInstance is currently a stub for ModelArts: it ignores id
// and always reports failure (false).
func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
return false
}
// StopInferDeployInstance is currently a stub for ModelArts: it ignores id
// and always reports failure (false).
func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
return false
}
// GetInferDeployInstance is currently a stub for ModelArts: it ignores id and
// always returns a nil instance with a nil error.
// NOTE(review): callers must nil-check the result even when err is nil.
func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
return nil, nil
}

View File

@ -904,6 +904,56 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio
return imageUrls, nil
}
func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
// GetInferDeployInstanceList asks Octopus for the notebooks matching the
// DEPLOY_INSTANCE_PREFIEX search key (using the link's platform and paging
// settings) and converts each notebook into a DeployInstance.
//
// An RPC failure is returned as-is; an error payload in the response is
// returned as a new error carrying the payload's message.
func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
	resp, err := o.octopusRpc.GetNotebookList(ctx, &octopus.GetNotebookListReq{
		Platform:  o.platform,
		PageIndex: o.pageIndex,
		PageSize:  o.pageSize,
		SearchKey: DEPLOY_INSTANCE_PREFIEX,
	})
	if err != nil {
		return nil, err
	}
	if resp.Error != nil {
		return nil, errors.New(resp.Error.Message)
	}

	// Left nil on purpose so an empty result stays a nil slice.
	var instances []*inference.DeployInstance
	for _, nb := range resp.Payload.Notebooks {
		instances = append(instances, &inference.DeployInstance{
			InstanceName: nb.Name,
			InstanceId:   nb.Id,
			ClusterName:  o.platform,
			Status:       nb.Status,
		})
	}
	return instances, nil
}
// StartInferDeployInstance starts the Octopus notebook identified by id on the
// link's platform. It returns true only when the RPC succeeds and the response
// reports success.
func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool {
	resp, err := o.octopusRpc.StartNotebook(ctx, &octopus.StartNotebookReq{
		Platform: o.platform,
		Id:       id,
	})
	// Short-circuit keeps resp untouched when the call itself failed.
	return err == nil && resp.Success
}
// StopInferDeployInstance stops the Octopus notebook identified by id on the
// link's platform. It returns true only when the RPC succeeds and the response
// reports success.
func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool {
	resp, err := o.octopusRpc.StopNotebook(ctx, &octopus.StopNotebookReq{
		Platform: o.platform,
		Id:       id,
	})
	// Short-circuit keeps resp untouched when the call itself failed.
	return err == nil && resp.Success
}
// GetInferDeployInstance is currently a stub for Octopus: it ignores id and
// always returns a nil instance with a nil error.
// NOTE(review): callers must nil-check the result even when err is nil.
func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
return nil, nil
}

View File

@ -32,24 +32,25 @@ import (
)
const (
RAM_SIZE_1G = 1024 // 1G
WORKER_NUMBER = 1
DCU = "DCU"
DCU_TOPS = 24.5
PYTORCH = "Pytorch"
TASK_PYTORCH_PREFIX = "PytorchTask"
TENSORFLOW = "Tensorflow"
RESOURCE_GROUP = "wzhdtest"
WorkPath = "/work/home/acgnnmfbwo/pcmv1/"
TimeoutLimit = "10:00:00"
PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm"
TRAIN_FILE = "train.py"
CPUCOREPRICEPERHOUR = 0.09
DCUPRICEPERHOUR = 2.0
KB = 1024
TIMEOUT = 20
RAM_SIZE_1G = 1024 // 1G
WORKER_NUMBER = 1
DCU = "DCU"
DCU_TOPS = 24.5
PYTORCH = "Pytorch"
TASK_PYTORCH_PREFIX = "PytorchTask"
TENSORFLOW = "Tensorflow"
RESOURCE_GROUP = "wzhdtest"
WorkPath = "/work/home/acgnnmfbwo/pcmv1/"
TimeoutLimit = "10:00:00"
PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset"
ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm"
TRAIN_FILE = "train.py"
CPUCOREPRICEPERHOUR = 0.09
DCUPRICEPERHOUR = 2.0
KB = 1024
TIMEOUT = 20
DEPLOY_INSTANCE_LIMIT = 100
)
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
@ -789,6 +790,66 @@ func (s *ShuguangAi) GetInferUrl(ctx context.Context, option *option.InferOption
return imageUrls, nil
}
func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
// GetInferDeployInstanceList requests instance services whose name matches
// DEPLOY_INSTANCE_PREFIEX, starting at offset 0 and capped at
// DEPLOY_INSTANCE_LIMIT entries, and converts each into a DeployInstance.
// Every returned instance has InferCard set to DCU.
//
// An RPC failure is returned as-is; a response code other than "0" is
// returned as a new error carrying the response message.
func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
	resp, err := s.aCRpc.GetInstanceServiceList(ctx, &hpcacclient.GetInstanceServiceListReq{
		Param: &hpcAC.GetInstanceServiceListReqParam{
			InstanceServiceName: DEPLOY_INSTANCE_PREFIEX,
			Start:               0,
			Limit:               DEPLOY_INSTANCE_LIMIT,
		},
	})
	if err != nil {
		return nil, err
	}
	if resp.Code != "0" {
		return nil, errors.New(resp.Msg)
	}

	// Left nil on purpose so an empty result stays a nil slice.
	var instances []*inference.DeployInstance
	for _, svc := range resp.Data {
		instances = append(instances, &inference.DeployInstance{
			InstanceName: svc.InstanceServiceName,
			InstanceId:   svc.Id,
			ClusterName:  s.platform,
			Status:       svc.Status,
			InferCard:    DCU,
			CreatedTime:  svc.CreateTime,
		})
	}
	return instances, nil
}
// StartInferDeployInstance starts the instance service identified by id.
// It returns true only when the RPC succeeds, the response code is "0" and the
// response data echoes the requested id; every other outcome yields false.
func (s *ShuguangAi) StartInferDeployInstance(ctx context.Context, id string) bool {
	resp, err := s.aCRpc.StartInstanceService(ctx, &hpcAC.StartInstanceServiceReq{
		InstanceServiceId: id,
	})
	if err != nil {
		return false
	}
	// The code is checked exactly once; the original re-tested it after the
	// guard had already established it.
	return resp.Code == "0" && resp.Data == id
}
// StopInferDeployInstance stops the instance service identified by id (sent as
// a single-element id list). It returns true only when the RPC succeeds and the
// response code is "0".
func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) bool {
	resp, err := s.aCRpc.StopInstanceService(ctx, &hpcAC.StopInstanceServiceReq{
		Ids: []string{id},
	})
	// One expression replaces the guard plus the always-true second check and
	// the unreachable trailing return of the original.
	return err == nil && resp.Code == "0"
}
// GetInferDeployInstance is currently a stub for ShuguangAi: it ignores id and
// always returns a nil instance with a nil error.
// NOTE(review): callers must nil-check the result even when err is nil.
func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
return nil, nil
}

View File

@ -46,28 +46,29 @@ type Linkage interface {
}
const (
COLON = ":"
PY_PARAM_PREFIX = "--"
SPACE = " "
UNDERSCORE = "_"
EQUAL = "="
DASH = "-"
FORWARD_SLASH = "/"
COMMA = ","
STAR = "*"
TYPE_OCTOPUS = "1"
TYPE_MODELARTS = "2"
TYPE_SHUGUANGAI = "3"
TYPE_SHUGUANGHPC = "4"
OCTOPUS = "Octopus"
MODELARTS = "Modelarts"
SHUGUANGAI = "ShuguangAi"
SHUGUANGHPC = "ShuguangHpc"
CPU = "cpu"
GPU = "gpu"
CARD = "computeCard"
PYTORCH_TASK = "pytorch"
TENSORFLOW_TASK = "tensorflow"
COLON = ":"
PY_PARAM_PREFIX = "--"
SPACE = " "
UNDERSCORE = "_"
EQUAL = "="
DASH = "-"
FORWARD_SLASH = "/"
COMMA = ","
STAR = "*"
TYPE_OCTOPUS = "1"
TYPE_MODELARTS = "2"
TYPE_SHUGUANGAI = "3"
TYPE_SHUGUANGHPC = "4"
OCTOPUS = "Octopus"
MODELARTS = "Modelarts"
SHUGUANGAI = "ShuguangAi"
SHUGUANGHPC = "ShuguangHpc"
CPU = "cpu"
GPU = "gpu"
CARD = "computeCard"
PYTORCH_TASK = "pytorch"
TENSORFLOW_TASK = "tensorflow"
DEPLOY_INSTANCE_PREFIEX = "infer"
)
var (

View File

@ -0,0 +1,24 @@
package models
import "github.com/zeromicro/go-zero/core/stores/sqlx"
var _ AiInferDeployInstanceModel = (*customAiInferDeployInstanceModel)(nil)
// AiInferDeployInstanceModel is the customizable model interface: it embeds the
// generated aiInferDeployInstanceModel, and additional methods can be declared
// here and implemented on customAiInferDeployInstanceModel.
type AiInferDeployInstanceModel interface {
	aiInferDeployInstanceModel
}

// customAiInferDeployInstanceModel is the concrete model; it embeds the
// generated default implementation and is the place for hand-written methods.
type customAiInferDeployInstanceModel struct {
	*defaultAiInferDeployInstanceModel
}
// NewAiInferDeployInstanceModel returns a model for the database table.
// NewAiInferDeployInstanceModel builds the model for the ai_infer_deploy_instance
// table on top of the given SQL connection.
func NewAiInferDeployInstanceModel(conn sqlx.SqlConn) AiInferDeployInstanceModel {
	base := newAiInferDeployInstanceModel(conn)
	return &customAiInferDeployInstanceModel{defaultAiInferDeployInstanceModel: base}
}

View File

@ -0,0 +1,102 @@
// Code generated by goctl. DO NOT EDIT.
package models
import (
"context"
"database/sql"
"fmt"
"strings"
"github.com/zeromicro/go-zero/core/stores/builder"
"github.com/zeromicro/go-zero/core/stores/sqlc"
"github.com/zeromicro/go-zero/core/stores/sqlx"
"github.com/zeromicro/go-zero/core/stringx"
)
// Column-name helpers derived once from the AiInferDeployInstance struct tags.
var (
// All column names of the struct, as produced by builder.RawFieldNames.
aiInferDeployInstanceFieldNames = builder.RawFieldNames(&AiInferDeployInstance{})
// Comma-joined list of all columns, used in SELECT statements.
aiInferDeployInstanceRows = strings.Join(aiInferDeployInstanceFieldNames, ",")
// Comma-joined columns with the id and create/update timestamp columns removed; used by Insert.
aiInferDeployInstanceRowsExpectAutoSet = strings.Join(stringx.Remove(aiInferDeployInstanceFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
// Same column subset rendered as "col=?,col=?,...=?"; used as the SET clause by Update.
aiInferDeployInstanceRowsWithPlaceHolder = strings.Join(stringx.Remove(aiInferDeployInstanceFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
)
type (
// aiInferDeployInstanceModel is the generated CRUD contract for the
// ai_infer_deploy_instance table.
aiInferDeployInstanceModel interface {
Insert(ctx context.Context, data *AiInferDeployInstance) (sql.Result, error)
FindOne(ctx context.Context, id int64) (*AiInferDeployInstance, error)
Update(ctx context.Context, data *AiInferDeployInstance) error
Delete(ctx context.Context, id int64) error
}
// defaultAiInferDeployInstanceModel implements aiInferDeployInstanceModel
// on top of a sqlx connection and a fixed table name.
defaultAiInferDeployInstanceModel struct {
conn sqlx.SqlConn
table string
}
// AiInferDeployInstance maps one row of the table; the db tags give the
// column names. All columns except id are nullable types.
AiInferDeployInstance struct {
Id int64 `db:"id"`
InstanceId sql.NullString `db:"instance_id"`
InstanceName sql.NullString `db:"instance_name"`
AdapterId sql.NullInt64 `db:"adapter_id"`
AdapterName sql.NullString `db:"adapter_name"`
ClusterId sql.NullInt64 `db:"cluster_id"`
ClusterName sql.NullString `db:"cluster_name"`
ModelName sql.NullString `db:"model_name"`
ModelType sql.NullString `db:"model_type"`
InferCard sql.NullString `db:"infer_card"`
Status sql.NullString `db:"status"`
CreateTime sql.NullString `db:"create_time"`
UpdateTime sql.NullString `db:"update_time"`
}
)
// newAiInferDeployInstanceModel returns the default model bound to conn and to
// the `ai_infer_deploy_instance` table.
func newAiInferDeployInstanceModel(conn sqlx.SqlConn) *defaultAiInferDeployInstanceModel {
return &defaultAiInferDeployInstanceModel{
conn: conn,
table: "`ai_infer_deploy_instance`",
}
}
// withSession returns a new model whose connection is created from the given
// session via sqlx.NewSqlConnFromSession. The table name is set from the same
// literal as in the constructor rather than copied from m.
func (m *defaultAiInferDeployInstanceModel) withSession(session sqlx.Session) *defaultAiInferDeployInstanceModel {
return &defaultAiInferDeployInstanceModel{
conn: sqlx.NewSqlConnFromSession(session),
table: "`ai_infer_deploy_instance`",
}
}
// Delete removes the row whose `id` equals id and returns any execution error.
func (m *defaultAiInferDeployInstanceModel) Delete(ctx context.Context, id int64) error {
query := fmt.Sprintf("delete from %s where `id` = ?", m.table)
_, err := m.conn.ExecCtx(ctx, query, id)
return err
}
// FindOne loads the row whose `id` equals id. It returns ErrNotFound when the
// query reports sqlc.ErrNotFound, and passes any other error through unchanged.
func (m *defaultAiInferDeployInstanceModel) FindOne(ctx context.Context, id int64) (*AiInferDeployInstance, error) {
query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", aiInferDeployInstanceRows, m.table)
var resp AiInferDeployInstance
err := m.conn.QueryRowCtx(ctx, &resp, query, id)
switch err {
case nil:
return &resp, nil
case sqlc.ErrNotFound:
// Translate the store-level sentinel into this package's ErrNotFound.
return nil, ErrNotFound
default:
return nil, err
}
}
// Insert writes data as a new row and returns the driver result. Only the ten
// columns in aiInferDeployInstanceRowsExpectAutoSet are supplied; data.Id,
// data.CreateTime and data.UpdateTime are not sent.
func (m *defaultAiInferDeployInstanceModel) Insert(ctx context.Context, data *AiInferDeployInstance) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, aiInferDeployInstanceRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.InstanceId, data.InstanceName, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.ModelName, data.ModelType, data.InferCard, data.Status)
return ret, err
}
// Update overwrites the non-auto-set columns of the row whose `id` equals
// data.Id with the values in data; data.CreateTime and data.UpdateTime are not sent.
func (m *defaultAiInferDeployInstanceModel) Update(ctx context.Context, data *AiInferDeployInstance) error {
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, aiInferDeployInstanceRowsWithPlaceHolder)
// data.Id is passed last to bind the trailing WHERE placeholder.
_, err := m.conn.ExecCtx(ctx, query, data.InstanceId, data.InstanceName, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.ModelName, data.ModelType, data.InferCard, data.Status, data.Id)
return err
}
// tableName returns the (backtick-quoted) table name this model operates on.
func (m *defaultAiInferDeployInstanceModel) tableName() string {
return m.table
}