From d24f5dd7a8c51f1356c7e583439de1039f9e7255 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 22 Jul 2024 15:14:05 +0800 Subject: [PATCH 1/2] add deployinstance model Former-commit-id: 0beb6347de0ee3d21ea1033d692602010f74acca --- pkg/models/aiinferdeployinstancemodel.go | 24 +++++ pkg/models/aiinferdeployinstancemodel_gen.go | 102 +++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 pkg/models/aiinferdeployinstancemodel.go create mode 100644 pkg/models/aiinferdeployinstancemodel_gen.go diff --git a/pkg/models/aiinferdeployinstancemodel.go b/pkg/models/aiinferdeployinstancemodel.go new file mode 100644 index 00000000..59c29a44 --- /dev/null +++ b/pkg/models/aiinferdeployinstancemodel.go @@ -0,0 +1,24 @@ +package models + +import "github.com/zeromicro/go-zero/core/stores/sqlx" + +var _ AiInferDeployInstanceModel = (*customAiInferDeployInstanceModel)(nil) + +type ( + // AiInferDeployInstanceModel is an interface to be customized, add more methods here, + // and implement the added methods in customAiInferDeployInstanceModel. + AiInferDeployInstanceModel interface { + aiInferDeployInstanceModel + } + + customAiInferDeployInstanceModel struct { + *defaultAiInferDeployInstanceModel + } +) + +// NewAiInferDeployInstanceModel returns a model for the database table. +func NewAiInferDeployInstanceModel(conn sqlx.SqlConn) AiInferDeployInstanceModel { + return &customAiInferDeployInstanceModel{ + defaultAiInferDeployInstanceModel: newAiInferDeployInstanceModel(conn), + } +} diff --git a/pkg/models/aiinferdeployinstancemodel_gen.go b/pkg/models/aiinferdeployinstancemodel_gen.go new file mode 100644 index 00000000..d212f5f8 --- /dev/null +++ b/pkg/models/aiinferdeployinstancemodel_gen.go @@ -0,0 +1,102 @@ +// Code generated by goctl. DO NOT EDIT. + +package models + +import ( + "context" + "database/sql" + "fmt" + "strings" + + "github.com/zeromicro/go-zero/core/stores/builder" + "github.com/zeromicro/go-zero/core/stores/sqlc" + "github.com/zeromicro/go-zero/core/stores/sqlx" + "github.com/zeromicro/go-zero/core/stringx" +) + +var ( + aiInferDeployInstanceFieldNames = builder.RawFieldNames(&AiInferDeployInstance{}) + aiInferDeployInstanceRows = strings.Join(aiInferDeployInstanceFieldNames, ",") + aiInferDeployInstanceRowsExpectAutoSet = strings.Join(stringx.Remove(aiInferDeployInstanceFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",") + aiInferDeployInstanceRowsWithPlaceHolder = strings.Join(stringx.Remove(aiInferDeployInstanceFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?" +) + +type ( + aiInferDeployInstanceModel interface { + Insert(ctx context.Context, data *AiInferDeployInstance) (sql.Result, error) + FindOne(ctx context.Context, id int64) (*AiInferDeployInstance, error) + Update(ctx context.Context, data *AiInferDeployInstance) error + Delete(ctx context.Context, id int64) error + } + + defaultAiInferDeployInstanceModel struct { + conn sqlx.SqlConn + table string + } + + AiInferDeployInstance struct { + Id int64 `db:"id"` + InstanceId sql.NullString `db:"instance_id"` + InstanceName sql.NullString `db:"instance_name"` + AdapterId sql.NullInt64 `db:"adapter_id"` + AdapterName sql.NullString `db:"adapter_name"` + ClusterId sql.NullInt64 `db:"cluster_id"` + ClusterName sql.NullString `db:"cluster_name"` + ModelName sql.NullString `db:"model_name"` + ModelType sql.NullString `db:"model_type"` + InferCard sql.NullString `db:"infer_card"` + Status sql.NullString `db:"status"` + CreateTime sql.NullString `db:"create_time"` + UpdateTime sql.NullString `db:"update_time"` + } +) + +func newAiInferDeployInstanceModel(conn sqlx.SqlConn) *defaultAiInferDeployInstanceModel { + return &defaultAiInferDeployInstanceModel{ + conn: conn, + table: "`ai_infer_deploy_instance`", + } +} + +func (m *defaultAiInferDeployInstanceModel) withSession(session sqlx.Session) *defaultAiInferDeployInstanceModel { + return &defaultAiInferDeployInstanceModel{ + conn: sqlx.NewSqlConnFromSession(session), + table: "`ai_infer_deploy_instance`", + } +} + +func (m *defaultAiInferDeployInstanceModel) Delete(ctx context.Context, id int64) error { + query := fmt.Sprintf("delete from %s where `id` = ?", m.table) + _, err := m.conn.ExecCtx(ctx, query, id) + return err +} + +func (m *defaultAiInferDeployInstanceModel) FindOne(ctx context.Context, id int64) (*AiInferDeployInstance, error) { + query := fmt.Sprintf("select %s from %s where `id` = ? limit 1", aiInferDeployInstanceRows, m.table) + var resp AiInferDeployInstance + err := m.conn.QueryRowCtx(ctx, &resp, query, id) + switch err { + case nil: + return &resp, nil + case sqlc.ErrNotFound: + return nil, ErrNotFound + default: + return nil, err + } +} + +func (m *defaultAiInferDeployInstanceModel) Insert(ctx context.Context, data *AiInferDeployInstance) (sql.Result, error) { + query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, aiInferDeployInstanceRowsExpectAutoSet) + ret, err := m.conn.ExecCtx(ctx, query, data.InstanceId, data.InstanceName, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.ModelName, data.ModelType, data.InferCard, data.Status) + return ret, err +} + +func (m *defaultAiInferDeployInstanceModel) Update(ctx context.Context, data *AiInferDeployInstance) error { + query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, aiInferDeployInstanceRowsWithPlaceHolder) + _, err := m.conn.ExecCtx(ctx, query, data.InstanceId, data.InstanceName, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.ModelName, data.ModelType, data.InferCard, data.Status, data.Id) + return err +} + +func (m *defaultAiInferDeployInstanceModel) tableName() string { + return m.table +} From 61d4ea3af7238f2156add2a547f07a8ab7d81de1 Mon Sep 17 00:00:00 2001 From: tzwang Date: Mon, 22 Jul 2024 15:47:20 +0800 Subject: [PATCH 2/2] added deployinstance logics Former-commit-id: 84e32bcb695a2a89210a0a912f7bd42649f1dcf3 --- go.mod | 4 +- go.sum | 8 +- .../scheduler/service/inference/inference.go | 13 ++- internal/storeLink/modelarts.go | 14 ++- internal/storeLink/octopus.go | 52 +++++++++- internal/storeLink/shuguangai.go | 99 +++++++++++++++---- internal/storeLink/storeLink.go | 45 ++++----- 7 files changed, 185 insertions(+), 50 deletions(-) diff --git a/go.mod b/go.mod index 888d5ae4..8c311812 100644 --- a/go.mod +++ b/go.mod @@ -18,9 +18,9 @@ require ( github.com/prometheus/common v0.54.0 github.com/robfig/cron/v3 v3.0.1 github.com/zeromicro/go-zero v1.6.5 - gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 + gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe - gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 + gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d diff --git a/go.sum b/go.sum index 5f9165ae..aaeb23d1 100644 --- a/go.sum +++ b/go.sum @@ -471,12 +471,12 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= github.com/zeromicro/go-zero v1.6.5 h1:JgsBa25/knnEL7+KQksbwktudIkNQvaAin0nisVgnSA= github.com/zeromicro/go-zero v1.6.5/go.mod h1:XjbssEVEzFKueAh0Fie5kNf+cRqFlQQk46fY9WgEGaM= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 h1:Wc9M/vq+9Iw49KZb6mgHj85sysGHjVY+QlHJeZKlx4w= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 h1:bHJGq5P+8w4fP62PZhIiq/fvOhvDPRtkM4pcmU8OZ1w= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe h1:teAWL7sJszDb1ZA7uptrzPSwJ1OIV840Q1/nrrDsx7E= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 h1:WxPrFSO6LjDCr+k7nmNFlPst8CtoTHQ2iSjv+D2rNnM= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8= gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0= diff --git a/internal/scheduler/service/inference/inference.go b/internal/scheduler/service/inference/inference.go index 23b2dbc2..10cce695 100644 --- a/internal/scheduler/service/inference/inference.go +++ b/internal/scheduler/service/inference/inference.go @@ -11,7 +11,10 @@ const ( type ICluster interface { GetInferUrl(ctx context.Context, option *option.InferOption) ([]*InferUrl, error) - GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*DeployInstance, error) + GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error) + StartInferDeployInstance(ctx context.Context, id string) bool + StopInferDeployInstance(ctx context.Context, id string) bool + GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error) } type IInference interface { @@ -29,4 +32,12 @@ type InferUrl struct { } type DeployInstance struct { + InstanceName string + InstanceId string + ModelName string + ModelType string + InferCard string + ClusterName string + Status string + CreatedTime string } diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index fc48c8ea..492281de 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -399,6 +399,18 @@ func (m *ModelArtsLink) GetInferUrl(ctx context.Context, option *option.InferOpt return imageUrls, nil } -func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + return nil, nil +} + +func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool { + return false +} + +func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool { + return false +} + +func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/octopus.go b/internal/storeLink/octopus.go index e3b4c73e..290feedf 100644 --- a/internal/storeLink/octopus.go +++ b/internal/storeLink/octopus.go @@ -904,6 +904,56 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio return imageUrls, nil } -func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + var insList []*inference.DeployInstance + req := &octopus.GetNotebookListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + SearchKey: DEPLOY_INSTANCE_PREFIEX, + } + list, err := o.octopusRpc.GetNotebookList(ctx, req) + if err != nil { + return nil, err + } + if list.Error != nil { + return nil, errors.New(list.Error.Message) + } + for _, notebook := range list.Payload.Notebooks { + ins := &inference.DeployInstance{} + ins.InstanceName = notebook.Name + ins.InstanceId = notebook.Id + ins.ClusterName = o.platform + ins.Status = notebook.Status + insList = append(insList, ins) + } + return insList, nil +} + +func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool { + req := &octopus.StartNotebookReq{ + Platform: o.platform, + Id: id, + } + resp, err := o.octopusRpc.StartNotebook(ctx, req) + if err != nil || !resp.Success { + return false + } + return resp.Success +} + +func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool { + req := &octopus.StopNotebookReq{ + Platform: o.platform, + Id: id, + } + resp, err := o.octopusRpc.StopNotebook(ctx, req) + if err != nil || !resp.Success { + return false + } + return resp.Success +} + +func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/shuguangai.go b/internal/storeLink/shuguangai.go index b6647f84..e8ea0e51 100644 --- a/internal/storeLink/shuguangai.go +++ b/internal/storeLink/shuguangai.go @@ -32,24 +32,25 @@ import ( ) const ( - RAM_SIZE_1G = 1024 // 1G - WORKER_NUMBER = 1 - DCU = "DCU" - DCU_TOPS = 24.5 - PYTORCH = "Pytorch" - TASK_PYTORCH_PREFIX = "PytorchTask" - TENSORFLOW = "Tensorflow" - RESOURCE_GROUP = "wzhdtest" - WorkPath = "/work/home/acgnnmfbwo/pcmv1/" - TimeoutLimit = "10:00:00" - PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" - DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" - ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" - TRAIN_FILE = "train.py" - CPUCOREPRICEPERHOUR = 0.09 - DCUPRICEPERHOUR = 2.0 - KB = 1024 - TIMEOUT = 20 + RAM_SIZE_1G = 1024 // 1G + WORKER_NUMBER = 1 + DCU = "DCU" + DCU_TOPS = 24.5 + PYTORCH = "Pytorch" + TASK_PYTORCH_PREFIX = "PytorchTask" + TENSORFLOW = "Tensorflow" + RESOURCE_GROUP = "wzhdtest" + WorkPath = "/work/home/acgnnmfbwo/pcmv1/" + TimeoutLimit = "10:00:00" + PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" + DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" + ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" + TRAIN_FILE = "train.py" + CPUCOREPRICEPERHOUR = 0.09 + DCUPRICEPERHOUR = 2.0 + KB = 1024 + TIMEOUT = 20 + DEPLOY_INSTANCE_LIMIT = 100 ) var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ @@ -789,6 +790,66 @@ func (s *ShuguangAi) GetInferUrl(ctx context.Context, option *option.InferOption return imageUrls, nil } -func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + var insList []*inference.DeployInstance + params := &hpcAC.GetInstanceServiceListReqParam{ + InstanceServiceName: DEPLOY_INSTANCE_PREFIEX, + Start: 0, + Limit: DEPLOY_INSTANCE_LIMIT, + } + req := &hpcacclient.GetInstanceServiceListReq{ + Param: params, + } + list, err := s.aCRpc.GetInstanceServiceList(ctx, req) + if err != nil { + return nil, err + } + if list.Code != "0" { + return nil, errors.New(list.Msg) + } + for _, datum := range list.Data { + ins := &inference.DeployInstance{} + ins.InstanceName = datum.InstanceServiceName + ins.InstanceId = datum.Id + ins.ClusterName = s.platform + ins.Status = datum.Status + ins.InferCard = DCU + ins.CreatedTime = datum.CreateTime + insList = append(insList, ins) + } + + return insList, nil +} + +func (s *ShuguangAi) StartInferDeployInstance(ctx context.Context, id string) bool { + req := &hpcAC.StartInstanceServiceReq{ + InstanceServiceId: id, + } + resp, err := s.aCRpc.StartInstanceService(ctx, req) + if err != nil || resp.Code != "0" { + return false + } + if resp.Data == id && resp.Code == "0" { + return true + } + return false +} + +func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) bool { + ids := []string{id} + req := &hpcAC.StopInstanceServiceReq{ + Ids: ids, + } + resp, err := s.aCRpc.StopInstanceService(ctx, req) + if err != nil || resp.Code != "0" { + return false + } + if resp.Code == "0" { + return true + } + return false +} + +func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/storeLink.go b/internal/storeLink/storeLink.go index 9dcf782a..ac81553a 100644 --- a/internal/storeLink/storeLink.go +++ b/internal/storeLink/storeLink.go @@ -46,28 +46,29 @@ type Linkage interface { } const ( - COLON = ":" - PY_PARAM_PREFIX = "--" - SPACE = " " - UNDERSCORE = "_" - EQUAL = "=" - DASH = "-" - FORWARD_SLASH = "/" - COMMA = "," - STAR = "*" - TYPE_OCTOPUS = "1" - TYPE_MODELARTS = "2" - TYPE_SHUGUANGAI = "3" - TYPE_SHUGUANGHPC = "4" - OCTOPUS = "Octopus" - MODELARTS = "Modelarts" - SHUGUANGAI = "ShuguangAi" - SHUGUANGHPC = "ShuguangHpc" - CPU = "cpu" - GPU = "gpu" - CARD = "computeCard" - PYTORCH_TASK = "pytorch" - TENSORFLOW_TASK = "tensorflow" + COLON = ":" + PY_PARAM_PREFIX = "--" + SPACE = " " + UNDERSCORE = "_" + EQUAL = "=" + DASH = "-" + FORWARD_SLASH = "/" + COMMA = "," + STAR = "*" + TYPE_OCTOPUS = "1" + TYPE_MODELARTS = "2" + TYPE_SHUGUANGAI = "3" + TYPE_SHUGUANGHPC = "4" + OCTOPUS = "Octopus" + MODELARTS = "Modelarts" + SHUGUANGAI = "ShuguangAi" + SHUGUANGHPC = "ShuguangHpc" + CPU = "cpu" + GPU = "gpu" + CARD = "computeCard" + PYTORCH_TASK = "pytorch" + TENSORFLOW_TASK = "tensorflow" + DEPLOY_INSTANCE_PREFIEX = "infer" ) var (