added deploy instance logic

Former-commit-id: 84e32bcb695a2a89210a0a912f7bd42649f1dcf3
2024-07-22 15:47:20 +08:00 · 2024-07-22 15:47:20 +08:00 · 61d4ea3af7
parent d24f5dd7a8
commit 61d4ea3af7
7 changed files with 185 additions and 50 deletions
--- a/go.mod
+++ b/go.mod
@@ -18,9 +18,9 @@ require (
 	github.com/prometheus/common v0.54.0
 	github.com/robfig/cron/v3 v3.0.1
 	github.com/zeromicro/go-zero v1.6.5
-	gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1
+	gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249
 	gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe
-	gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35
+	gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330
 	gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
 	gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
 	gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d
--- a/go.sum
+++ b/go.sum
@@ -471,12 +471,12 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M
 github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
 github.com/zeromicro/go-zero v1.6.5 h1:JgsBa25/knnEL7+KQksbwktudIkNQvaAin0nisVgnSA=
 github.com/zeromicro/go-zero v1.6.5/go.mod h1:XjbssEVEzFKueAh0Fie5kNf+cRqFlQQk46fY9WgEGaM=
-gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 h1:Wc9M/vq+9Iw49KZb6mgHj85sysGHjVY+QlHJeZKlx4w=
-gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
+gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 h1:bHJGq5P+8w4fP62PZhIiq/fvOhvDPRtkM4pcmU8OZ1w=
+gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY=
 gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe h1:teAWL7sJszDb1ZA7uptrzPSwJ1OIV840Q1/nrrDsx7E=
 gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA=
-gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0=
-gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
+gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 h1:WxPrFSO6LjDCr+k7nmNFlPst8CtoTHQ2iSjv+D2rNnM=
+gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=
--- a/internal/scheduler/service/inference/inference.go
+++ b/internal/scheduler/service/inference/inference.go
@@ -11,7 +11,10 @@ const (

 type ICluster interface {
 	GetInferUrl(ctx context.Context, option *option.InferOption) ([]*InferUrl, error)
-	GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*DeployInstance, error)
+	GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error)
+	StartInferDeployInstance(ctx context.Context, id string) bool
+	StopInferDeployInstance(ctx context.Context, id string) bool
+	GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error)
 }

 type IInference interface {
@@ -29,4 +32,12 @@ type InferUrl struct {
 }

 type DeployInstance struct {
+	InstanceName string
+	InstanceId   string
+	ModelName    string
+	ModelType    string
+	InferCard    string
+	ClusterName  string
+	Status       string
+	CreatedTime  string
 }
--- a/internal/storeLink/modelarts.go
+++ b/internal/storeLink/modelarts.go
@@ -399,6 +399,18 @@ func (m *ModelArtsLink) GetInferUrl(ctx context.Context, option *option.InferOpt
 	return imageUrls, nil
 }

-func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
+func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
+	return nil, nil
+}
+
+func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
+	return false
+}
+
+func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
+	return false
+}
+
+func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
 	return nil, nil
 }
--- a/internal/storeLink/octopus.go
+++ b/internal/storeLink/octopus.go
@@ -904,6 +904,56 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio
 	return imageUrls, nil
 }

-func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
+func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
+	var insList []*inference.DeployInstance
+	req := &octopus.GetNotebookListReq{
+		Platform:  o.platform,
+		PageIndex: o.pageIndex,
+		PageSize:  o.pageSize,
+		SearchKey: DEPLOY_INSTANCE_PREFIEX,
+	}
+	list, err := o.octopusRpc.GetNotebookList(ctx, req)
+	if err != nil {
+		return nil, err
+	}
+	if list.Error != nil {
+		return nil, errors.New(list.Error.Message)
+	}
+	for _, notebook := range list.Payload.Notebooks {
+		ins := &inference.DeployInstance{}
+		ins.InstanceName = notebook.Name
+		ins.InstanceId = notebook.Id
+		ins.ClusterName = o.platform
+		ins.Status = notebook.Status
+		insList = append(insList, ins)
+	}
+	return insList, nil
+}
+
+func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool {
+	req := &octopus.StartNotebookReq{
+		Platform: o.platform,
+		Id:       id,
+	}
+	resp, err := o.octopusRpc.StartNotebook(ctx, req)
+	if err != nil || !resp.Success {
+		return false
+	}
+	return resp.Success
+}
+
+func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool {
+	req := &octopus.StopNotebookReq{
+		Platform: o.platform,
+		Id:       id,
+	}
+	resp, err := o.octopusRpc.StopNotebook(ctx, req)
+	if err != nil || !resp.Success {
+		return false
+	}
+	return resp.Success
+}
+
+func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
 	return nil, nil
 }
--- a/internal/storeLink/shuguangai.go
+++ b/internal/storeLink/shuguangai.go
@@ -32,24 +32,25 @@ import (
 )

 const (
-	RAM_SIZE_1G         = 1024 // 1G
-	WORKER_NUMBER       = 1
-	DCU                 = "DCU"
-	DCU_TOPS            = 24.5
-	PYTORCH             = "Pytorch"
-	TASK_PYTORCH_PREFIX = "PytorchTask"
-	TENSORFLOW          = "Tensorflow"
-	RESOURCE_GROUP      = "wzhdtest"
-	WorkPath            = "/work/home/acgnnmfbwo/pcmv1/"
-	TimeoutLimit        = "10:00:00"
-	PythonCodePath      = "/work/home/acgnnmfbwo/111111/py/test.py"
-	DATASETS_DIR        = "/work/home/acgnnmfbwo/pcmv1/dataset"
-	ALGORITHM_DIR       = "/work/home/acgnnmfbwo/pcmv1/algorithm"
-	TRAIN_FILE          = "train.py"
-	CPUCOREPRICEPERHOUR = 0.09
-	DCUPRICEPERHOUR     = 2.0
-	KB                  = 1024
-	TIMEOUT             = 20
+	RAM_SIZE_1G           = 1024 // 1G
+	WORKER_NUMBER         = 1
+	DCU                   = "DCU"
+	DCU_TOPS              = 24.5
+	PYTORCH               = "Pytorch"
+	TASK_PYTORCH_PREFIX   = "PytorchTask"
+	TENSORFLOW            = "Tensorflow"
+	RESOURCE_GROUP        = "wzhdtest"
+	WorkPath              = "/work/home/acgnnmfbwo/pcmv1/"
+	TimeoutLimit          = "10:00:00"
+	PythonCodePath        = "/work/home/acgnnmfbwo/111111/py/test.py"
+	DATASETS_DIR          = "/work/home/acgnnmfbwo/pcmv1/dataset"
+	ALGORITHM_DIR         = "/work/home/acgnnmfbwo/pcmv1/algorithm"
+	TRAIN_FILE            = "train.py"
+	CPUCOREPRICEPERHOUR   = 0.09
+	DCUPRICEPERHOUR       = 2.0
+	KB                    = 1024
+	TIMEOUT               = 20
+	DEPLOY_INSTANCE_LIMIT = 100
 )

 var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
@@ -789,6 +790,66 @@ func (s *ShuguangAi) GetInferUrl(ctx context.Context, option *option.InferOption
 	return imageUrls, nil
 }

-func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) {
+func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
+	var insList []*inference.DeployInstance
+	params := &hpcAC.GetInstanceServiceListReqParam{
+		InstanceServiceName: DEPLOY_INSTANCE_PREFIEX,
+		Start:               0,
+		Limit:               DEPLOY_INSTANCE_LIMIT,
+	}
+	req := &hpcacclient.GetInstanceServiceListReq{
+		Param: params,
+	}
+	list, err := s.aCRpc.GetInstanceServiceList(ctx, req)
+	if err != nil {
+		return nil, err
+	}
+	if list.Code != "0" {
+		return nil, errors.New(list.Msg)
+	}
+	for _, datum := range list.Data {
+		ins := &inference.DeployInstance{}
+		ins.InstanceName = datum.InstanceServiceName
+		ins.InstanceId = datum.Id
+		ins.ClusterName = s.platform
+		ins.Status = datum.Status
+		ins.InferCard = DCU
+		ins.CreatedTime = datum.CreateTime
+		insList = append(insList, ins)
+	}
+
+	return insList, nil
+}
+
+func (s *ShuguangAi) StartInferDeployInstance(ctx context.Context, id string) bool {
+	req := &hpcAC.StartInstanceServiceReq{
+		InstanceServiceId: id,
+	}
+	resp, err := s.aCRpc.StartInstanceService(ctx, req)
+	if err != nil || resp.Code != "0" {
+		return false
+	}
+	if resp.Data == id && resp.Code == "0" {
+		return true
+	}
+	return false
+}
+
+func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) bool {
+	ids := []string{id}
+	req := &hpcAC.StopInstanceServiceReq{
+		Ids: ids,
+	}
+	resp, err := s.aCRpc.StopInstanceService(ctx, req)
+	if err != nil || resp.Code != "0" {
+		return false
+	}
+	if resp.Code == "0" {
+		return true
+	}
+	return false
+}
+
+func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
 	return nil, nil
 }
--- a/internal/storeLink/storeLink.go
+++ b/internal/storeLink/storeLink.go
@@ -46,28 +46,29 @@ type Linkage interface {
 }

 const (
-	COLON            = ":"
-	PY_PARAM_PREFIX  = "--"
-	SPACE            = " "
-	UNDERSCORE       = "_"
-	EQUAL            = "="
-	DASH             = "-"
-	FORWARD_SLASH    = "/"
-	COMMA            = ","
-	STAR             = "*"
-	TYPE_OCTOPUS     = "1"
-	TYPE_MODELARTS   = "2"
-	TYPE_SHUGUANGAI  = "3"
-	TYPE_SHUGUANGHPC = "4"
-	OCTOPUS          = "Octopus"
-	MODELARTS        = "Modelarts"
-	SHUGUANGAI       = "ShuguangAi"
-	SHUGUANGHPC      = "ShuguangHpc"
-	CPU              = "cpu"
-	GPU              = "gpu"
-	CARD             = "computeCard"
-	PYTORCH_TASK     = "pytorch"
-	TENSORFLOW_TASK  = "tensorflow"
+	COLON                   = ":"
+	PY_PARAM_PREFIX         = "--"
+	SPACE                   = " "
+	UNDERSCORE              = "_"
+	EQUAL                   = "="
+	DASH                    = "-"
+	FORWARD_SLASH           = "/"
+	COMMA                   = ","
+	STAR                    = "*"
+	TYPE_OCTOPUS            = "1"
+	TYPE_MODELARTS          = "2"
+	TYPE_SHUGUANGAI         = "3"
+	TYPE_SHUGUANGHPC        = "4"
+	OCTOPUS                 = "Octopus"
+	MODELARTS               = "Modelarts"
+	SHUGUANGAI              = "ShuguangAi"
+	SHUGUANGHPC             = "ShuguangHpc"
+	CPU                     = "cpu"
+	GPU                     = "gpu"
+	CARD                    = "computeCard"
+	PYTORCH_TASK            = "pytorch"
+	TENSORFLOW_TASK         = "tensorflow"
+	DEPLOY_INSTANCE_PREFIEX = "infer"
 )

 var (