diff --git a/go.mod b/go.mod index 888d5ae4..8c311812 100644 --- a/go.mod +++ b/go.mod @@ -18,9 +18,9 @@ require ( github.com/prometheus/common v0.54.0 github.com/robfig/cron/v3 v3.0.1 github.com/zeromicro/go-zero v1.6.5 - gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 + gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe - gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 + gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d diff --git a/go.sum b/go.sum index 5f9165ae..aaeb23d1 100644 --- a/go.sum +++ b/go.sum @@ -471,12 +471,12 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= github.com/zeromicro/go-zero v1.6.5 h1:JgsBa25/knnEL7+KQksbwktudIkNQvaAin0nisVgnSA= github.com/zeromicro/go-zero v1.6.5/go.mod h1:XjbssEVEzFKueAh0Fie5kNf+cRqFlQQk46fY9WgEGaM= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1 h1:Wc9M/vq+9Iw49KZb6mgHj85sysGHjVY+QlHJeZKlx4w= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240712090657-cfba062e68e1/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249 h1:bHJGq5P+8w4fP62PZhIiq/fvOhvDPRtkM4pcmU8OZ1w= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240722032933-7bd6ab00d249/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe h1:teAWL7sJszDb1ZA7uptrzPSwJ1OIV840Q1/nrrDsx7E= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240620065702-5dcad373c1fe/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330 h1:WxPrFSO6LjDCr+k7nmNFlPst8CtoTHQ2iSjv+D2rNnM= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240718073732-bc5d687f6330/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8= gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0= diff --git a/internal/scheduler/service/inference/inference.go b/internal/scheduler/service/inference/inference.go index 23b2dbc2..10cce695 100644 --- a/internal/scheduler/service/inference/inference.go +++ b/internal/scheduler/service/inference/inference.go @@ -11,7 +11,10 @@ const ( type ICluster interface { GetInferUrl(ctx context.Context, option *option.InferOption) ([]*InferUrl, error) - GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*DeployInstance, error) + GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error) + StartInferDeployInstance(ctx context.Context, id string) bool + StopInferDeployInstance(ctx context.Context, id string) bool + GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error) } type IInference interface { @@ -29,4 +32,12 @@ type InferUrl struct { } type DeployInstance struct { + InstanceName string + InstanceId string + ModelName string + ModelType string + InferCard string + ClusterName string + Status string + CreatedTime string } diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index fc48c8ea..492281de 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -399,6 +399,18 @@ func (m *ModelArtsLink) GetInferUrl(ctx context.Context, option *option.InferOpt return imageUrls, nil } -func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + return nil, nil +} + +func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool { + return false +} + +func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool { + return false +} + +func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/octopus.go b/internal/storeLink/octopus.go index e3b4c73e..290feedf 100644 --- a/internal/storeLink/octopus.go +++ b/internal/storeLink/octopus.go @@ -904,6 +904,56 @@ func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOptio return imageUrls, nil } -func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + var insList []*inference.DeployInstance + req := &octopus.GetNotebookListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + SearchKey: DEPLOY_INSTANCE_PREFIEX, + } + list, err := o.octopusRpc.GetNotebookList(ctx, req) + if err != nil { + return nil, err + } + if list.Error != nil { + return nil, errors.New(list.Error.Message) + } + for _, notebook := range list.Payload.Notebooks { + ins := &inference.DeployInstance{} + ins.InstanceName = notebook.Name + ins.InstanceId = notebook.Id + ins.ClusterName = o.platform + ins.Status = notebook.Status + insList = append(insList, ins) + } + return insList, nil +} + +func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool { + req := &octopus.StartNotebookReq{ + Platform: o.platform, + Id: id, + } + resp, err := o.octopusRpc.StartNotebook(ctx, req) + if err != nil || !resp.Success { + return false + } + return resp.Success +} + +func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool { + req := &octopus.StopNotebookReq{ + Platform: o.platform, + Id: id, + } + resp, err := o.octopusRpc.StopNotebook(ctx, req) + if err != nil || !resp.Success { + return false + } + return resp.Success +} + +func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/shuguangai.go b/internal/storeLink/shuguangai.go index b6647f84..e8ea0e51 100644 --- a/internal/storeLink/shuguangai.go +++ b/internal/storeLink/shuguangai.go @@ -32,24 +32,25 @@ import ( ) const ( - RAM_SIZE_1G = 1024 // 1G - WORKER_NUMBER = 1 - DCU = "DCU" - DCU_TOPS = 24.5 - PYTORCH = "Pytorch" - TASK_PYTORCH_PREFIX = "PytorchTask" - TENSORFLOW = "Tensorflow" - RESOURCE_GROUP = "wzhdtest" - WorkPath = "/work/home/acgnnmfbwo/pcmv1/" - TimeoutLimit = "10:00:00" - PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" - DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" - ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" - TRAIN_FILE = "train.py" - CPUCOREPRICEPERHOUR = 0.09 - DCUPRICEPERHOUR = 2.0 - KB = 1024 - TIMEOUT = 20 + RAM_SIZE_1G = 1024 // 1G + WORKER_NUMBER = 1 + DCU = "DCU" + DCU_TOPS = 24.5 + PYTORCH = "Pytorch" + TASK_PYTORCH_PREFIX = "PytorchTask" + TENSORFLOW = "Tensorflow" + RESOURCE_GROUP = "wzhdtest" + WorkPath = "/work/home/acgnnmfbwo/pcmv1/" + TimeoutLimit = "10:00:00" + PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" + DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" + ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" + TRAIN_FILE = "train.py" + CPUCOREPRICEPERHOUR = 0.09 + DCUPRICEPERHOUR = 2.0 + KB = 1024 + TIMEOUT = 20 + DEPLOY_INSTANCE_LIMIT = 100 ) var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ @@ -789,6 +790,66 @@ func (s *ShuguangAi) GetInferUrl(ctx context.Context, option *option.InferOption return imageUrls, nil } -func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context, option *option.InferOption) ([]*inference.DeployInstance, error) { +func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) { + var insList []*inference.DeployInstance + params := &hpcAC.GetInstanceServiceListReqParam{ + InstanceServiceName: DEPLOY_INSTANCE_PREFIEX, + Start: 0, + Limit: DEPLOY_INSTANCE_LIMIT, + } + req := &hpcacclient.GetInstanceServiceListReq{ + Param: params, + } + list, err := s.aCRpc.GetInstanceServiceList(ctx, req) + if err != nil { + return nil, err + } + if list.Code != "0" { + return nil, errors.New(list.Msg) + } + for _, datum := range list.Data { + ins := &inference.DeployInstance{} + ins.InstanceName = datum.InstanceServiceName + ins.InstanceId = datum.Id + ins.ClusterName = s.platform + ins.Status = datum.Status + ins.InferCard = DCU + ins.CreatedTime = datum.CreateTime + insList = append(insList, ins) + } + + return insList, nil +} + +func (s *ShuguangAi) StartInferDeployInstance(ctx context.Context, id string) bool { + req := &hpcAC.StartInstanceServiceReq{ + InstanceServiceId: id, + } + resp, err := s.aCRpc.StartInstanceService(ctx, req) + if err != nil || resp.Code != "0" { + return false + } + if resp.Data == id && resp.Code == "0" { + return true + } + return false +} + +func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) bool { + ids := []string{id} + req := &hpcAC.StopInstanceServiceReq{ + Ids: ids, + } + resp, err := s.aCRpc.StopInstanceService(ctx, req) + if err != nil || resp.Code != "0" { + return false + } + if resp.Code == "0" { + return true + } + return false +} + +func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) { return nil, nil } diff --git a/internal/storeLink/storeLink.go b/internal/storeLink/storeLink.go index 9dcf782a..ac81553a 100644 --- a/internal/storeLink/storeLink.go +++ b/internal/storeLink/storeLink.go @@ -46,28 +46,29 @@ type Linkage interface { } const ( - COLON = ":" - PY_PARAM_PREFIX = "--" - SPACE = " " - UNDERSCORE = "_" - EQUAL = "=" - DASH = "-" - FORWARD_SLASH = "/" - COMMA = "," - STAR = "*" - TYPE_OCTOPUS = "1" - TYPE_MODELARTS = "2" - TYPE_SHUGUANGAI = "3" - TYPE_SHUGUANGHPC = "4" - OCTOPUS = "Octopus" - MODELARTS = "Modelarts" - SHUGUANGAI = "ShuguangAi" - SHUGUANGHPC = "ShuguangHpc" - CPU = "cpu" - GPU = "gpu" - CARD = "computeCard" - PYTORCH_TASK = "pytorch" - TENSORFLOW_TASK = "tensorflow" + COLON = ":" + PY_PARAM_PREFIX = "--" + SPACE = " " + UNDERSCORE = "_" + EQUAL = "=" + DASH = "-" + FORWARD_SLASH = "/" + COMMA = "," + STAR = "*" + TYPE_OCTOPUS = "1" + TYPE_MODELARTS = "2" + TYPE_SHUGUANGAI = "3" + TYPE_SHUGUANGHPC = "4" + OCTOPUS = "Octopus" + MODELARTS = "Modelarts" + SHUGUANGAI = "ShuguangAi" + SHUGUANGHPC = "ShuguangHpc" + CPU = "cpu" + GPU = "gpu" + CARD = "computeCard" + PYTORCH_TASK = "pytorch" + TENSORFLOW_TASK = "tensorflow" + DEPLOY_INSTANCE_PREFIEX = "infer" ) var (