From 9c6b3801fde3378855306f8b4284f974bc53fa12 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 20 Sep 2024 17:49:28 +0800 Subject: [PATCH] updated shuguang imageinfer func --- go.mod | 3 +- go.sum | 6 +- .../imageInference/imageInference.go | 2 +- .../scheduler/service/inference/inference.go | 6 +- internal/storeLink/modelarts.go | 2 +- internal/storeLink/octopus.go | 2 +- internal/storeLink/shuguangai.go | 151 +++++++++++++----- 7 files changed, 125 insertions(+), 47 deletions(-) diff --git a/go.mod b/go.mod index ca5b7a24..29707f9e 100644 --- a/go.mod +++ b/go.mod @@ -18,13 +18,12 @@ require ( github.com/prometheus/common v0.59.1 github.com/robfig/cron/v3 v3.0.1 github.com/zeromicro/go-zero v1.7.2 - gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437 + gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185 gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877 gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d - gitlink.org.cn/jcce-pcm/utils v0.0.1 go.opentelemetry.io/otel/trace v1.29.0 gonum.org/v1/gonum v0.11.0 google.golang.org/grpc v1.66.0 diff --git a/go.sum b/go.sum index 7900db0e..31c5b5ad 100644 --- a/go.sum +++ b/go.sum @@ -466,8 +466,8 @@ github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= github.com/zeromicro/go-zero v1.7.2 h1:a8lyVOG3KXG4LrAy6ZmtJTJtisX4Ostc4Pst4fE704I= github.com/zeromicro/go-zero v1.7.2/go.mod h1:WFXfF92Exw0O7WECifS6r99JSzv4KEN49x9RhAfgkMc= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437 h1:ta6h9+FU7AQ2fNyQiXrZnMdlNBjOKdyBx4e3RF7BE84= -gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240918015229-59c579d1a437/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185 h1:B+YBB5xHlIAS6ILuaCGQwbOpr/L6LOHAlj9PeFUCetM= +gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240920093406-601f283f0185/go.mod h1:3eECiw9O2bIFkkePlloKyLNXiqBAhOxNrDoGaaGseGY= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877 h1:a+1FpxqLPRojlAkJlAeRhKRbxajymXYgrM+s9bfQx0E= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20240918011543-482dcd609877/go.mod h1:/eOmBFZKWGoabG3sRVkVvIbLwsd2631k4jkUBR6x1AA= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 h1:GaXwr5sgDh0raHjUf9IewTvnRvajYea7zbLsaerYyXo= @@ -478,8 +478,6 @@ gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnz gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5/go.mod h1:97AlUXN13g9UN3+9/DzCHpeoU5sbdyv0IQuTEHNexzQ= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d h1:DHjl/rLuH2gKYtY0MKMGNQDHFT12APg25RlMUQo+tHk= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30= -gitlink.org.cn/jcce-pcm/utils v0.0.1 h1:3PH93Z/JFTH5JRO9MFf3dD1Gnd12aGiIIViWBlQGuhE= -gitlink.org.cn/jcce-pcm/utils v0.0.1/go.mod h1:5cwaaqM0+HK5GXVbYozGlWvgwoUby0KytdvhbwQW1ks= go.etcd.io/etcd/api/v3 v3.5.15 h1:3KpLJir1ZEBrYuV2v+Twaa/e2MdDCEZ/70H+lzEiwsk= go.etcd.io/etcd/api/v3 v3.5.15/go.mod h1:N9EhGzXq58WuMllgH9ZvnEr7SI9pS0k0+DHZezGp7jM= go.etcd.io/etcd/client/pkg/v3 v3.5.15 h1:fo0HpWz/KlHGMCC+YejpiCmyWDEuIpnTDzpJLB5fWlA= diff --git a/internal/scheduler/service/inference/imageInference/imageInference.go b/internal/scheduler/service/inference/imageInference/imageInference.go index ba524db6..ed0a375d 100644 --- a/internal/scheduler/service/inference/imageInference/imageInference.go +++ b/internal/scheduler/service/inference/imageInference/imageInference.go @@ -463,7 +463,7 @@ func getInferResult(url string, file multipart.File, fileName string, clusterId switch clusterType { case storeLink.TYPE_OCTOPUS: r := http.Request{} - result, err := iCluster.GetInferResult(r.Context(), url, file, fileName) + result, err := iCluster.GetImageInferResult(r.Context(), url, file, fileName) if err != nil { return "", err } diff --git a/internal/scheduler/service/inference/inference.go b/internal/scheduler/service/inference/inference.go index 95e2a058..6e1def1b 100644 --- a/internal/scheduler/service/inference/inference.go +++ b/internal/scheduler/service/inference/inference.go @@ -12,13 +12,17 @@ const ( type ICluster interface { GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*ClusterInferUrl, error) - GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) GetInferDeployInstanceList(ctx context.Context) ([]*DeployInstance, error) StartInferDeployInstance(ctx context.Context, id string) bool StopInferDeployInstance(ctx context.Context, id string) bool GetInferDeployInstance(ctx context.Context, id string) (*DeployInstance, error) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool + InferResult +} + +type InferResult interface { + GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) } type IInference interface { diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index 36022ac5..bcb1dd17 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -767,7 +767,7 @@ func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) ( return ins, nil } -func (m *ModelArtsLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { +func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { return "", nil } diff --git a/internal/storeLink/octopus.go b/internal/storeLink/octopus.go index 5fd26e5f..74368949 100644 --- a/internal/storeLink/octopus.go +++ b/internal/storeLink/octopus.go @@ -1183,7 +1183,7 @@ func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*i return ins, nil } -func (o *OctopusLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { +func (o *OctopusLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { stream, err := o.octopusRpc.GetInferResult(ctx) if err != nil { return "", err diff --git a/internal/storeLink/shuguangai.go b/internal/storeLink/shuguangai.go index 429f772d..bac1659a 100644 --- a/internal/storeLink/shuguangai.go +++ b/internal/storeLink/shuguangai.go @@ -47,49 +47,60 @@ const ( PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" + KUNSHAN_DIR = "/public/home/acgnnmfbwo/pcmv1" TRAIN_FILE = "train.py" CPUCOREPRICEPERHOUR = 0.09 DCUPRICEPERHOUR = 2.0 KB = 1024 TIMEOUT = 20 DEPLOY_INSTANCE_LIMIT = 100 + ProtocolType = "HTTP" + ContainerPort = 8881 + JUPYTER = "jupyter" ) -var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ - "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { - CPU: 1, - GPU: 1, - RAM: 2 * RAM_SIZE_1G, - }, - "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": { - CPU: 1, - GPU: 2, - RAM: 2 * RAM_SIZE_1G, - }, - "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": { - CPU: 2, - GPU: 3, - RAM: 4 * RAM_SIZE_1G, - }, - "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": { - CPU: 4, - GPU: 4, - RAM: 8 * RAM_SIZE_1G, - }, - "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": { - CPU: 5, - GPU: 5, - RAM: 10 * RAM_SIZE_1G, - }, -} +var ( + RESOURCESGAIMAP = map[string]ResourceSpecSGAI{ + "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": { + CPU: 1, + GPU: 1, + RAM: 2 * RAM_SIZE_1G, + }, + "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": { + CPU: 1, + GPU: 2, + RAM: 2 * RAM_SIZE_1G, + }, + "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": { + CPU: 2, + GPU: 3, + RAM: 4 * RAM_SIZE_1G, + }, + "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": { + CPU: 4, + GPU: 4, + RAM: 8 * RAM_SIZE_1G, + }, + "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": { + CPU: 5, + GPU: 5, + RAM: 10 * RAM_SIZE_1G, + }, + } -var RESOURCESPECSAI = map[string]string{ - "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G", - "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G", - "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G", - "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G", - "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G", -} + RESOURCESPECSAI = map[string]string{ + "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G", + "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G", + "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G", + "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G", + "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G", + } + + ModelNameCmdMap = map[string]string{ + "blip-image-captioning-base": "pip install transformers python-multipart fastapi uvicorn[standard]; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/blip_image_captioning_base/infer.py", + "imagenet_resnet50": "pip install fastapi uvicorn[standard] python-multipart; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/imagenet_resnet50/infer.py", + } +) type ResourceSpecSGAI struct { CPU int64 @@ -905,15 +916,81 @@ func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*in return ins, nil } -func (s *ShuguangAi) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { +func (s *ShuguangAi) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) { return "", nil } func (s *ShuguangAi) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) { + containerPortInfoList := []*hpcAC.ContainerPortInfoList{ + { + ProtocolType: ProtocolType, + ContainerPort: ContainerPort, + }, + } - return "", nil + desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(DCU) + instanceServiceName := "infer_instance" + UNDERSCORE + utils.RandomString(15) + resourceGroup := "kshdtest" + + script, ok := ModelNameCmdMap[option.ModelName] + if !ok { + return "", errors.New("failed to set cmd, ModelName not exist") + } + + param := &hpcAC.CreateParams{ + AcceleratorType: strings.ToLower(DCU), + ContainerPortInfoList: containerPortInfoList, + CpuNumber: 8, + Description: desc, + //env + GpuNumber: 1, + ImagePath: "11.11.100.6:5000/dcu/admin/base/jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6", + InstanceServiceName: instanceServiceName, + MountInfoList: make([]*hpcAC.MountInfoList, 0), + //originalVersion + RamSize: 10 * RAM_SIZE_1G, + //rdma + ResourceGroup: resourceGroup, + StartScriptActionScope: "all", + StartScriptContent: script, + //startServiceCommand + //taskClassification: "interactive" + TaskNumber: 1, + TaskType: JUPYTER, + TimeoutLimit: "01:00:00", + UseStartScript: true, + //useStartServiceCommand: false + Version: "jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6", + } + + req := &hpcacclient.CreateInstanceServiceReq{ + Data: param, + } + + resp, err := s.aCRpc.CreateInstanceService(ctx, req) + if err != nil { + return "", err + } + if resp.Code != "0" { + return "", errors.New(resp.Msg) + } + + return resp.Data, nil } func (s *ShuguangAi) CheckModelExistence(ctx context.Context, name string, mtype string) bool { - return false + modelPath := "model" + FORWARD_SLASH + name + req := &hpcAC.IsExistFileReq{ + Path: KUNSHAN_DIR + FORWARD_SLASH + modelPath, + } + resp, err := s.aCRpc.IsExistFile(ctx, req) + if err != nil { + return false + } + + if resp.Code != "0" || resp.Data == nil { + return false + } + + return resp.Data.Exist }