From 03cc3fe9789f4de7faf335a4c05121cf65568bd1 Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:31:58 +0800 Subject: [PATCH 01/40] Update README.md Former-commit-id: 835b51a8d270bc7fa8fa8b7e4fba1194aec078e4 --- README.md | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 1903e134..3f96a77c 100644 --- a/README.md +++ b/README.md @@ -7,35 +7,35 @@ ## What is Jointcloud and PCM -  Jointcloud Cooperation Environment (JCCE) comprises Distributed Cloud Trading, Distributed Cloud Community, Distributed Cloud Supervision, and a Blockchain-based Distributed Accounting System. JCCE, with its supporting technologies related to information and value exchange, facilitates breaking the information asymmetry among participants in cloud collaboration. It creates a win-win opportunity for all involved parties and provides robust support for innovative business models in the cloud collaboration computing environment. +Jointcloud Cooperation Environment (JCCE) comprises Distributed Cloud Trading, Distributed Cloud Community, Distributed Cloud Supervision, and a Blockchain-based Distributed Accounting System. JCCE, with its supporting technologies related to information and value exchange, facilitates breaking the information asymmetry among participants in cloud collaboration. It creates a win-win opportunity for all involved parties and provides robust support for innovative business models in the cloud collaboration computing environment. -  The vision of the Peer Cooperation Mechanism (PCM) is to build a set of collaboration mechanisms in the cloud environment where all service providers and demand-side entities can autonomously participate, achieve peer-to-peer transactions, and establish efficient connections and invocations in a non-intrusive manner. PCM focuses more on collaboration among clouds rather than individual clouds themselves. 
+The vision of the Peer Cooperation Mechanism (PCM) is to build a set of collaboration mechanisms in the cloud environment where all service providers and demand-side entities can autonomously participate, achieve peer-to-peer transactions, and establish efficient connections and invocations in a non-intrusive manner. PCM focuses more on collaboration among clouds rather than individual clouds themselves. -  PCM is built upon a standardized software-defined framework. Cloud service providers with physical cloud resources can autonomously define PCM rule frameworks. Other entities adhering to these rule frameworks can then participate in collaborative transactions. Subsequently, tailored virtual private clouds can be created for domain-specific applications, meeting the diverse requirements of various cloud participants, such as central, edge, and terminal infrastructure, enabling them to directly participate in collaboration and transactions. +PCM is built upon a standardized software-defined framework. Cloud service providers with physical cloud resources can autonomously define PCM rule frameworks. Other entities adhering to these rule frameworks can then participate in collaborative transactions. Subsequently, tailored virtual private clouds can be created for domain-specific applications, meeting the diverse requirements of various cloud participants, such as central, edge, and terminal infrastructure, enabling them to directly participate in collaboration and transactions. ## Real-world Issues Addressed by PCM: -- **Performance Improvement**: +- **Performance Improvement** Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. 
The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency. -- **Platform Lock-in Resolution**: +- **Platform Lock-in Resolution** The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in. -- **Reduced Code Development Complexity**: +- **Reduced Code Development Complexity** The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment. - **Reduced Code Development Workload**: -   The PCM framework offers automation features and code generation tools, reducing manual code writing and improving development speed and code quality. With the framework's auto-generation tools, over half of the code is generated, achieving a low code rate of approximately 63%. Developers primarily work on writing descriptive language files to generate the basic structure and then adjust and write logic functions. This enables teams to deliver products more quickly, iterate based on business optimization and user feedback, and be more agile. 
+The PCM framework offers automation features and code generation tools, reducing manual code writing and improving development speed and code quality. With the framework's auto-generation tools, over half of the code is generated, achieving a low code rate of approximately 63%. Developers primarily work on writing descriptive language files to generate the basic structure and then adjust and write logic functions. This enables teams to deliver products more quickly, iterate based on business optimization and user feedback, and be more agile. ## Architecture -  The Coordinator is the core component of the framework, providing heterogeneous abstraction for different Participant-side technology stacks. The framework, oriented towards the user side, primarily involves two major functionalities: resource changes (task submission) and centralized display. After task submission, PCM achieves dynamic task flow through a scheduling center deployed on cloud computing, intelligent computing, and supercomputing clouds. In the centralized display section, PCM mainly collects and aggregates information about resources and tasks from multiple Participant services through Tracker and actively reported by the Participant side. It provides users with a standardized unified interface. Users can view the information provided by PCM on the frontend page or directly access data through the interface. +The Coordinator is the core component of the framework, providing heterogeneous abstraction for different Participant-side technology stacks. The framework, oriented towards the user side, primarily involves two major functionalities: resource changes (task submission) and centralized display. After task submission, PCM achieves dynamic task flow through a scheduling center deployed on cloud computing, intelligent computing, and supercomputing clouds. 
In the centralized display section, PCM mainly collects and aggregates information about resources and tasks from multiple Participant services through Tracker and actively reported by the Participant side. It provides users with a standardized unified interface. Users can view the information provided by PCM on the frontend page or directly access data through the interface. ![PCM架构](docs/images/arch-eng.png) ## PCM deploy @@ -43,22 +43,22 @@ The development environment for PCM requires the installation of Go version 1.18 or above. Please refer to the following instructions to locally start the Kubernetes-related services. The command to fetch the project is as follows: -``` +```bash git clone https://gitlink.org.cn/JointCloud/pcm-coordinator.git ``` -  After executing the following command, the Kubernetes RPC service will be launched locally. For its specific functionalities, please refer to the description in the architecture design mentioned earlier. -``` +After executing the following command, the Kubernetes RPC service will be launched locally. For its specific functionalities, please refer to the description in the architecture design mentioned earlier. +``` bash # get required packages go mod tidy # run rpc service go run pcm-coordinator/rpc/pcmcore.go -``` +``` #### coordinator-api -  The template for the configuration content of the API service is as follows, where the config information for each Participant (P-side) can be configured as needed based on the actual situation. -``` +The template for the configuration content of the API service is as follows, where the config information for each Participant (P-side) can be configured as needed based on the actual situation. +``` bash Name: pcm.core.api Host: 0.0.0.0 Port: 8999 @@ -77,22 +77,22 @@ THRpcConf: #modelarts rpc ModelArtsRpcConf: Endpoints: - 127.0.0.1:2002NonBlock: true -``` -  After executing the following command, the Kubernetes API service will be launched locally. 
Once the service is started, users can make HTTP requests to its interfaces for various functional calls. -``` +``` +After executing the following command, the Kubernetes API service will be launched locally. Once the service is started, users can make HTTP requests to its interfaces for various functional calls. +``` bash # get required packages go mod tidy # run rpc service go run pcm-coordinator/api/pcm.go -``` +``` ## Upcoming Plans -- Pluginization of Scheduling Algorithms and Definition of Basic Resource Templates -- Fundamental Definition of Resource Operation Types -- Rapid Development Mode -- Implementation of First-level Scheduling +- Pluginization of Scheduling Algorithms and Definition of Basic Resource Templates. +- Fundamental Definition of Resource Operation Types. +- Rapid Development Mode. +- Implementation of First-level Scheduling. ## Join&Contribute -  We look forward to your opinions and contributions. Welcome all friends to provide corrections and improvements to the project, collectively building an efficient and stable cloud collaboration mechanism. \ No newline at end of file +We look forward to your opinions and contributions. Welcome all friends to provide corrections and improvements to the project, collectively building an efficient and stable cloud collaboration mechanism. \ No newline at end of file From c15cc7254aab1ed67c90beb564691b791d466979 Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:33:42 +0800 Subject: [PATCH 02/40] Update README.md Former-commit-id: 55b9855d043a04b6c76e1e7a973ab0d80903480c --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f96a77c..320bfeec 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ PCM is built upon a standardized software-defined framework. 
Cloud service provi - **Reduced Code Development Complexity** - The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment. +The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment. - **Reduced Code Development Workload**: From 4838de910c616bc005c5060a872eb0c6c8c0e09d Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:34:04 +0800 Subject: [PATCH 03/40] Update README.md Former-commit-id: c595a58fd98428f428139648d52bb30793e40754 --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 320bfeec..f684f986 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,6 @@ PCM is built upon a standardized software-defined framework. 
Cloud service provi - **Reduced Code Development Complexity** - The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment. From eecb88541386a1dc895b1c2a782f2e01ad7b95c4 Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:34:49 +0800 Subject: [PATCH 04/40] Update README.md Former-commit-id: 71476e27b72c3e4a868d4245dc5000feeaeb3417 --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f684f986..fe692cab 100644 --- a/README.md +++ b/README.md @@ -16,13 +16,11 @@ PCM is built upon a standardized software-defined framework. Cloud service provi ## Real-world Issues Addressed by PCM: - **Performance Improvement** - - Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency. +Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. 
They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency. - **Platform Lock-in Resolution** - - The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in. +The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in. - **Reduced Code Development Complexity** From 4fa42ed43337c2ce5f67c763c8a9becfbdcc1319 Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:35:54 +0800 Subject: [PATCH 05/40] Update README.md Former-commit-id: 406286681bcb8a6715e82da292e354df7b82944f --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fe692cab..01192752 100644 --- a/README.md +++ b/README.md @@ -15,19 +15,19 @@ PCM is built upon a standardized software-defined framework. Cloud service provi ## Real-world Issues Addressed by PCM: -- **Performance Improvement** +- **Performance Improvement** Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. 
They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency. -- **Platform Lock-in Resolution** +- **Platform Lock-in Resolution** The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in. -- **Reduced Code Development Complexity** +- **Reduced Code Development Complexity** The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment. -- **Reduced Code Development Workload**: +- **Reduced Code Development Workload** The PCM framework offers automation features and code generation tools, reducing manual code writing and improving development speed and code quality. With the framework's auto-generation tools, over half of the code is generated, achieving a low code rate of approximately 63%. 
Developers primarily work on writing descriptive language files to generate the basic structure and then adjust and write logic functions. This enables teams to deliver products more quickly, iterate based on business optimization and user feedback, and be more agile. From 92d2b31f76a94b59b7172d355748819d76862e05 Mon Sep 17 00:00:00 2001 From: Tuberrr Date: Mon, 29 Apr 2024 15:39:41 +0800 Subject: [PATCH 06/40] Update README.md Former-commit-id: daf75cec37db83641da53e436d1d9afe51ccb7f1 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 01192752..2b05037d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -![PCM](https://www.gitlink.org.cn/images/avatars/Organization/123822?t=1689062058) +PCM +

Docs | 简体中文 | From 95c43cf2717aed6ed67f24a02ada6d70519be8e0 Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 30 Apr 2024 17:35:56 +0800 Subject: [PATCH 07/40] updated ai db functions Former-commit-id: c1a9ee19503fbedfe980ec175ec0fc846dd26e87 --- .../logic/ai/getcenteroverviewlogic.go | 2 +- api/internal/scheduler/database/aiStorage.go | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/api/internal/logic/ai/getcenteroverviewlogic.go b/api/internal/logic/ai/getcenteroverviewlogic.go index 9f76979d..eb93684b 100644 --- a/api/internal/logic/ai/getcenteroverviewlogic.go +++ b/api/internal/logic/ai/getcenteroverviewlogic.go @@ -61,7 +61,7 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview } } - resp.CardNum = centerNum + resp.CardNum = cardNum resp.PowerInTops = totalTops return resp, nil diff --git a/api/internal/scheduler/database/aiStorage.go b/api/internal/scheduler/database/aiStorage.go index c458c622..018f58b8 100644 --- a/api/internal/scheduler/database/aiStorage.go +++ b/api/internal/scheduler/database/aiStorage.go @@ -169,6 +169,34 @@ func (s *AiStorage) GetClusterResourcesById(clusterId string) (*models.TClusterR return &clusterResource, nil } +func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, clusterType int64, cpuAvail float64, cpuTotal float64, + memAvail float64, memTotal float64, diskAvail float64, diskTotal float64, gpuAvail float64, gpuTotal float64, cardTotal int64, topsTotal float64) error { + cId, err := strconv.ParseInt(clusterId, 10, 64) + if err != nil { + return err + } + clusterResource := models.TClusterResource{ + ClusterId: cId, + ClusterName: clusterName, + ClusterType: clusterType, + CpuAvail: cpuAvail, + CpuTotal: cpuTotal, + MemAvail: memAvail, + MemTotal: memTotal, + DiskAvail: diskAvail, + DiskTotal: diskTotal, + GpuAvail: gpuAvail, + GpuTotal: gpuTotal, + CardTotal: cardTotal, + CardTopsTotal: topsTotal, + } + tx := 
s.DbEngin.Create(&clusterResource) + if tx.Error != nil { + return tx.Error + } + return nil +} + func (s *AiStorage) UpdateTask() error { return nil } From 60359ea095833863ec7b6f0ebb796240804ce7c1 Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 7 May 2024 16:55:35 +0800 Subject: [PATCH 08/40] updated getCentertaskList function Former-commit-id: 0c2585ad3301d2a1dc973cc5a14b5fee041dbcc6 --- .../logic/ai/getcentertasklistlogic.go | 19 +++++++----- .../logic/schedule/schedulesubmitlogic.go | 2 +- api/internal/scheduler/database/aiStorage.go | 10 ++++-- .../scheduler/schedulers/aiScheduler.go | 8 ++--- .../scheduler/service/collector/collector.go | 8 +++++ api/internal/storeLink/modelarts.go | 4 +++ api/internal/storeLink/octopus.go | 31 +++++++++++++++++++ api/internal/storeLink/shuguangai.go | 10 ++++++ pkg/constants/task.go | 1 + pkg/constants/time.go | 3 ++ 10 files changed, 80 insertions(+), 16 deletions(-) create mode 100644 pkg/constants/time.go diff --git a/api/internal/logic/ai/getcentertasklistlogic.go b/api/internal/logic/ai/getcentertasklistlogic.go index 0a800630..6fce581a 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -17,8 +17,6 @@ type GetCenterTaskListLogic struct { svcCtx *svc.ServiceContext } -const layout = "2006-01-02 15:04:05" - func NewGetCenterTaskListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetCenterTaskListLogic { return &GetCenterTaskListLogic{ Logger: logx.WithContext(ctx), @@ -42,12 +40,17 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList } for _, task := range taskList { var elapsed time.Duration - start, _ := time.Parse(layout, task.CommitTime) - if task.Status != constants.Completed { - elapsed = start.Sub(time.Now()) - } else { - end, _ := time.Parse(layout, task.EndTime) - elapsed = start.Sub(end) + switch task.Status { + case constants.Completed: + end, err := time.ParseInLocation(constants.Layout, task.EndTime, 
time.Local) + if err != nil { + elapsed = time.Duration(0) + } + elapsed = end.Sub(task.CommitTime) + case constants.Running: + elapsed = time.Now().Sub(task.CommitTime) + default: + elapsed = 0 } t := &types.AiTask{ diff --git a/api/internal/logic/schedule/schedulesubmitlogic.go b/api/internal/logic/schedule/schedulesubmitlogic.go index e46ffe7d..d3b1cd99 100644 --- a/api/internal/logic/schedule/schedulesubmitlogic.go +++ b/api/internal/logic/schedule/schedulesubmitlogic.go @@ -66,7 +66,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type scheResult.Strategy = r.Strategy scheResult.Replica = r.Replica scheResult.Msg = r.Msg - err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Running, r.Msg) + err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, r.ClusterId, r.TaskId, constants.Saved, r.Msg) if err != nil { return nil, err } diff --git a/api/internal/scheduler/database/aiStorage.go b/api/internal/scheduler/database/aiStorage.go index 018f58b8..e75401f5 100644 --- a/api/internal/scheduler/database/aiStorage.go +++ b/api/internal/scheduler/database/aiStorage.go @@ -61,8 +61,8 @@ func (s *AiStorage) GetAdaptersByType(adapterType string) ([]*types.AdapterInfo, return list, nil } -func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*types.AiTaskDb, error) { - var resp []*types.AiTaskDb +func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, error) { + var resp []*models.TaskAi tx := s.DbEngin.Raw("select * from task_ai where `adapter_id` = ? 
", adapterId).Scan(&resp) if tx.Error != nil { logx.Errorf(tx.Error.Error()) @@ -197,6 +197,10 @@ func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, c return nil } -func (s *AiStorage) UpdateTask() error { +func (s *AiStorage) UpdateAiTask(task models.TaskAi) error { + tx := s.DbEngin.Updates(&task) + if tx.Error != nil { + return tx.Error + } return nil } diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index af50d201..1fa4eb63 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -176,7 +176,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa if len(errs) != 0 { taskId, err := as.AiStorages.SaveTask(as.option.TaskName) if err != nil { - return nil, err + return nil, errors.New("database add failed: " + err.Error()) } var errmsg string for _, err := range errs { @@ -188,7 +188,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa errmsg += msg err := as.AiStorages.SaveAiTask(taskId, as.option, e.clusterId, "", constants.Failed, msg) if err != nil { - return nil, err + return nil, errors.New("database add failed: " + err.Error()) } } for s := range ch { @@ -197,14 +197,14 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa errmsg += msg err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, "", constants.Failed, msg) if err != nil { - return nil, err + return nil, errors.New("database add failed: " + err.Error()) } } else { msg := fmt.Sprintf("clusterId: %v , submitted successfully, taskId: %v \n", s.ClusterId, s.TaskId) errmsg += msg err := as.AiStorages.SaveAiTask(taskId, as.option, s.ClusterId, s.TaskId, constants.Succeeded, msg) if err != nil { - return nil, err + return nil, errors.New("database add failed: " + err.Error()) } } } diff --git a/api/internal/scheduler/service/collector/collector.go 
b/api/internal/scheduler/service/collector/collector.go index 99d34b51..01406901 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -7,6 +7,7 @@ type AiCollector interface { GetDatasetsSpecs(ctx context.Context) ([]*DatasetsSpecs, error) GetAlgorithms(ctx context.Context) ([]*Algorithm, error) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) + GetTrainingTask(ctx context.Context, taskId string) (*Task, error) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error } @@ -45,3 +46,10 @@ type Algorithm struct { Platform string TaskType string } + +type Task struct { + Id string + Start string + End string + Status string +} diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 84bc0dbf..1ae255f2 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -174,6 +174,10 @@ func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, i return "", nil } +func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) { + return nil, nil +} + func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := m.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 22781e2f..f72f63ab 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -19,12 +19,14 @@ import ( "errors" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector" + 
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-octopus/octopus" "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient" "math" "strconv" "strings" + "time" ) type OctopusLink struct { @@ -364,6 +366,35 @@ func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, ins return resp.Content, nil } +func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) { + resp, err := o.QueryTask(ctx, taskId) + if err != nil { + return nil, err + } + jobresp := (resp).(*octopus.GetTrainJobResp) + if !jobresp.Success { + return nil, errors.New(jobresp.Error.Message) + } + var task collector.Task + task.Id = jobresp.Payload.TrainJob.Id + task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout) + task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout) + switch jobresp.Payload.TrainJob.Status { + case "succeeded": + task.Status = constants.Completed + case "failed": + task.Status = constants.Failed + case "running": + task.Status = constants.Running + case "stopped": + task.Status = constants.Stopped + default: + task.Status = "undefined" + } + + return &task, nil +} + func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := o.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 4a84cea4..7f3ee370 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -17,6 +17,7 @@ package storeLink import ( "context" "errors" + "fmt" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" @@ -473,6 +474,15 @@ func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, inst return 
resp.Data.Content, nil } +func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) { + task, err := s.QueryTask(ctx, taskId) + if err != nil { + return nil, err + } + fmt.Println(task) + return nil, nil +} + func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { err := s.GenerateSubmitParams(ctx, option) if err != nil { diff --git a/pkg/constants/task.go b/pkg/constants/task.go index daf8879f..14e4b5fe 100644 --- a/pkg/constants/task.go +++ b/pkg/constants/task.go @@ -26,4 +26,5 @@ const ( WaitRestart = "WaitRestart" WaitPause = "WaitPause" WaitStart = "WaitStart" + Stopped = "Stopped" ) diff --git a/pkg/constants/time.go b/pkg/constants/time.go new file mode 100644 index 00000000..deecc715 --- /dev/null +++ b/pkg/constants/time.go @@ -0,0 +1,3 @@ +package constants + +const Layout = "2006-01-02 15:04:05" From 353c4630f2d6919119177141fc3bf121f4481bdc Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 7 May 2024 17:02:11 +0800 Subject: [PATCH 09/40] fix conflict Former-commit-id: 84868190bf79dbcc20faafe19b03ed34da58a104 --- pkg/constants/task.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/constants/task.go b/pkg/constants/task.go index 14e4b5fe..daf8879f 100644 --- a/pkg/constants/task.go +++ b/pkg/constants/task.go @@ -26,5 +26,4 @@ const ( WaitRestart = "WaitRestart" WaitPause = "WaitPause" WaitStart = "WaitStart" - Stopped = "Stopped" ) From 576df040796f082dd518e47e682f26057201c727 Mon Sep 17 00:00:00 2001 From: jagger Date: Tue, 7 May 2024 17:55:43 +0800 Subject: [PATCH 10/40] fix Signed-off-by: jagger Former-commit-id: 74bdba75d1474724857fd31f7ff7f97853ce7905 --- api/desc/core/pcm-core.api | 2 +- api/internal/logic/core/pagelisttasklogic.go | 18 +++++++++++++++--- api/internal/types/types.go | 2 +- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index fb7fcfce..9e322642 100644 --- 
a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -338,7 +338,7 @@ type ( } TaskModel { - Id int64 `json:"id,omitempty" db:"id"` // id + Id int64 `json:"id,omitempty,string" db:"id"` // id Name string `json:"name,omitempty" db:"name"` // 作业名称 Description string `json:"description,omitempty" db:"description"` // 作业描述 Status string `json:"status,omitempty" db:"status"` // 作业状态 diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index dd9293fe..cb456c09 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -3,6 +3,7 @@ package core import ( "context" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" + "time" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" @@ -28,7 +29,7 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa limit := req.PageSize offset := req.PageSize * (req.PageNum - 1) resp = &types.PageResult{} - var list []types.TaskModel + var list []*types.TaskModel db := l.svcCtx.DbEngin.Model(&types.TaskModel{}).Table("task") db = db.Where("deleted_at is null") @@ -48,8 +49,19 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa if err != nil { return nil, result.NewDefaultError(err.Error()) } - - resp.List = list + const layout = "2006-01-02 15:04:05" + for _, model := range list { + if model.EndTime != "" && model.StartTime != "" { + starTime, _ := time.Parse(layout, model.StartTime) + endTime, _ := time.Parse(layout, model.EndTime) + model.RunningTime = int64(endTime.Sub(starTime).Seconds()) + } + if model.StartTime != "" { + starTime, _ := time.Parse(layout, model.StartTime) + model.RunningTime = int64(time.Now().Sub(starTime).Seconds()) + } + } + resp.List = &list resp.PageSize = req.PageSize resp.PageNum = req.PageNum resp.Total = total diff --git a/api/internal/types/types.go 
b/api/internal/types/types.go index 3f8820ad..a9a6127b 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -261,7 +261,7 @@ type PageTaskReq struct { } type TaskModel struct { - Id int64 `json:"id,omitempty" db:"id"` // id + Id int64 `json:"id,omitempty,string" db:"id"` // id Name string `json:"name,omitempty" db:"name"` // 作业名称 Description string `json:"description,omitempty" db:"description"` // 作业描述 Status string `json:"status,omitempty" db:"status"` // 作业状态 From 10e21a94990fcb098e98c0c920036175a0821a59 Mon Sep 17 00:00:00 2001 From: tzwang Date: Tue, 7 May 2024 20:05:00 +0800 Subject: [PATCH 11/40] updated gettasklist logic Former-commit-id: af8ea56c03fc44ba7c881c3bf9c5850765ec2b69 --- .../logic/ai/getcentertasklistlogic.go | 46 ++++++++++++++++++- .../logic/schedule/schedulesubmitlogic.go | 11 ++++- api/internal/scheduler/database/aiStorage.go | 31 ++++++++++--- .../scheduler/schedulers/aiScheduler.go | 8 +++- api/internal/storeLink/shuguangai.go | 16 +++++-- pkg/constants/task.go | 1 + 6 files changed, 98 insertions(+), 15 deletions(-) diff --git a/api/internal/logic/ai/getcentertasklistlogic.go b/api/internal/logic/ai/getcentertasklistlogic.go index 6fce581a..a0d8c9fe 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -3,6 +3,8 @@ package ai import ( "context" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" + "strconv" + "sync" "time" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" @@ -27,12 +29,15 @@ func NewGetCenterTaskListLogic(ctx context.Context, svcCtx *svc.ServiceContext) func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskListResp, err error) { resp = &types.CenterTaskListResp{} + var mu sync.RWMutex adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1") if err != nil { return nil, err } + l.updateAiTaskStatus(&mu, adapterList) + for _, adapter := range adapterList { taskList, err := 
l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id) if err != nil { @@ -46,7 +51,11 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList if err != nil { elapsed = time.Duration(0) } - elapsed = end.Sub(task.CommitTime) + start, err := time.ParseInLocation(constants.Layout, task.StartTime, time.Local) + if err != nil { + elapsed = time.Duration(0) + } + elapsed = end.Sub(start) case constants.Running: elapsed = time.Now().Sub(task.CommitTime) default: @@ -64,3 +73,38 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList return resp, nil } + +func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, list []*types.AdapterInfo) { + var wg sync.WaitGroup + for _, adapter := range list { + mu.RLock() + taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id) + mu.RUnlock() + if err != nil { + continue + } + for _, task := range taskList { + t := task + wg.Add(1) + go func() { + trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) + if err != nil { + wg.Done() + return + } + t.Status = trainingTask.Status + t.StartTime = trainingTask.Start + t.EndTime = trainingTask.End + mu.Lock() + err = l.svcCtx.Scheduler.AiStorages.UpdateAiTask(t) + mu.Unlock() + if err != nil { + wg.Done() + return + } + wg.Done() + }() + } + } + wg.Wait() +} diff --git a/api/internal/logic/schedule/schedulesubmitlogic.go b/api/internal/logic/schedule/schedulesubmitlogic.go index d3b1cd99..b2aa78f5 100644 --- a/api/internal/logic/schedule/schedulesubmitlogic.go +++ b/api/internal/logic/schedule/schedulesubmitlogic.go @@ -54,11 +54,18 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type switch opt.GetOptionType() { case option.AI: - id, err := l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName) + rs := (results).([]*schedulers.AiResult) + var synergystatus 
int64 + if len(rs) > 1 { + synergystatus = 1 + } + strategyCode, err := l.svcCtx.Scheduler.AiStorages.GetStrategyCode(req.AiOption.Strategy) + + id, err := l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName, strategyCode, synergystatus) if err != nil { return nil, err } - rs := (results).([]*schedulers.AiResult) + for _, r := range rs { scheResult := &types.ScheduleResult{} scheResult.ClusterId = r.ClusterId diff --git a/api/internal/scheduler/database/aiStorage.go b/api/internal/scheduler/database/aiStorage.go index e75401f5..89dbbaa3 100644 --- a/api/internal/scheduler/database/aiStorage.go +++ b/api/internal/scheduler/database/aiStorage.go @@ -71,13 +71,15 @@ func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, e return resp, nil } -func (s *AiStorage) SaveTask(name string) (int64, error) { +func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64) (int64, error) { // 构建主任务结构体 taskModel := models.Task{ - Status: constants.Saved, - Description: "ai task", - Name: name, - CommitTime: time.Now(), + Status: constants.Saved, + Description: "ai task", + Name: name, + SynergyStatus: synergyStatus, + Strategy: strategyCode, + CommitTime: time.Now(), } // 保存任务数据到数据库 tx := s.DbEngin.Create(&taskModel) @@ -197,10 +199,25 @@ func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, c return nil } -func (s *AiStorage) UpdateAiTask(task models.TaskAi) error { - tx := s.DbEngin.Updates(&task) +func (s *AiStorage) UpdateAiTask(task *models.TaskAi) error { + tx := s.DbEngin.Updates(task) if tx.Error != nil { return tx.Error } return nil } + +func (s *AiStorage) GetStrategyCode(name string) (int64, error) { + var strategy int64 + sqlStr := `select t_dict_item.item_value + from t_dict + left join t_dict_item on t_dict.id = t_dict_item.dict_id + where item_text = ? 
+ and t_dict.dict_code = 'schedule_Strategy'` + //查询调度策略 + err := s.DbEngin.Raw(sqlStr, name).Scan(&strategy).Error + if err != nil { + return strategy, nil + } + return strategy, nil +} diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index 1fa4eb63..76b1a6b9 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -174,10 +174,16 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa } if len(errs) != 0 { - taskId, err := as.AiStorages.SaveTask(as.option.TaskName) + var synergystatus int64 + if len(clusters) > 1 { + synergystatus = 1 + } + strategyCode, err := as.AiStorages.GetStrategyCode(as.option.StrategyName) + taskId, err := as.AiStorages.SaveTask(as.option.TaskName, strategyCode, synergystatus) if err != nil { return nil, errors.New("database add failed: " + err.Error()) } + var errmsg string for _, err := range errs { e := (err).(struct { diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 7f3ee370..4dbd8fcc 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -17,7 +17,6 @@ package storeLink import ( "context" "errors" - "fmt" "gitlink.org.cn/JointCloud/pcm-ac/hpcAC" "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/common" @@ -475,12 +474,21 @@ func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, inst } func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) { - task, err := s.QueryTask(ctx, taskId) + resp, err := s.QueryTask(ctx, taskId) if err != nil { return nil, err } - fmt.Println(task) - return nil, nil + jobresp := (resp).(*hpcAC.GetPytorchTaskResp) + if jobresp.Code != "0" { + return nil, errors.New(jobresp.Msg) + } + var task collector.Task + task.Id = jobresp.Data.Id + task.Start = 
jobresp.Data.StartTime + task.End = jobresp.Data.EndTime + task.Status = jobresp.Data.Status + + return &task, nil } func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) { diff --git a/pkg/constants/task.go b/pkg/constants/task.go index 0ec079f3..a401cd46 100644 --- a/pkg/constants/task.go +++ b/pkg/constants/task.go @@ -27,4 +27,5 @@ const ( WaitPause = "WaitPause" WaitStart = "WaitStart" Pending = "Pending" + Stopped = "Stopped" ) From 92532221523c449efab48a51c7fc8d5882b98e0a Mon Sep 17 00:00:00 2001 From: jagger Date: Wed, 8 May 2024 17:19:51 +0800 Subject: [PATCH 12/40] fix Signed-off-by: jagger Former-commit-id: ab7b8cfac57c4efd3f76090d1f85dc43f2b51293 --- api/desc/core/pcm-core.api | 1 + .../logic/cloud/commitgeneraltasklogic.go | 18 +++++----- .../logic/core/counttaskstatuslogic.go | 2 +- api/internal/logic/core/pagelisttasklogic.go | 12 +++---- api/internal/logic/core/taskdetailslogic.go | 2 +- .../scheduler/schedulers/cloudScheduler.go | 3 +- api/internal/types/types.go | 33 ++++++++++--------- pkg/models/taskmodel_gen.go | 30 ++++++++--------- pkg/utils/timeutils/time.go | 4 ++- 9 files changed, 55 insertions(+), 50 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 9e322642..973f5944 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -354,6 +354,7 @@ type ( NsID string `json:"nsId,omitempty" db:"ns_id"` TenantId string `json:"tenantId,omitempty" db:"tenant_id"` CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"` + AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } ) diff --git a/api/internal/logic/cloud/commitgeneraltasklogic.go b/api/internal/logic/cloud/commitgeneraltasklogic.go index cf8842b9..0162b832 100644 --- a/api/internal/logic/cloud/commitgeneraltasklogic.go +++ b/api/internal/logic/cloud/commitgeneraltasklogic.go @@ -89,14 +89,14 @@ func (l 
*CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er //查询调度策略 err = tx.Raw(sqlStr, req.Strategy).Scan(&strategy).Error taskModel := models.Task{ - Id: utils.GenSnowflakeID(), - Status: constants.Pending, - Name: req.Name, - CommitTime: time.Now(), - YamlString: strings.Join(req.ReqBody, "\n---\n"), - TaskTypeDict: 0, - SynergyStatus: synergyStatus, - Strategy: strategy, + Id: utils.GenSnowflakeID(), + Status: constants.Saved, + Name: req.Name, + CommitTime: time.Now(), + YamlString: strings.Join(req.ReqBody, "\n---\n"), + AdapterTypeDict: 0, + SynergyStatus: synergyStatus, + Strategy: strategy, } var taskClouds []cloud.TaskCloudModel for _, r := range rs { @@ -109,7 +109,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er taskCloud.AdapterId = uint(adapterId) taskCloud.ClusterId = uint(clusterId) taskCloud.ClusterName = r.ClusterName - taskCloud.Status = constants.Pending + taskCloud.Status = constants.Saved taskCloud.YamlString = string(unString) taskCloud.Kind = sStruct.GetKind() taskCloud.Namespace = sStruct.GetNamespace() diff --git a/api/internal/logic/core/counttaskstatuslogic.go b/api/internal/logic/core/counttaskstatuslogic.go index 5cedea7f..3abe9797 100644 --- a/api/internal/logic/core/counttaskstatuslogic.go +++ b/api/internal/logic/core/counttaskstatuslogic.go @@ -30,7 +30,7 @@ func (l *CountTaskStatusLogic) CountTaskStatus() (resp *types.TaskStatusResp, er COUNT(CASE WHEN status = 'Succeeded' THEN 1 END) AS Succeeded, COUNT(CASE WHEN status = 'Failed' THEN 1 END) AS Failed, COUNT(CASE WHEN status = 'Running' THEN 1 END) AS Running, - COUNT(CASE WHEN status = 'Pause' THEN 1 END) AS Pause + COUNT(CASE WHEN status = 'Saved' THEN 1 END) AS Saved FROM task;` err = l.svcCtx.DbEngin.Raw(sqlStr).Scan(&resp).Error if err != nil { diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index cb456c09..ce8ae65d 100644 --- 
a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -3,6 +3,7 @@ package core import ( "context" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils" "time" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" @@ -49,16 +50,15 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa if err != nil { return nil, result.NewDefaultError(err.Error()) } - const layout = "2006-01-02 15:04:05" for _, model := range list { if model.EndTime != "" && model.StartTime != "" { - starTime, _ := time.Parse(layout, model.StartTime) - endTime, _ := time.Parse(layout, model.EndTime) - model.RunningTime = int64(endTime.Sub(starTime).Seconds()) + startTime := timeutils.TimeStringToGoTime(model.StartTime) + endTime := timeutils.TimeStringToGoTime(model.EndTime) + model.RunningTime = int64(endTime.Sub(startTime).Seconds()) } if model.StartTime != "" { - starTime, _ := time.Parse(layout, model.StartTime) - model.RunningTime = int64(time.Now().Sub(starTime).Seconds()) + startTime := timeutils.TimeStringToGoTime(model.StartTime) + model.RunningTime = int64(time.Now().Sub(startTime).Seconds()) } } resp.List = &list diff --git a/api/internal/logic/core/taskdetailslogic.go b/api/internal/logic/core/taskdetailslogic.go index 2af8de23..da4acde5 100644 --- a/api/internal/logic/core/taskdetailslogic.go +++ b/api/internal/logic/core/taskdetailslogic.go @@ -34,7 +34,7 @@ func (l *TaskDetailsLogic) TaskDetails(req *types.FId) (resp *types.TaskDetailsR } clusterIds := make([]int64, 0) var cList []*types.ClusterInfo - switch task.TaskTypeDict { + switch task.AdapterTypeDict { case 0: l.svcCtx.DbEngin.Table("task_cloud").Select("cluster_id").Where("task_id", task.Id).Scan(&clusterIds) case 1: diff --git a/api/internal/scheduler/schedulers/cloudScheduler.go b/api/internal/scheduler/schedulers/cloudScheduler.go index 4f00aaba..f14db395 100644 
--- a/api/internal/scheduler/schedulers/cloudScheduler.go +++ b/api/internal/scheduler/schedulers/cloudScheduler.go @@ -24,6 +24,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy/param" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models/cloud" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" @@ -60,7 +61,7 @@ func (as *CloudScheduler) GetNewStructForDb(task *response.TaskInfo, resource st c := cloud.TaskCloudModel{ AdapterId: uint(participantId), TaskId: uint(task.TaskId), - Status: "Pending", + Status: constants.Saved, YamlString: as.yamlString, } utils.Convert(task.Metadata, &c) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 66d11e63..806d8762 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -261,22 +261,23 @@ type PageTaskReq struct { } type TaskModel struct { - Id int64 `json:"id,omitempty,string" db:"id"` // id - Name string `json:"name,omitempty" db:"name"` // 作业名称 - Description string `json:"description,omitempty" db:"description"` // 作业描述 - Status string `json:"status,omitempty" db:"status"` // 作业状态 - Strategy int64 `json:"strategy" db:"strategy"` // 策略 - SynergyStatus int64 `json:"synergyStatus" db:"synergy_status"` // 协同状态(0-未协同、1-已协同) - CommitTime string `json:"commitTime,omitempty" db:"commit_time"` // 提交时间 - StartTime string `json:"startTime,omitempty" db:"start_time"` // 开始时间 - EndTime string `json:"endTime,omitempty" db:"end_time"` // 结束运行时间 - RunningTime int64 `json:"runningTime" db:"running_time"` // 已运行时间(单位秒) - YamlString string `json:"yamlString,omitempty" db:"yaml_string"` - Result string `json:"result,omitempty" db:"result"` // 作业结果 - DeletedAt string `json:"deletedAt,omitempty" gorm:"index" 
db:"deleted_at"` - NsID string `json:"nsId,omitempty" db:"ns_id"` - TenantId string `json:"tenantId,omitempty" db:"tenant_id"` - CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"` + Id int64 `json:"id,omitempty,string" db:"id"` // id + Name string `json:"name,omitempty" db:"name"` // 作业名称 + Description string `json:"description,omitempty" db:"description"` // 作业描述 + Status string `json:"status,omitempty" db:"status"` // 作业状态 + Strategy int64 `json:"strategy" db:"strategy"` // 策略 + SynergyStatus int64 `json:"synergyStatus" db:"synergy_status"` // 协同状态(0-未协同、1-已协同) + CommitTime string `json:"commitTime,omitempty" db:"commit_time"` // 提交时间 + StartTime string `json:"startTime,omitempty" db:"start_time"` // 开始时间 + EndTime string `json:"endTime,omitempty" db:"end_time"` // 结束运行时间 + RunningTime int64 `json:"runningTime" db:"running_time"` // 已运行时间(单位秒) + YamlString string `json:"yamlString,omitempty" db:"yaml_string"` + Result string `json:"result,omitempty" db:"result"` // 作业结果 + DeletedAt string `json:"deletedAt,omitempty" gorm:"index" db:"deleted_at"` + NsID string `json:"nsId,omitempty" db:"ns_id"` + TenantId string `json:"tenantId,omitempty" db:"tenant_id"` + CreateTime string `json:"createTime,omitempty" db:"create_time" gorm:"autoCreateTime"` + AdapterTypeDict int `json:"adapterTypeDict" db:"create_time" gorm:"adapter_type_dict"` //任务类型(对应字典表的值 } type TaskDetailReq struct { diff --git a/pkg/models/taskmodel_gen.go b/pkg/models/taskmodel_gen.go index 7eb2c745..b8ee8603 100644 --- a/pkg/models/taskmodel_gen.go +++ b/pkg/models/taskmodel_gen.go @@ -35,21 +35,21 @@ type ( } Task struct { - Id int64 `db:"id"` // id - Name string `db:"name"` // 作业名称 - Description string `db:"description"` // 作业描述 - Status string `db:"status"` // 作业状态 - Strategy int64 `db:"strategy"` // 策略 - SynergyStatus int64 `db:"synergy_status"` // 协同状态(0-未协同、1-已协同) - CommitTime time.Time `db:"commit_time"` // 提交时间 - StartTime string `db:"start_time"` // 开始时间 - 
EndTime string `db:"end_time"` // 结束运行时间 - RunningTime int64 `db:"running_time"` // 已运行时间(单位秒) - YamlString string `db:"yaml_string"` - Result string `db:"result"` // 作业结果 - DeletedAt gorm.DeletedAt `gorm:"index"` - NsID string `db:"ns_id"` - TaskTypeDict int `db:"task_type_dict"` //任务类型(对应字典表的值) + Id int64 `db:"id"` // id + Name string `db:"name"` // 作业名称 + Description string `db:"description"` // 作业描述 + Status string `db:"status"` // 作业状态 + Strategy int64 `db:"strategy"` // 策略 + SynergyStatus int64 `db:"synergy_status"` // 协同状态(0-未协同、1-已协同) + CommitTime time.Time `db:"commit_time"` // 提交时间 + StartTime string `db:"start_time"` // 开始时间 + EndTime string `db:"end_time"` // 结束运行时间 + RunningTime int64 `db:"running_time"` // 已运行时间(单位秒) + YamlString string `db:"yaml_string"` + Result string `db:"result"` // 作业结果 + DeletedAt gorm.DeletedAt `gorm:"index"` + NsID string `db:"ns_id"` + AdapterTypeDict int `db:"adapter_type_dict"` //任务类型(对应字典表的值) } ) diff --git a/pkg/utils/timeutils/time.go b/pkg/utils/timeutils/time.go index add75a9a..20b0accc 100644 --- a/pkg/utils/timeutils/time.go +++ b/pkg/utils/timeutils/time.go @@ -19,7 +19,9 @@ import ( ) var timeTemplates = []string{ - "2006-01-02 15:04:05", //常规类型 + "2006-01-02T15:04:05Z07:00", //RFC3339 + "2006-01-02 15:04:05", //常规类型 + "2006/01/02T15:04:05Z07:00", //RFC3339 "2006/01/02 15:04:05", "2006-01-02", "2006/01/02", From eedc344b40c4bcd5e30992316f919edb006334f6 Mon Sep 17 00:00:00 2001 From: jagger Date: Wed, 8 May 2024 18:38:54 +0800 Subject: [PATCH 13/40] fix Signed-off-by: jagger Former-commit-id: 7fa31bdee9f6e6b578ab698024cd3ec432c7076b --- api/desc/core/pcm-core.api | 6 +++--- api/internal/logic/core/pushtaskinfologic.go | 17 +++++----------- api/internal/logic/core/taskdetailslogic.go | 21 +++++++------------- api/internal/logic/core/tasklistlogic.go | 5 ++--- api/internal/types/types.go | 6 +++--- pkg/models/taskmodel_gen.go | 4 ++-- 6 files changed, 22 insertions(+), 37 deletions(-) diff --git 
a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 973f5944..da7df371 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -1174,7 +1174,7 @@ type TaskStatusResp { Succeeded int `json:"Succeeded"` Failed int `json:"Failed"` Running int `json:"Running"` - Pause int `json:"Pause"` + Saved int `json:"Saved"` } type TaskDetailsResp { @@ -1182,7 +1182,7 @@ type TaskDetailsResp { description string `json:"description"` StartTime string `json:"startTime"` EndTime string `json:"endTime"` - Strategy int64 `json:"strategy,string"` - SynergyStatus int64 `json:"synergyStatus,string"` + Strategy int64 `json:"strategy"` + SynergyStatus int64 `json:"synergyStatus"` ClusterInfos []*ClusterInfo `json:"clusterInfos"` } \ No newline at end of file diff --git a/api/internal/logic/core/pushtaskinfologic.go b/api/internal/logic/core/pushtaskinfologic.go index df1635d0..5d86bca2 100644 --- a/api/internal/logic/core/pushtaskinfologic.go +++ b/api/internal/logic/core/pushtaskinfologic.go @@ -77,7 +77,6 @@ func syncTask(gorm *gorm.DB, taskId int64) { statusArray := strings.Split(allStatus, ",") if len(removeRepeatedElement(statusArray)) == 1 { updateTask(gorm, taskId, statusArray[0]) - } // 子任务包含失败状态 主任务则失败 if strings.Contains(allStatus, constants.Failed) { @@ -85,26 +84,20 @@ func syncTask(gorm *gorm.DB, taskId int64) { } if strings.Contains(allStatus, constants.Running) { - updateTaskRunning(gorm, taskId, constants.Running) + updateTask(gorm, taskId, constants.Running) } } func updateTask(gorm *gorm.DB, taskId int64, status string) { + now := time.Now() var task models.Task gorm.Where("id = ? ", taskId).Find(&task) if task.Status != status { task.Status = status - gorm.Updates(&task) - } -} - -func updateTaskRunning(gorm *gorm.DB, taskId int64, status string) { - var task models.Task - gorm.Where("id = ? 
", taskId).Find(&task) - if task.Status != status { - task.Status = status - task.StartTime = time.Now().Format("2006-01-02 15:04:05") + if status == constants.Running { + task.StartTime = &now + } gorm.Updates(&task) } } diff --git a/api/internal/logic/core/taskdetailslogic.go b/api/internal/logic/core/taskdetailslogic.go index da4acde5..fb1a3ecd 100644 --- a/api/internal/logic/core/taskdetailslogic.go +++ b/api/internal/logic/core/taskdetailslogic.go @@ -3,11 +3,11 @@ package core import ( "context" "github.com/pkg/errors" - "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" - "gorm.io/gorm" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" + "gorm.io/gorm" "github.com/zeromicro/go-zero/core/logx" ) @@ -27,8 +27,8 @@ func NewTaskDetailsLogic(ctx context.Context, svcCtx *svc.ServiceContext) *TaskD } func (l *TaskDetailsLogic) TaskDetails(req *types.FId) (resp *types.TaskDetailsResp, err error) { - - var task models.Task + resp = &types.TaskDetailsResp{} + task := &models.Task{} if errors.Is(l.svcCtx.DbEngin.Where("id", req.Id).First(&task).Error, gorm.ErrRecordNotFound) { return nil, errors.New("记录不存在") } @@ -48,14 +48,7 @@ func (l *TaskDetailsLogic) TaskDetails(req *types.FId) (resp *types.TaskDetailsR if err != nil { return resp, err } - resp = &types.TaskDetailsResp{ - Name: task.Name, - Description: task.Description, - StartTime: task.StartTime, - EndTime: task.EndTime, - Strategy: task.Strategy, - SynergyStatus: task.SynergyStatus, - ClusterInfos: cList, - } + utils.Convert(&task, &resp) + resp.ClusterInfos = cList return } diff --git a/api/internal/logic/core/tasklistlogic.go b/api/internal/logic/core/tasklistlogic.go index 30818f24..e56be650 100644 --- a/api/internal/logic/core/tasklistlogic.go +++ b/api/internal/logic/core/tasklistlogic.go @@ -93,13 +93,12 @@ func (l 
*TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe pStatus = "Normal" } } - resp.Tasks = append(resp.Tasks, types.Task{ Id: task.Id, Name: task.Name, Status: task.Status, - StartTime: task.StartTime, - EndTime: task.EndTime, + StartTime: task.StartTime.Format("2006-01-02 15:04:05"), + EndTime: task.EndTime.Format("2006-01-02 15:04:05"), ParticipantId: pInfo.Id, ParticipantName: pInfo.Name, ParticipantStatus: pStatus, diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 806d8762..cc217112 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -1078,7 +1078,7 @@ type TaskStatusResp struct { Succeeded int `json:"Succeeded"` Failed int `json:"Failed"` Running int `json:"Running"` - Pause int `json:"Pause"` + Saved int `json:"Saved"` } type TaskDetailsResp struct { @@ -1086,8 +1086,8 @@ type TaskDetailsResp struct { Description string `json:"description"` StartTime string `json:"startTime"` EndTime string `json:"endTime"` - Strategy int64 `json:"strategy,string"` - SynergyStatus int64 `json:"synergyStatus,string"` + Strategy int64 `json:"strategy"` + SynergyStatus int64 `json:"synergyStatus"` ClusterInfos []*ClusterInfo `json:"clusterInfos"` } diff --git a/pkg/models/taskmodel_gen.go b/pkg/models/taskmodel_gen.go index b8ee8603..5dae890c 100644 --- a/pkg/models/taskmodel_gen.go +++ b/pkg/models/taskmodel_gen.go @@ -42,8 +42,8 @@ type ( Strategy int64 `db:"strategy"` // 策略 SynergyStatus int64 `db:"synergy_status"` // 协同状态(0-未协同、1-已协同) CommitTime time.Time `db:"commit_time"` // 提交时间 - StartTime string `db:"start_time"` // 开始时间 - EndTime string `db:"end_time"` // 结束运行时间 + StartTime *time.Time `db:"start_time"` // 开始时间 + EndTime *time.Time `db:"end_time"` // 结束运行时间 RunningTime int64 `db:"running_time"` // 已运行时间(单位秒) YamlString string `db:"yaml_string"` Result string `db:"result"` // 作业结果 From 9b8c7d36e6711afdc95d9869fcdc2f2def8e6649 Mon Sep 17 00:00:00 2001 From: tzwang Date: Wed, 8 May 2024 18:52:02 +0800 
Subject: [PATCH 14/40] updated aicenteroverview logic Former-commit-id: 497d6e912196aedfcc36e392b006cc1c26a17138 --- .../logic/ai/getcenteroverviewlogic.go | 77 ++++++++++++++++++- .../logic/ai/getcentertasklistlogic.go | 14 +++- api/internal/scheduler/database/aiStorage.go | 21 +++-- .../scheduler/service/collector/collector.go | 1 + 4 files changed, 100 insertions(+), 13 deletions(-) diff --git a/api/internal/logic/ai/getcenteroverviewlogic.go b/api/internal/logic/ai/getcenteroverviewlogic.go index eb93684b..94d44cea 100644 --- a/api/internal/logic/ai/getcenteroverviewlogic.go +++ b/api/internal/logic/ai/getcenteroverviewlogic.go @@ -2,10 +2,12 @@ package ai import ( "context" + "github.com/zeromicro/go-zero/core/logx" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" - - "github.com/zeromicro/go-zero/core/logx" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" + "strconv" + "sync" ) type GetCenterOverviewLogic struct { @@ -24,6 +26,8 @@ func NewGetCenterOverviewLogic(ctx context.Context, svcCtx *svc.ServiceContext) func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverviewResp, err error) { resp = &types.CenterOverviewResp{} + var mu sync.RWMutex + ch := make(chan struct{}) var centerNum int32 var taskNum int32 @@ -37,6 +41,8 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview centerNum = int32(len(adapterList)) resp.CenterNum = centerNum + go l.updateClusterResource(&mu, ch, adapterList) + for _, adapter := range adapterList { taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id) if err != nil { @@ -52,7 +58,10 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview continue } for _, cluster := range clusters.List { + mu.RLock() clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(cluster.Id) + mu.RUnlock() + if err != nil { continue } @@ -60,9 
+69,71 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview totalTops += clusterResource.CardTopsTotal } } - resp.CardNum = cardNum resp.PowerInTops = totalTops + <-ch return resp, nil } + +func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { + var wg sync.WaitGroup + for _, adapter := range list { + clusters, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id) + if err != nil { + continue + } + for _, cluster := range clusters.List { + c := cluster + mu.RLock() + clusterResource, err := l.svcCtx.Scheduler.AiStorages.GetClusterResourcesById(c.Id) + mu.RUnlock() + if err != nil { + continue + } + wg.Add(1) + go func() { + stat, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(l.ctx) + if err != nil { + wg.Done() + return + } + clusterType, err := strconv.ParseInt(adapter.Type, 10, 64) + if err != nil { + wg.Done() + return + } + var cardTotal int64 + var topsTotal float64 + for _, card := range stat.CardsAvail { + cardTotal += int64(card.CardNum) + topsTotal += card.TOpsAtFp16 * float64(card.CardNum) + } + + mu.Lock() + if (models.TClusterResource{} == *clusterResource) { + err = l.svcCtx.Scheduler.AiStorages.SaveClusterResources(c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal), + stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal) + if err != nil { + mu.Unlock() + wg.Done() + return + } + } else { + clusterResource.CardTotal = cardTotal + clusterResource.CardTopsTotal = topsTotal + err := l.svcCtx.Scheduler.AiStorages.UpdateClusterResources(clusterResource) + if err != nil { + mu.Unlock() + wg.Done() + return + } + } + mu.Unlock() + wg.Done() + }() + } + } + wg.Wait() + ch <- struct{}{} +} diff --git a/api/internal/logic/ai/getcentertasklistlogic.go 
b/api/internal/logic/ai/getcentertasklistlogic.go index a0d8c9fe..ebca4dc4 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -30,16 +30,19 @@ func NewGetCenterTaskListLogic(ctx context.Context, svcCtx *svc.ServiceContext) func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskListResp, err error) { resp = &types.CenterTaskListResp{} var mu sync.RWMutex + ch := make(chan struct{}) adapterList, err := l.svcCtx.Scheduler.AiStorages.GetAdaptersByType("1") if err != nil { return nil, err } - l.updateAiTaskStatus(&mu, adapterList) + go l.updateAiTaskStatus(&mu, ch, adapterList) for _, adapter := range adapterList { + mu.RLock() taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id) + mu.RUnlock() if err != nil { continue } @@ -70,21 +73,23 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList resp.List = append(resp.List, t) } } + <-ch return resp, nil } -func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, list []*types.AdapterInfo) { +func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { var wg sync.WaitGroup for _, adapter := range list { - mu.RLock() taskList, err := l.svcCtx.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id) - mu.RUnlock() if err != nil { continue } for _, task := range taskList { t := task + if t.Status == constants.Completed { + continue + } wg.Add(1) go func() { trainingTask, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(l.ctx, t.JobId) @@ -107,4 +112,5 @@ func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, list []*ty } } wg.Wait() + ch <- struct{}{} } diff --git a/api/internal/scheduler/database/aiStorage.go b/api/internal/scheduler/database/aiStorage.go index 89dbbaa3..f25cd5c2 100644 --- 
a/api/internal/scheduler/database/aiStorage.go +++ b/api/internal/scheduler/database/aiStorage.go @@ -74,12 +74,13 @@ func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, e func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64) (int64, error) { // 构建主任务结构体 taskModel := models.Task{ - Status: constants.Saved, - Description: "ai task", - Name: name, - SynergyStatus: synergyStatus, - Strategy: strategyCode, - CommitTime: time.Now(), + Status: constants.Saved, + Description: "ai task", + Name: name, + SynergyStatus: synergyStatus, + Strategy: strategyCode, + AdapterTypeDict: 1, + CommitTime: time.Now(), } // 保存任务数据到数据库 tx := s.DbEngin.Create(&taskModel) @@ -199,6 +200,14 @@ func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, c return nil } +func (s *AiStorage) UpdateClusterResources(clusterResource *models.TClusterResource) error { + tx := s.DbEngin.Updates(clusterResource) + if tx.Error != nil { + return tx.Error + } + return nil +} + func (s *AiStorage) UpdateAiTask(task *models.TaskAi) error { tx := s.DbEngin.Updates(task) if tx.Error != nil { diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 01406901..96ddc815 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -22,6 +22,7 @@ type ResourceStats struct { DiskAvail float64 DiskTotal float64 GpuAvail int64 + GpuTotal int64 CardsAvail []*Card CpuCoreHours float64 Balance float64 From 868419d47aece12163ae9df9e85090f7be39ea97 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Thu, 9 May 2024 14:46:32 +0800 Subject: [PATCH 15/40] bug fix Former-commit-id: 96c17e81c08d9306ae3dc21d2255d7afb8a262aa --- api/internal/logic/core/commithpctasklogic.go | 65 ------------------- api/internal/logic/hpc/commithpctasklogic.go | 16 +++-- 2 files changed, 11 insertions(+), 70 deletions(-) 
delete mode 100644 api/internal/logic/core/commithpctasklogic.go diff --git a/api/internal/logic/core/commithpctasklogic.go b/api/internal/logic/core/commithpctasklogic.go deleted file mode 100644 index d676a7c9..00000000 --- a/api/internal/logic/core/commithpctasklogic.go +++ /dev/null @@ -1,65 +0,0 @@ -package core - -import ( - "context" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response" - "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" - "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" - tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" - "k8s.io/apimachinery/pkg/util/json" - "time" - - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" - - "github.com/zeromicro/go-zero/core/logx" -) - -type CommitHpcTaskLogic struct { - logx.Logger - ctx context.Context - svcCtx *svc.ServiceContext -} - -func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic { - return &CommitHpcTaskLogic{ - Logger: logx.WithContext(ctx), - ctx: ctx, - svcCtx: svcCtx, - } -} - -func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) { - // 构建主任务结构体 - taskModel := models.Task{ - Status: constants.Saved, - Description: req.Description, - Name: req.Name, - CommitTime: time.Now(), - } - // 保存任务数据到数据库 - tx := l.svcCtx.DbEngin.Create(&taskModel) - if tx.Error != nil { - return nil, tx.Error - } - hpc := models.Hpc{} - tool.Convert(req, &hpc) - mqInfo := response.TaskInfo{ - TaskId: taskModel.Id, - TaskType: "hpc", - MatchLabels: req.MatchLabels, - //Metadata: hpc, - } - req.TaskId = taskModel.Id - // 将任务数据转换成消息体 - reqMessage, err := json.Marshal(mqInfo) - if err != nil { - logx.Error(err) - return nil, err - } - publish := l.svcCtx.RedisClient.Publish(context.Background(), mqInfo.TaskType, reqMessage) - if publish.Err() != nil { - return nil, publish.Err() - } - return -} diff --git 
a/api/internal/logic/hpc/commithpctasklogic.go b/api/internal/logic/hpc/commithpctasklogic.go index 6211a13d..636400df 100644 --- a/api/internal/logic/hpc/commithpctasklogic.go +++ b/api/internal/logic/hpc/commithpctasklogic.go @@ -32,11 +32,15 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t // 构建主任务结构体 taskModel := models.Task{ - Status: constants.Saved, - Description: req.Description, - Name: req.Name, - CommitTime: time.Now(), + Name: req.Name, + Description: req.Description, + Status: constants.Saved, + Strategy: 0, + SynergyStatus: 0, + CommitTime: time.Now(), + AdapterTypeDict: 2, } + // 保存任务数据到数据库 tx := l.svcCtx.DbEngin.Create(&taskModel) if tx.Error != nil { @@ -49,7 +53,9 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t env, _ := json.Marshal(req.Environment) if len(clusterIds) == 0 || clusterIds == nil { - return nil, nil + resp.Code = 400 + resp.Msg = "no cluster found" + return resp, nil } hpcInfo := models.TaskHpc{ From e6f213bd8e415e6f598680abefbd6627236cb660 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Thu, 9 May 2024 15:09:29 +0800 Subject: [PATCH 16/40] =?UTF-8?q?fix=EF=BC=9Apublic=20interface?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: f214352aa45024203c71d58622fbc292dc682fdd --- api/desc/core/pcm-core.api | 46 +++++++++++++++++++ api/desc/pcm.api | 13 +++++- .../handler/core/getpublicflavorhandler.go | 28 +++++++++++ .../handler/core/getpublicimagehandler.go | 28 +++++++++++ .../handler/core/getpublicnetworkhandler.go | 28 +++++++++++ api/internal/handler/routes.go | 32 ++++++------- .../logic/core/getpublicflavorlogic.go | 40 ++++++++++++++++ .../logic/core/getpublicimagelogic.go | 40 ++++++++++++++++ .../logic/core/getpublicnetworklogic.go | 39 ++++++++++++++++ api/internal/types/types.go | 42 +++++++++++++++++ 10 files changed, 319 insertions(+), 17 deletions(-) create mode 100644 
api/internal/handler/core/getpublicflavorhandler.go create mode 100644 api/internal/handler/core/getpublicimagehandler.go create mode 100644 api/internal/handler/core/getpublicnetworkhandler.go create mode 100644 api/internal/logic/core/getpublicflavorlogic.go create mode 100644 api/internal/logic/core/getpublicimagelogic.go create mode 100644 api/internal/logic/core/getpublicnetworklogic.go diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index 6945ac8d..4e4d91f4 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -49,6 +49,52 @@ type ( } ) +type ( + PublicImageReq { + + } + PublicImageResp { + Code int `json:"code"` + Message string `json:"message"` + ImageDict []ImageDict `json:"imageRDict"` + } + ImageDict { + Id int `json:"id"` + PublicImageName string `json:"public_image_name"` + } +) + +type ( + PublicFlavorReq { + + } + PublicFlavorResp { + Code int `json:"code"` + Message string `json:"message"` + FlavorDict []FlavorDict `json:"flavorDict"` + } + FlavorDict { + Id int `json:"id"` + PublicFlavorName string `json:"public_flavor_name"` + } +) + +type ( + PublicNetworkReq { + + } + PublicNetworkResp { + Code int `json:"code"` + Message string `json:"message"` + NetworkDict []NetworkDict `json:"networkDict"` + } + NetworkDict { + Id int `json:"id"` + PublicImageName string `json:"public_image_name"` + } + +) + type remoteResp { Code int `json:"code"` Message string `json:"message"` diff --git a/api/desc/pcm.api b/api/desc/pcm.api index f62e77b4..550958ca 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -137,11 +137,22 @@ service pcm { @doc "Statistical task status" @handler countTaskStatus get /core/task/countTaskStatus () returns (TaskStatusResp) - get /core/task/countTaskStatus () returns(TaskStatusResp) @doc "Home Page Overview" @handler homeOverviewHandler get /core/homeOverview (HomeOverviewReq) returns (HomeOverviewResp) + + @doc "Get Public Image" + @handler getPublicImageHandler + get 
/core/getPublicImage (PublicImageReq) returns (PublicImageResp) + + @doc "Get Public Flavor" + @handler getPublicFlavorHandler + get /core/getPublicFlavor (PublicFlavorReq) returns (PublicFlavorResp) + + @doc "Get Public Network" + @handler getPublicNetworkHandler + get /core/getPublicNetwork (PublicNetworkReq) returns (PublicNetworkResp) } //hpc二级接口 diff --git a/api/internal/handler/core/getpublicflavorhandler.go b/api/internal/handler/core/getpublicflavorhandler.go new file mode 100644 index 00000000..0fc1942a --- /dev/null +++ b/api/internal/handler/core/getpublicflavorhandler.go @@ -0,0 +1,28 @@ +package core + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func GetPublicFlavorHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.PublicFlavorReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := core.NewGetPublicFlavorLogic(r.Context(), svcCtx) + resp, err := l.GetPublicFlavor(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/handler/core/getpublicimagehandler.go b/api/internal/handler/core/getpublicimagehandler.go new file mode 100644 index 00000000..de2b1033 --- /dev/null +++ b/api/internal/handler/core/getpublicimagehandler.go @@ -0,0 +1,28 @@ +package core + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func GetPublicImageHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return 
func(w http.ResponseWriter, r *http.Request) { + var req types.PublicImageReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := core.NewGetPublicImageLogic(r.Context(), svcCtx) + resp, err := l.GetPublicImage(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/handler/core/getpublicnetworkhandler.go b/api/internal/handler/core/getpublicnetworkhandler.go new file mode 100644 index 00000000..7450740d --- /dev/null +++ b/api/internal/handler/core/getpublicnetworkhandler.go @@ -0,0 +1,28 @@ +package core + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/core" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func GetPublicNetworkHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.PublicNetworkReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := core.NewGetPublicNetworkLogic(r.Context(), svcCtx) + resp, err := l.GetPublicNetwork(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 875ff648..923bdf9e 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -170,6 +170,21 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/core/homeOverview", Handler: core.HomeOverviewHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/core/getPublicImage", + Handler: core.GetPublicImageHandler(serverCtx), + }, + { + Method: http.MethodGet, + Path: "/core/getPublicFlavor", + Handler: 
core.GetPublicFlavorHandler(serverCtx), + }, + { + Method: http.MethodGet, + Path: "/core/getPublicNetwork", + Handler: core.GetPublicNetworkHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) @@ -1150,11 +1165,6 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset", Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), }, - { - Method: http.MethodGet, - Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum", - Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx), - }, { Method: http.MethodPost, Path: "/schedule/submit", @@ -1254,19 +1264,9 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodPost, - Path: "/monitoring/syncClusterAlert", + Path: "/core/syncClusterAlert", Handler: monitoring.SyncClusterAlertHandler(serverCtx), }, - { - Method: http.MethodGet, - Path: "/monitoring/task/num", - Handler: monitoring.TaskNumHandler(serverCtx), - }, - { - Method: http.MethodGet, - Path: "/monitoring/adapter/info", - Handler: monitoring.AdapterInfoHandler(serverCtx), - }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/logic/core/getpublicflavorlogic.go b/api/internal/logic/core/getpublicflavorlogic.go new file mode 100644 index 00000000..14d4be55 --- /dev/null +++ b/api/internal/logic/core/getpublicflavorlogic.go @@ -0,0 +1,40 @@ +package core + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetPublicFlavorLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetPublicFlavorLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetPublicFlavorLogic { + return &GetPublicFlavorLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l 
*GetPublicFlavorLogic) GetPublicFlavor(req *types.PublicFlavorReq) (resp *types.PublicFlavorResp, err error) { + // todo: add your logic here and delete this line + resp = &types.PublicFlavorResp{} + var flavorDict []types.FlavorDict + sqlStrTask := "SELECT * FROM `vm_flavor_dict`" + txTask := l.svcCtx.DbEngin.Raw(sqlStrTask).Scan(&flavorDict) + if txTask.Error != nil { + logx.Error(err) + return nil, txTask.Error + } + resp.Code = 200 + resp.Message = "success" + resp.FlavorDict = flavorDict + return resp, nil +} diff --git a/api/internal/logic/core/getpublicimagelogic.go b/api/internal/logic/core/getpublicimagelogic.go new file mode 100644 index 00000000..a3dafc16 --- /dev/null +++ b/api/internal/logic/core/getpublicimagelogic.go @@ -0,0 +1,40 @@ +package core + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetPublicImageLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetPublicImageLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetPublicImageLogic { + return &GetPublicImageLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *GetPublicImageLogic) GetPublicImage(req *types.PublicImageReq) (resp *types.PublicImageResp, err error) { + // todo: add your logic here and delete this line + resp = &types.PublicImageResp{} + var iamgeDict []types.ImageDict + sqlStrTask := "SELECT * FROM `vm_image_dict`" + txTask := l.svcCtx.DbEngin.Raw(sqlStrTask).Scan(&iamgeDict) + if txTask.Error != nil { + logx.Error(err) + return nil, txTask.Error + } + resp.Code = 200 + resp.Message = "success" + resp.ImageDict = iamgeDict + return resp, nil +} diff --git a/api/internal/logic/core/getpublicnetworklogic.go b/api/internal/logic/core/getpublicnetworklogic.go new file mode 100644 index 00000000..b0d38bb0 --- /dev/null +++ 
b/api/internal/logic/core/getpublicnetworklogic.go @@ -0,0 +1,39 @@ +package core + +import ( + "context" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetPublicNetworkLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetPublicNetworkLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetPublicNetworkLogic { + return &GetPublicNetworkLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *GetPublicNetworkLogic) GetPublicNetwork(req *types.PublicNetworkReq) (resp *types.PublicNetworkResp, err error) { + // todo: add your logic here and delete this line + resp = &types.PublicNetworkResp{} + var networkDict []types.NetworkDict + sqlStrTask := "SELECT * FROM `vm_network_dict`" + txTask := l.svcCtx.DbEngin.Raw(sqlStrTask).Scan(&networkDict) + if txTask.Error != nil { + logx.Error(err) + return nil, txTask.Error + } + resp.Code = 200 + resp.Message = "success" + resp.NetworkDict = networkDict + return resp, nil +} diff --git a/api/internal/types/types.go b/api/internal/types/types.go index ddc51b30..d4308452 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -41,6 +41,48 @@ type HomeOverviewData struct { TaskSum int64 `json:"taskSum"` } +type PublicImageReq struct { +} + +type PublicImageResp struct { + Code int `json:"code"` + Message string `json:"message"` + ImageDict []ImageDict `json:"imageRDict"` +} + +type ImageDict struct { + Id int `json:"id"` + PublicImageName string `json:"public_image_name"` +} + +type PublicFlavorReq struct { +} + +type PublicFlavorResp struct { + Code int `json:"code"` + Message string `json:"message"` + FlavorDict []FlavorDict `json:"flavorDict"` +} + +type FlavorDict struct { + Id int `json:"id"` + PublicFlavorName string `json:"public_flavor_name"` +} + +type PublicNetworkReq struct { +} + +type 
PublicNetworkResp struct { + Code int `json:"code"` + Message string `json:"message"` + NetworkDict []NetworkDict `json:"networkDict"` +} + +type NetworkDict struct { + Id int `json:"id"` + PublicImageName string `json:"public_image_name"` +} + type RemoteResp struct { Code int `json:"code"` Message string `json:"message"` From 0e155cb6573d1569a025c7bdcc247211beac7f42 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 9 May 2024 16:00:40 +0800 Subject: [PATCH 17/40] added algorithmscode api Former-commit-id: 60b3ed6bb4c24d801afeec3b4cee108788cd2bea --- api/desc/pcm.api | 6 ++++ api/desc/schedule/pcm-schedule.api | 29 +++++++++++++++++++ .../logic/ai/getcenteroverviewlogic.go | 10 +++++-- .../logic/ai/getcentertasklistlogic.go | 9 ++++-- .../scheduler/service/collector/collector.go | 4 +-- api/internal/storeLink/modelarts.go | 4 +-- api/internal/storeLink/octopus.go | 4 +-- api/internal/storeLink/shuguangai.go | 27 +++++++++++++++-- 8 files changed, 80 insertions(+), 13 deletions(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index a124833c..2515c1e1 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -954,6 +954,12 @@ service pcm { @handler ScheduleGetOverviewHandler post /schedule/getOverview returns (ScheduleOverviewResp) + + @handler DownloadAlgothmCodeHandler + get /schedule/getDownloadAlgothmCode (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp) + + @handler UploadAlgothmCodeHandler + post /schedule/getDownloadAlgothmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp) } @server( diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index e1892770..0611dc4d 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -100,4 +100,33 @@ type ( StartTime string `json:"startTime,omitempty" db:"start_time"` EndTime string `json:"endTime,omitempty" db:"end_time"` } + + DownloadAlgorithmCodeReq { + AdapterId string `form:"adapterId"` + ClusterId string `form:"clusterId"` + 
ResourceType string `form:"resourceType"` + Card string `form:"card"` + TaskType string `form:"taskType"` + Dataset string `form:"dataset"` + Algorithm string `form:"algorithm"` + Code string `form:"code"` + } + + DownloadAlgorithmCodeResp { + Code string `json:"algorithms"` + } + + UploadAlgorithmCodeReq { + AdapterId string `json:"adapterId"` + ClusterId string `json:"clusterId"` + ResourceType string `json:"resourceType"` + Card string `json:"card"` + TaskType string `json:"taskType"` + Dataset string `json:"dataset"` + Algorithm string `json:"algorithm"` + Code string `json:"code"` + } + + UploadAlgorithmCodeResp { + } ) \ No newline at end of file diff --git a/api/internal/logic/ai/getcenteroverviewlogic.go b/api/internal/logic/ai/getcenteroverviewlogic.go index 94d44cea..d2669709 100644 --- a/api/internal/logic/ai/getcenteroverviewlogic.go +++ b/api/internal/logic/ai/getcenteroverviewlogic.go @@ -8,6 +8,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "strconv" "sync" + "time" ) type GetCenterOverviewLogic struct { @@ -71,9 +72,14 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview } resp.CardNum = cardNum resp.PowerInTops = totalTops - <-ch - return resp, nil + select { + case _ = <-ch: + return resp, nil + case <-time.After(2 * time.Second): + return resp, nil + } + } func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { diff --git a/api/internal/logic/ai/getcentertasklistlogic.go b/api/internal/logic/ai/getcentertasklistlogic.go index ebca4dc4..ff1d2883 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -73,9 +73,14 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList resp.List = append(resp.List, t) } } - <-ch - return resp, nil + select { + case _ = <-ch: + return resp, nil + case <-time.After(1 * time.Second): + return resp, nil + } + } 
func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 96ddc815..453d710c 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -8,8 +8,8 @@ type AiCollector interface { GetAlgorithms(ctx context.Context) ([]*Algorithm, error) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) GetTrainingTask(ctx context.Context, taskId string) (*Task, error) - DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) - UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error + DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) + UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error } type ResourceStats struct { diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 1ae255f2..7bb6db2d 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -162,11 +162,11 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit return nil, nil } -func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { +func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { return "", nil } -func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType 
string, taskType string, dataset string, algorithm string, code string) error { +func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error { return nil } diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index f72f63ab..8e1f7af0 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -339,11 +339,11 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm return algorithms, nil } -func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { +func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { return "", nil } -func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error { +func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error { return nil } diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 4dbd8fcc..4f783357 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -447,11 +447,32 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, return algorithms, nil } -func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { - return "", nil +func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { + algoName := dataset + DASH + algorithm + req := 
&hpcAC.GetFileReq{ + Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH + TRAIN_FILE, + } + resp, err := s.aCRpc.GetFile(ctx, req) + if err != nil { + return "", err + } + + return resp.Content, nil } -func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error { +func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error { + algoName := dataset + DASH + algorithm + req := &hpcAC.UploadFileReq{ + Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH, + Cover: "cover", + File: code, + } + + _, err := s.aCRpc.UploadFile(ctx, req) + if err != nil { + return err + } + return nil } From 9ec3a048b0584b87528d906cf61b7b161de110ec Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 9 May 2024 16:11:29 +0800 Subject: [PATCH 18/40] gen algorithmcode proto Former-commit-id: 5efb7dd3acce89cd06ae014f4e84fc02cbf5a5a6 --- api/internal/handler/routes.go | 17 ++++++++++++++++- api/internal/types/types.go | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index bedd9c57..c558141a 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1190,6 +1190,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset", Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum", + Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx), + }, { Method: http.MethodPost, Path: "/schedule/submit", @@ -1200,6 +1205,16 @@ func RegisterHandlers(server *rest.Server, serverCtx 
*svc.ServiceContext) { Path: "/schedule/getOverview", Handler: schedule.ScheduleGetOverviewHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/schedule/getDownloadAlgothmCode", + Handler: schedule.DownloadAlgothmCodeHandler(serverCtx), + }, + { + Method: http.MethodPost, + Path: "/schedule/getDownloadAlgothmCode", + Handler: schedule.UploadAlgothmCodeHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) @@ -1294,7 +1309,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodPost, - Path: "/core/syncClusterAlert", + Path: "/monitoring/syncClusterAlert", Handler: monitoring.SyncClusterAlertHandler(serverCtx), }, { diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 0a905601..887ef866 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5676,6 +5676,35 @@ type AiTaskDb struct { EndTime string `json:"endTime,omitempty" db:"end_time"` } +type DownloadAlgorithmCodeReq struct { + AdapterId string `form:"adapterId"` + ClusterId string `form:"clusterId"` + ResourceType string `form:"resourceType"` + Card string `form:"card"` + TaskType string `form:"taskType"` + Dataset string `form:"dataset"` + Algorithm string `form:"algorithm"` + Code string `form:"code"` +} + +type DownloadAlgorithmCodeResp struct { + Code string `json:"algorithms"` +} + +type UploadAlgorithmCodeReq struct { + AdapterId string `json:"adapterId"` + ClusterId string `json:"clusterId"` + ResourceType string `json:"resourceType"` + Card string `json:"card"` + TaskType string `json:"taskType"` + Dataset string `json:"dataset"` + Algorithm string `json:"algorithm"` + Code string `json:"code"` +} + +type UploadAlgorithmCodeResp struct { +} + type CreateAlertRuleReq struct { CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` From 4849b9385325817258d6562a472ebd7d2b8680b2 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Thu, 9 May 2024 18:37:24 +0800 Subject: 
[PATCH 19/40] table name changed Former-commit-id: 5776bc02745e66782bf7f536b8a239a52095d78f --- api/internal/logic/core/pushtaskinfologic.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api/internal/logic/core/pushtaskinfologic.go b/api/internal/logic/core/pushtaskinfologic.go index 5d86bca2..664a8df9 100644 --- a/api/internal/logic/core/pushtaskinfologic.go +++ b/api/internal/logic/core/pushtaskinfologic.go @@ -51,7 +51,7 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie } case 1: for _, aiInfo := range req.AiInfoList { - l.svcCtx.DbEngin.Exec("update ai set status = ?,start_time = ?,project_id = ?,job_id = ? where participant_id = ? and task_id = ? and name = ?", + l.svcCtx.DbEngin.Exec("update task_ai set status = ?,start_time = ?,project_id = ?,job_id = ? where participant_id = ? and task_id = ? and name = ?", aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) } @@ -69,7 +69,7 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie func syncTask(gorm *gorm.DB, taskId int64) { var allStatus string - tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus) + tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join task_hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join task_ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus) if tx.Error != nil { logx.Error(tx.Error) } From fb957526e518cde762412ee83ef10538a1ebb322 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 9 May 2024 
18:54:25 +0800 Subject: [PATCH 20/40] added getComputeCards api Former-commit-id: 78b5fa73e9106a813df8d05e57c1e5dfd0a219a4 --- api/desc/pcm.api | 3 +++ api/desc/schedule/pcm-schedule.api | 9 +++++++++ api/internal/logic/ai/getcentertasklistlogic.go | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 8a3ac032..73c6577d 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -972,6 +972,9 @@ service pcm { @handler UploadAlgothmCodeHandler post /schedule/getDownloadAlgothmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp) + + @handler GetComputeCardsByClusterHandler + get /schedule/getComputeCardsByCluster/:adapterId/:clusterId (GetComputeCardsByClusterReq) returns (GetComputeCardsByClusterResp) } @server( diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index 0611dc4d..776851db 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -129,4 +129,13 @@ type ( UploadAlgorithmCodeResp { } + + GetComputeCardsByClusterReq { + AdapterId string `path:"adapterId"` + ClusterId string `path:"clusterId"` + } + + GetComputeCardsByClusterResp { + Cards []string `json:"cards"` + } ) \ No newline at end of file diff --git a/api/internal/logic/ai/getcentertasklistlogic.go b/api/internal/logic/ai/getcentertasklistlogic.go index ff1d2883..edf3d1b4 100644 --- a/api/internal/logic/ai/getcentertasklistlogic.go +++ b/api/internal/logic/ai/getcentertasklistlogic.go @@ -77,7 +77,7 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList select { case _ = <-ch: return resp, nil - case <-time.After(1 * time.Second): + case <-time.After(2 * time.Second): return resp, nil } From 8d8d6a9822ab12fc0cf9ff9e84faa8b2a18dd7e4 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Thu, 9 May 2024 19:05:16 +0800 Subject: [PATCH 21/40] push notice when new hpc job submitted Former-commit-id: 459845bff6477ce1a0b43cba3eac68aa2c7d1223 
--- api/internal/logic/hpc/commithpctasklogic.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/api/internal/logic/hpc/commithpctasklogic.go b/api/internal/logic/hpc/commithpctasklogic.go index 636400df..b763a068 100644 --- a/api/internal/logic/hpc/commithpctasklogic.go +++ b/api/internal/logic/hpc/commithpctasklogic.go @@ -2,10 +2,12 @@ package hpc import ( "context" + clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "k8s.io/apimachinery/pkg/util/json" "math/rand" + "strconv" "time" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" @@ -88,6 +90,21 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t if tx.Error != nil { return nil, tx.Error } + adapterId, _ := strconv.ParseUint(req.AdapterId, 10, 64) + adapterName := "" + tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: int64(adapterId), + AdapterName: adapterName, + NoticeType: "create", + TaskName: req.Name, + Incident: "任务创建中", + CreatedTime: time.Now(), + } + result := l.svcCtx.DbEngin.Table("t_notice").Create(&noticeInfo) + if result.Error != nil { + logx.Errorf("Task creation failure, err: %v", result.Error) + } // todo mq task manage //reqMessage, err := json.Marshal(mqInfo) //if err != nil { From 6bc14cfabd0e0e547b299a5f77a34c328e038892 Mon Sep 17 00:00:00 2001 From: tzwang Date: Thu, 9 May 2024 19:11:44 +0800 Subject: [PATCH 22/40] updated protobuf Former-commit-id: 6e3382beb8290c1c9602ca197c3781ce8a80a296 --- api/internal/handler/routes.go | 5 ++++ .../schedule/downloadalgothmcodehandler.go | 28 +++++++++++++++++++ .../getcomputecardsbyclusterhandler.go | 28 +++++++++++++++++++ .../schedule/uploadalgothmcodehandler.go | 28 +++++++++++++++++++ api/internal/types/types.go | 9 ++++++ 5 files changed, 98 insertions(+) create mode 100644 
api/internal/handler/schedule/downloadalgothmcodehandler.go create mode 100644 api/internal/handler/schedule/getcomputecardsbyclusterhandler.go create mode 100644 api/internal/handler/schedule/uploadalgothmcodehandler.go diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index c558141a..0a2e86a0 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1215,6 +1215,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/schedule/getDownloadAlgothmCode", Handler: schedule.UploadAlgothmCodeHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/schedule/getComputeCardsByCluster/:adapterId/:clusterId", + Handler: schedule.GetComputeCardsByClusterHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/handler/schedule/downloadalgothmcodehandler.go b/api/internal/handler/schedule/downloadalgothmcodehandler.go new file mode 100644 index 00000000..14207bba --- /dev/null +++ b/api/internal/handler/schedule/downloadalgothmcodehandler.go @@ -0,0 +1,28 @@ +package schedule + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func DownloadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.DownloadAlgorithmCodeReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := schedule.NewDownloadAlgothmCodeLogic(r.Context(), svcCtx) + resp, err := l.DownloadAlgothmCode(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/handler/schedule/getcomputecardsbyclusterhandler.go 
b/api/internal/handler/schedule/getcomputecardsbyclusterhandler.go new file mode 100644 index 00000000..2c4393e1 --- /dev/null +++ b/api/internal/handler/schedule/getcomputecardsbyclusterhandler.go @@ -0,0 +1,28 @@ +package schedule + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func GetComputeCardsByClusterHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.GetComputeCardsByClusterReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := schedule.NewGetComputeCardsByClusterLogic(r.Context(), svcCtx) + resp, err := l.GetComputeCardsByCluster(&req) + if err != nil { + httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/handler/schedule/uploadalgothmcodehandler.go b/api/internal/handler/schedule/uploadalgothmcodehandler.go new file mode 100644 index 00000000..bfbd05d1 --- /dev/null +++ b/api/internal/handler/schedule/uploadalgothmcodehandler.go @@ -0,0 +1,28 @@ +package schedule + +import ( + "net/http" + + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" +) + +func UploadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.UploadAlgorithmCodeReq + if err := httpx.Parse(r, &req); err != nil { + httpx.ErrorCtx(r.Context(), w, err) + return + } + + l := schedule.NewUploadAlgothmCodeLogic(r.Context(), svcCtx) + resp, err := l.UploadAlgothmCode(&req) + if err != nil { + 
httpx.ErrorCtx(r.Context(), w, err) + } else { + httpx.OkJsonCtx(r.Context(), w, resp) + } + } +} diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 887ef866..4520f873 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5705,6 +5705,15 @@ type UploadAlgorithmCodeReq struct { type UploadAlgorithmCodeResp struct { } +type GetComputeCardsByClusterReq struct { + AdapterId string `path:"adapterId"` + ClusterId string `path:"clusterId"` +} + +type GetComputeCardsByClusterResp struct { + Cards []string `json:"cards"` +} + type CreateAlertRuleReq struct { CLusterId string `json:"clusterId"` ClusterName string `json:"clusterName"` From eb9d633b17d2c578c2881d56a15d36535a9f961e Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Thu, 9 May 2024 19:33:22 +0800 Subject: [PATCH 23/40] algorithm logic code update Former-commit-id: 816a4270bcf745710acecec7854e1c9331f55556 --- .../schedule/downloadalgothmcodelogic.go | 30 +++++++++++++++++++ .../schedule/getcomputecardsbyclusterlogic.go | 30 +++++++++++++++++++ .../logic/schedule/uploadalgothmcodelogic.go | 30 +++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 api/internal/logic/schedule/downloadalgothmcodelogic.go create mode 100644 api/internal/logic/schedule/getcomputecardsbyclusterlogic.go create mode 100644 api/internal/logic/schedule/uploadalgothmcodelogic.go diff --git a/api/internal/logic/schedule/downloadalgothmcodelogic.go b/api/internal/logic/schedule/downloadalgothmcodelogic.go new file mode 100644 index 00000000..e39f7651 --- /dev/null +++ b/api/internal/logic/schedule/downloadalgothmcodelogic.go @@ -0,0 +1,30 @@ +package schedule + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type DownloadAlgothmCodeLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} 
+ +func NewDownloadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *DownloadAlgothmCodeLogic { + return &DownloadAlgothmCodeLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *DownloadAlgothmCodeLogic) DownloadAlgothmCode(req *types.DownloadAlgorithmCodeReq) (resp *types.DownloadAlgorithmCodeResp, err error) { + // todo: add your logic here and delete this line + + return +} diff --git a/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go b/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go new file mode 100644 index 00000000..772a5ce6 --- /dev/null +++ b/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go @@ -0,0 +1,30 @@ +package schedule + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + +type GetComputeCardsByClusterLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewGetComputeCardsByClusterLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetComputeCardsByClusterLogic { + return &GetComputeCardsByClusterLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *GetComputeCardsByClusterLogic) GetComputeCardsByCluster(req *types.GetComputeCardsByClusterReq) (resp *types.GetComputeCardsByClusterResp, err error) { + // todo: add your logic here and delete this line + + return +} diff --git a/api/internal/logic/schedule/uploadalgothmcodelogic.go b/api/internal/logic/schedule/uploadalgothmcodelogic.go new file mode 100644 index 00000000..a0771b04 --- /dev/null +++ b/api/internal/logic/schedule/uploadalgothmcodelogic.go @@ -0,0 +1,30 @@ +package schedule + +import ( + "context" + + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + + "github.com/zeromicro/go-zero/core/logx" +) + 
+type UploadAlgothmCodeLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext +} + +func NewUploadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *UploadAlgothmCodeLogic { + return &UploadAlgothmCodeLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + } +} + +func (l *UploadAlgothmCodeLogic) UploadAlgothmCode(req *types.UploadAlgorithmCodeReq) (resp *types.UploadAlgorithmCodeResp, err error) { + // todo: add your logic here and delete this line + + return +} From a10714ed57cbc42b8a0b7e775eddbefbbfbcbee8 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Fri, 10 May 2024 10:53:24 +0800 Subject: [PATCH 24/40] core task status sync Former-commit-id: 51279892412a0a6c0d835bc33d57313ba7ada0eb --- api/client/types.go | 14 ++ api/internal/logic/core/pushtaskinfologic.go | 42 +++-- api/internal/types/types.go | 178 ------------------- 3 files changed, 44 insertions(+), 190 deletions(-) diff --git a/api/client/types.go b/api/client/types.go index 940c88df..df8db874 100644 --- a/api/client/types.go +++ b/api/client/types.go @@ -5,6 +5,20 @@ import ( "time" ) +var HpcStatusMapping = map[string][]string{ + "Running": {"RUNNING", "RUNNING", "CONFIGURING", "COMPLETING"}, + "Succeeded": {"COMPLETED"}, + "Failed": {"FAILED", "TIMEOUT", "DEADLINE", "OUT_OF_MEMORY", "BOOT_FAIL", "CANCELLED"}, +} + +var AiStatusMapping = map[string]string{ + "PENDING": "Running", +} + +var CloudStatusMapping = map[string]string{ + "PENDING": "Running", +} + type PullTaskInfoReq struct { AdapterId int64 `form:"adapterId"` } diff --git a/api/internal/logic/core/pushtaskinfologic.go b/api/internal/logic/core/pushtaskinfologic.go index 664a8df9..d9efac1b 100644 --- a/api/internal/logic/core/pushtaskinfologic.go +++ b/api/internal/logic/core/pushtaskinfologic.go @@ -73,20 +73,34 @@ func syncTask(gorm *gorm.DB, taskId int64) { if tx.Error != nil { logx.Error(tx.Error) } - // 子状态统一则修改主任务状态 - statusArray := strings.Split(allStatus, ",") - 
if len(removeRepeatedElement(statusArray)) == 1 { - updateTask(gorm, taskId, statusArray[0]) - } - // 子任务包含失败状态 主任务则失败 - if strings.Contains(allStatus, constants.Failed) { - updateTask(gorm, taskId, constants.Failed) + for pcmStatus, hpcStatus := range clientCore.HpcStatusMapping { + for _, status := range hpcStatus { + // if Failed type status appears in subTask then update mainTask to Failed + if pcmStatus == "Failed" && strings.Contains(allStatus, status) { + updateTask(gorm, taskId, constants.Failed) + return + // no Failed type status in subTask,if Saved type status appears in subTask then update mainTask to Saved + } else if pcmStatus == "Saved" { + if strings.Contains(allStatus, status) { + updateTask(gorm, taskId, constants.Saved) + return + } + // no Failed and Saved type status in subTask,if Running type status appears in subTask then update mainTask to Running + } else if pcmStatus == "Running" { + if strings.Contains(allStatus, status) { + updateTask(gorm, taskId, constants.Running) + return + } + // at last, mainTask should be succeeded + } else { + if strings.Contains(allStatus, status) { + updateTask(gorm, taskId, constants.Succeeded) + return + } + } + } } - if strings.Contains(allStatus, constants.Running) { - updateTask(gorm, taskId, constants.Running) - } - } func updateTask(gorm *gorm.DB, taskId int64, status string) { @@ -98,8 +112,12 @@ func updateTask(gorm *gorm.DB, taskId int64, status string) { if status == constants.Running { task.StartTime = &now } + if task.Status == constants.Failed || task.Status == constants.Succeeded { + task.EndTime = &now + } gorm.Updates(&task) } + } func removeRepeatedElement(arr []string) (newArr []string) { diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 4520f873..f9693ad1 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -938,184 +938,6 @@ type ListResult struct { List interface{} `json:"list,omitempty"` } -type HpcInfo struct { - Id int64 `json:"id"` // 
id - TaskId int64 `json:"task_id"` // 任务id - JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) - AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `json:"cluster_id"` // 执行任务的集群id - ClusterType string `json:"cluster_type"` // 执行任务的集群类型 - Name string `json:"name"` // 名称 - Status string `json:"status"` // 状态 - CmdScript string `json:"cmd_script"` - StartTime string `json:"start_time"` // 开始时间 - RunningTime int64 `json:"running_time"` // 运行时间 - DerivedEs string `json:"derived_es"` - Cluster string `json:"cluster"` - BlockId int64 `json:"block_id"` - AllocNodes int64 `json:"alloc_nodes"` - AllocCpu int64 `json:"alloc_cpu"` - CardCount int64 `json:"card_count"` // 卡数 - Version string `json:"version"` - Account string `json:"account"` - WorkDir string `json:"work_dir"` // 工作路径 - AssocId int64 `json:"assoc_id"` - ExitCode int64 `json:"exit_code"` - WallTime string `json:"wall_time"` // 最大运行时间 - Result string `json:"result"` // 运行结果 - DeletedAt string `json:"deleted_at"` // 删除时间 - YamlString string `json:"yaml_string"` - AppType string `json:"app_type"` // 应用类型 - AppName string `json:"app_name"` // 应用名称 - Queue string `json:"queue"` // 队列名称 - SubmitType string `json:"submit_type"` // cmd(命令行模式) - NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") - StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j - StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j - StdInput string `json:"std_input"` - Environment string `json:"environment"` - DeletedFlag int64 `json:"deleted_flag"` // 是否删除(0-否,1-是) - CreatedBy int64 `json:"created_by"` // 创建人 - CreateTime string `json:"created_time"` // 创建时间 - UpdatedBy int64 `json:"updated_by"` // 更新人 - UpdateTime string `json:"updated_time"` // 更新时间 -} - -type CloudInfo struct { - Participant int64 `json:"participant,omitempty"` - Id int64 `json:"id,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ApiVersion string `json:"apiVersion,omitempty"` - Kind string `json:"kind,omitempty"` - Namespace 
string `json:"namespace,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - YamlString string `json:"yamlString,omitempty"` -} - -type AiInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - ProjectId string `json:"project_id,omitempty"` - Name string `json:"name,omitempty"` - Status string `json:"status,omitempty"` - StartTime string `json:"startTime,omitempty"` - RunningTime int64 `json:"runningTime,omitempty"` - Result string `json:"result,omitempty"` - JobId string `json:"jobId,omitempty"` - CreateTime string `json:"createTime,omitempty"` - ImageUrl string `json:"imageUrl,omitempty"` - Command string `json:"command,omitempty"` - FlavorId string `json:"flavorId,omitempty"` - SubscriptionId string `json:"subscriptionId,omitempty"` - ItemVersionId string `json:"itemVersionId,omitempty"` -} - -type VmInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` - TaskId int64 `json:"taskId,omitempty"` - Name string `json:"name,omitempty"` - FlavorRef string `json:"flavor_ref,omitempty"` - ImageRef string `json:"image_ref,omitempty"` - NetworkUuid string `json:"network_uuid,omitempty"` - BlockUuid string `json:"block_uuid,omitempty"` - SourceType string `json:"source_type,omitempty"` - DeleteOnTermination bool `json:"delete_on_termination,omitempty"` - Status string `json:"status,omitempty"` - MinCount string `json:"min_count,omitempty"` - Platform string `json:"platform,omitempty"` - Uuid string `json:"uuid,omitempty"` -} - -type PullTaskInfoReq struct { - AdapterId int64 `form:"adapterId"` -} - -type PullTaskInfoResp struct { - HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` - CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` - AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` - VmInfoList []*VmInfo 
`json:"VmInfoList,omitempty"` -} - -type PushTaskInfoReq struct { - AdapterId int64 `json:"adapterId"` - HpcInfoList []*HpcInfo `json:"hpcInfoList"` - CloudInfoList []*CloudInfo `json:"cloudInfoList"` - AiInfoList []*AiInfo `json:"aiInfoList"` - VmInfoList []*VmInfo `json:"vmInfoList"` -} - -type PushTaskInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type PushResourceInfoReq struct { - AdapterId int64 `json:"adapterId"` - ResourceStats []ResourceStats `json:"resourceStats"` -} - -type PushResourceInfoResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type NoticeInfo struct { - AdapterId int64 `json:"adapterId"` - AdapterName string `json:"adapterName"` - ClusterId int64 `json:"clusterId"` - ClusterName string `json:"clusterName"` - NoticeType string `json:"noticeType"` - TaskName string `json:"taskName"` - Incident string `json:"incident"` -} - -type ListNoticeReq struct { -} - -type ListNoticeResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` - Data []NoticeInfo `json:"data"` -} - -type PushNoticeReq struct { - NoticeInfo NoticeInfo `json:"noticeInfo"` -} - -type PushNoticeResp struct { - Code int64 `json:"code"` - Msg string `json:"msg"` -} - -type ResourceStats struct { - ClusterId int64 `json:"clusterId"` - Name string `json:"name"` - CpuCoreAvail int64 `json:"cpuCoreAvail"` - CpuCoreTotal int64 `json:"cpuCoreTotal"` - MemAvail float64 `json:"memAvail"` - MemTotal float64 `json:"memTotal"` - DiskAvail float64 `json:"diskAvail"` - DiskTotal float64 `json:"diskTotal"` - GpuAvail int64 `json:"gpuAvail"` - CardsAvail []*Card `json:"cardsAvail"` - CpuCoreHours float64 `json:"cpuCoreHours"` - Balance float64 `json:"balance"` -} - -type Card struct { - Platform string `json:"platform"` - Type string `json:"type"` - Name string `json:"name"` - TOpsAtFp16 float64 `json:"TOpsAtFp16"` - CardHours float64 `json:"cardHours"` - CardNum int32 `json:"cardNum"` -} - type TaskStatusResp struct { Succeeded int 
`json:"Succeeded"` Failed int `json:"Failed"` From 275e14a75849b8a70b8b1a16807cc338ecdc8641 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 10 May 2024 16:17:33 +0800 Subject: [PATCH 25/40] added updating aitask status to taskList Former-commit-id: 8908fee9e07f5d334a6a7fd440d1d62246efa972 --- api/internal/logic/core/tasklistlogic.go | 67 +++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/api/internal/logic/core/tasklistlogic.go b/api/internal/logic/core/tasklistlogic.go index e56be650..382db2e2 100644 --- a/api/internal/logic/core/tasklistlogic.go +++ b/api/internal/logic/core/tasklistlogic.go @@ -17,6 +17,7 @@ package core import ( "context" "fmt" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "strconv" "time" @@ -55,6 +56,11 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe if len(tasks) == 0 { return nil, nil } + + // 更新智算任务状态 + var ch = make(chan struct{}) + go l.updateAitaskStatus(tasks, ch) + // 查询任务总数 l.svcCtx.DbEngin.Model(&models.Task{}).Count(&resp.TotalCount) @@ -106,5 +112,64 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe } - return + select { + case _ = <-ch: + return resp, nil + case <-time.After(1 * time.Second): + return resp, nil + } +} + +func (l *TaskListLogic) updateAitaskStatus(tasks []models.Task, ch chan<- struct{}) { + for _, task := range tasks { + if task.AdapterTypeDict != 1 { + continue + } + if task.Status == constants.Succeeded { + continue + } + + var aiTask []*models.TaskAi + tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? 
", task.Id).Scan(&aiTask) + if tx.Error != nil { + logx.Errorf(tx.Error.Error()) + return + } + + start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local) + end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local) + var status = constants.Succeeded + for _, a := range aiTask { + s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local) + e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local) + + if s.Before(start) { + start = s + } + + if e.After(end) { + end = e + } + + if a.Status == constants.Failed { + status = a.Status + break + } + + if a.Status == constants.Running { + status = a.Status + continue + } + } + + task.Status = status + task.StartTime = &start + task.EndTime = &end + + tx = l.svcCtx.DbEngin.Updates(task) + if tx.Error != nil { + return + } + } + ch <- struct{}{} } From 3c4e18f8d470491b95ac1b20f3420a73f872f915 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 10 May 2024 17:20:52 +0800 Subject: [PATCH 26/40] updated api desc Former-commit-id: 1fcf13a7f363e7e200f560873e3faa62826bce8d --- api/desc/pcm.api | 4 ++-- api/desc/schedule/pcm-schedule.api | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 73c6577d..9eb14b17 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -968,10 +968,10 @@ service pcm { post /schedule/getOverview returns (ScheduleOverviewResp) @handler DownloadAlgothmCodeHandler - get /schedule/getDownloadAlgothmCode (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp) + get /schedule/downloadAlgorithmCodeReq (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp) @handler UploadAlgothmCodeHandler - post /schedule/getDownloadAlgothmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp) + post /schedule/uploadAlgorithmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp) @handler GetComputeCardsByClusterHandler get 
/schedule/getComputeCardsByCluster/:adapterId/:clusterId (GetComputeCardsByClusterReq) returns (GetComputeCardsByClusterResp) diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index 776851db..99574110 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -109,7 +109,6 @@ type ( TaskType string `form:"taskType"` Dataset string `form:"dataset"` Algorithm string `form:"algorithm"` - Code string `form:"code"` } DownloadAlgorithmCodeResp { From c5def7491fe7b79b116f6ea3ff5c1c0b5405fdbd Mon Sep 17 00:00:00 2001 From: jagger Date: Fri, 10 May 2024 17:28:02 +0800 Subject: [PATCH 27/40] fix Signed-off-by: jagger Former-commit-id: 5e50a3782fbd4227fc7d107f34f2a7cc570b3f7a --- api/desc/core/pcm-core.api | 12 ++ api/desc/pcm.api | 3 + api/internal/handler/cloud/podlogshandler.go | 24 +++ api/internal/handler/routes.go | 5 + .../logic/cloud/commitgeneraltasklogic.go | 5 +- api/internal/logic/cloud/podlogslogic.go | 30 +++ api/internal/logic/core/pagelisttasklogic.go | 15 +- api/internal/types/types.go | 189 ++++++++++++++++++ pkg/models/cloud/task_cloud.go | 1 + 9 files changed, 274 insertions(+), 10 deletions(-) create mode 100644 api/internal/handler/cloud/podlogshandler.go create mode 100644 api/internal/logic/cloud/podlogslogic.go diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index f433416e..ccc57605 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -166,6 +166,18 @@ type ( ReqBody []string `json:"reqBody"` Replicas int64 `json:"replicas,string"` } + + PodLogsReq { + TaskId string `json:"taskId"` + TaskName string `json:"taskName"` + ClusterId string `json:"clusterId"` + ClusterName string `json:"clusterName"` + AdapterId string `json:"adapterId"` + AdapterName string `json:"adapterName"` + PodName string `json:"podName"` + stream bool `json:"stream"` + + } ) type deleteTaskReq { diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 73c6577d..e6fd637d 
100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -231,6 +231,9 @@ service pcm { @doc "Create cloud computing common tasks" @handler commitGeneralTask post /cloud/task/create (GeneralTaskReq) returns () + + @handler podLogs + post /cloud/pod/logs (PodLogsReq) returns (string) } //智算二级接口 diff --git a/api/internal/handler/cloud/podlogshandler.go b/api/internal/handler/cloud/podlogshandler.go new file mode 100644 index 00000000..78824344 --- /dev/null +++ b/api/internal/handler/cloud/podlogshandler.go @@ -0,0 +1,24 @@ +package cloud + +import ( + "github.com/zeromicro/go-zero/rest/httpx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/cloud" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" + "net/http" +) + +func PodLogsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + var req types.PodLogsReq + if err := httpx.Parse(r, &req); err != nil { + result.ParamErrorResult(r, w, err) + return + } + + l := cloud.NewPodLogsLogic(r.Context(), svcCtx, w) + resp, err := l.PodLogs(&req, w) + result.HttpResult(r, w, resp, err) + } +} diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 0a2e86a0..918c8057 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -277,6 +277,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/cloud/task/create", Handler: cloud.CommitGeneralTaskHandler(serverCtx), }, + { + Method: http.MethodPost, + Path: "/cloud/pod/logs", + Handler: cloud.PodLogsHandler(serverCtx), + }, }, rest.WithPrefix("/pcm/v1"), ) diff --git a/api/internal/logic/cloud/commitgeneraltasklogic.go b/api/internal/logic/cloud/commitgeneraltasklogic.go index 0162b832..14240bc0 100644 --- a/api/internal/logic/cloud/commitgeneraltasklogic.go +++ 
b/api/internal/logic/cloud/commitgeneraltasklogic.go @@ -99,6 +99,8 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er Strategy: strategy, } var taskClouds []cloud.TaskCloudModel + adapterName := "" + tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName) for _, r := range rs { for _, s := range req.ReqBody { sStruct := UnMarshalK8sStruct(s, int64(r.Replica)) @@ -107,6 +109,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er taskCloud.TaskId = uint(taskModel.Id) clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64) taskCloud.AdapterId = uint(adapterId) + taskCloud.AdapterName = adapterName taskCloud.ClusterId = uint(clusterId) taskCloud.ClusterName = r.ClusterName taskCloud.Status = constants.Saved @@ -116,8 +119,6 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er taskClouds = append(taskClouds, taskCloud) } } - adapterName := "" - tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName) noticeInfo := clientCore.NoticeInfo{ AdapterId: int64(adapterId), AdapterName: adapterName, diff --git a/api/internal/logic/cloud/podlogslogic.go b/api/internal/logic/cloud/podlogslogic.go new file mode 100644 index 00000000..480c2d21 --- /dev/null +++ b/api/internal/logic/cloud/podlogslogic.go @@ -0,0 +1,30 @@ +package cloud + +import ( + "context" + "github.com/zeromicro/go-zero/core/logx" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "net/http" +) + +type PodLogsLogic struct { + logx.Logger + ctx context.Context + svcCtx *svc.ServiceContext + w http.ResponseWriter +} + +func NewPodLogsLogic(ctx context.Context, svcCtx *svc.ServiceContext, w http.ResponseWriter) *PodLogsLogic { + return &PodLogsLogic{ + Logger: logx.WithContext(ctx), + ctx: ctx, + svcCtx: svcCtx, + w: w, + } +} + +func (l *PodLogsLogic) PodLogs(req *types.PodLogsReq, w 
http.ResponseWriter) (resp string, err error) { + // todo: add your logic here and delete this line + return +} diff --git a/api/internal/logic/core/pagelisttasklogic.go b/api/internal/logic/core/pagelisttasklogic.go index ce8ae65d..be3baccd 100644 --- a/api/internal/logic/core/pagelisttasklogic.go +++ b/api/internal/logic/core/pagelisttasklogic.go @@ -2,13 +2,12 @@ package core import ( "context" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" + "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils" "time" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" - "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" - "github.com/zeromicro/go-zero/core/logx" ) @@ -51,15 +50,15 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa return nil, result.NewDefaultError(err.Error()) } for _, model := range list { - if model.EndTime != "" && model.StartTime != "" { + if model.StartTime != "" && model.EndTime == "" { + startTime := timeutils.TimeStringToGoTime(model.StartTime) + model.RunningTime = int64(time.Now().Sub(startTime).Seconds()) + } + if model.StartTime != "" && model.EndTime != "" { startTime := timeutils.TimeStringToGoTime(model.StartTime) endTime := timeutils.TimeStringToGoTime(model.EndTime) model.RunningTime = int64(endTime.Sub(startTime).Seconds()) } - if model.StartTime != "" { - startTime := timeutils.TimeStringToGoTime(model.StartTime) - model.RunningTime = int64(time.Now().Sub(startTime).Seconds()) - } } resp.List = &list resp.PageSize = req.PageSize diff --git a/api/internal/types/types.go b/api/internal/types/types.go index f9693ad1..2e9b99ac 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -150,6 +150,17 @@ type GeneralTaskReq struct { Replicas int64 `json:"replicas,string"` } +type PodLogsReq struct { + TaskId string 
`json:"taskId"` + TaskName string `json:"taskName"` + ClusterId string `json:"clusterId"` + ClusterName string `json:"clusterName"` + AdapterId string `json:"adapterId"` + AdapterName string `json:"adapterName"` + PodName string `json:"podName"` + Stream bool `json:"stream"` +} + type DeleteTaskReq struct { Id int64 `path:"id"` } @@ -938,6 +949,184 @@ type ListResult struct { List interface{} `json:"list,omitempty"` } +type HpcInfo struct { + Id int64 `json:"id"` // id + TaskId int64 `json:"task_id"` // 任务id + JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id + ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + ClusterType string `json:"cluster_type"` // 执行任务的集群类型 + Name string `json:"name"` // 名称 + Status string `json:"status"` // 状态 + CmdScript string `json:"cmd_script"` + StartTime string `json:"start_time"` // 开始时间 + RunningTime int64 `json:"running_time"` // 运行时间 + DerivedEs string `json:"derived_es"` + Cluster string `json:"cluster"` + BlockId int64 `json:"block_id"` + AllocNodes int64 `json:"alloc_nodes"` + AllocCpu int64 `json:"alloc_cpu"` + CardCount int64 `json:"card_count"` // 卡数 + Version string `json:"version"` + Account string `json:"account"` + WorkDir string `json:"work_dir"` // 工作路径 + AssocId int64 `json:"assoc_id"` + ExitCode int64 `json:"exit_code"` + WallTime string `json:"wall_time"` // 最大运行时间 + Result string `json:"result"` // 运行结果 + DeletedAt string `json:"deleted_at"` // 删除时间 + YamlString string `json:"yaml_string"` + AppType string `json:"app_type"` // 应用类型 + AppName string `json:"app_name"` // 应用名称 + Queue string `json:"queue"` // 队列名称 + SubmitType string `json:"submit_type"` // cmd(命令行模式) + NNode string `json:"n_node"` // 节点个数(当指定该参数时,GAP_NODE_STRING必须为"") + StdOutFile string `json:"std_out_file"` // 工作路径/std.err.%j + StdErrFile string `json:"std_err_file"` // 工作路径/std.err.%j + StdInput string `json:"std_input"` + Environment string `json:"environment"` + DeletedFlag int64 
`json:"deleted_flag"` // 是否删除(0-否,1-是) + CreatedBy int64 `json:"created_by"` // 创建人 + CreateTime string `json:"created_time"` // 创建时间 + UpdatedBy int64 `json:"updated_by"` // 更新人 + UpdateTime string `json:"updated_time"` // 更新时间 +} + +type CloudInfo struct { + Participant int64 `json:"participant,omitempty"` + Id int64 `json:"id,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ApiVersion string `json:"apiVersion,omitempty"` + Kind string `json:"kind,omitempty"` + Namespace string `json:"namespace,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + YamlString string `json:"yamlString,omitempty"` +} + +type AiInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + ProjectId string `json:"project_id,omitempty"` + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + StartTime string `json:"startTime,omitempty"` + RunningTime int64 `json:"runningTime,omitempty"` + Result string `json:"result,omitempty"` + JobId string `json:"jobId,omitempty"` + CreateTime string `json:"createTime,omitempty"` + ImageUrl string `json:"imageUrl,omitempty"` + Command string `json:"command,omitempty"` + FlavorId string `json:"flavorId,omitempty"` + SubscriptionId string `json:"subscriptionId,omitempty"` + ItemVersionId string `json:"itemVersionId,omitempty"` +} + +type VmInfo struct { + ParticipantId int64 `json:"participantId,omitempty"` + TaskId int64 `json:"taskId,omitempty"` + Name string `json:"name,omitempty"` + FlavorRef string `json:"flavor_ref,omitempty"` + ImageRef string `json:"image_ref,omitempty"` + NetworkUuid string `json:"network_uuid,omitempty"` + BlockUuid string `json:"block_uuid,omitempty"` + SourceType string `json:"source_type,omitempty"` + DeleteOnTermination bool 
`json:"delete_on_termination,omitempty"` + Status string `json:"status,omitempty"` + MinCount string `json:"min_count,omitempty"` + Platform string `json:"platform,omitempty"` + Uuid string `json:"uuid,omitempty"` +} + +type PullTaskInfoReq struct { + AdapterId int64 `form:"adapterId"` +} + +type PullTaskInfoResp struct { + HpcInfoList []*HpcInfo `json:"HpcInfoList,omitempty"` + CloudInfoList []*CloudInfo `json:"CloudInfoList,omitempty"` + AiInfoList []*AiInfo `json:"AiInfoList,omitempty"` + VmInfoList []*VmInfo `json:"VmInfoList,omitempty"` +} + +type PushTaskInfoReq struct { + AdapterId int64 `json:"adapterId"` + HpcInfoList []*HpcInfo `json:"hpcInfoList"` + CloudInfoList []*CloudInfo `json:"cloudInfoList"` + AiInfoList []*AiInfo `json:"aiInfoList"` + VmInfoList []*VmInfo `json:"vmInfoList"` +} + +type PushTaskInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type PushResourceInfoReq struct { + AdapterId int64 `json:"adapterId"` + ResourceStats []ResourceStats `json:"resourceStats"` +} + +type PushResourceInfoResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type NoticeInfo struct { + AdapterId int64 `json:"adapterId"` + AdapterName string `json:"adapterName"` + ClusterId int64 `json:"clusterId"` + ClusterName string `json:"clusterName"` + NoticeType string `json:"noticeType"` + TaskName string `json:"taskName"` + Incident string `json:"incident"` +} + +type ListNoticeReq struct { +} + +type ListNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` + Data []NoticeInfo `json:"data"` +} + +type PushNoticeReq struct { + NoticeInfo NoticeInfo `json:"noticeInfo"` +} + +type PushNoticeResp struct { + Code int64 `json:"code"` + Msg string `json:"msg"` +} + +type ResourceStats struct { + ClusterId int64 `json:"clusterId"` + Name string `json:"name"` + CpuCoreAvail int64 `json:"cpuCoreAvail"` + CpuCoreTotal int64 `json:"cpuCoreTotal"` + MemAvail float64 `json:"memAvail"` + MemTotal float64 
`json:"memTotal"` + DiskAvail float64 `json:"diskAvail"` + DiskTotal float64 `json:"diskTotal"` + GpuAvail int64 `json:"gpuAvail"` + CardsAvail []*Card `json:"cardsAvail"` + CpuCoreHours float64 `json:"cpuCoreHours"` + Balance float64 `json:"balance"` +} + +type Card struct { + Platform string `json:"platform"` + Type string `json:"type"` + Name string `json:"name"` + TOpsAtFp16 float64 `json:"TOpsAtFp16"` + CardHours float64 `json:"cardHours"` + CardNum int32 `json:"cardNum"` +} + type TaskStatusResp struct { Succeeded int `json:"Succeeded"` Failed int `json:"Failed"` diff --git a/pkg/models/cloud/task_cloud.go b/pkg/models/cloud/task_cloud.go index d60c236e..90949145 100644 --- a/pkg/models/cloud/task_cloud.go +++ b/pkg/models/cloud/task_cloud.go @@ -8,6 +8,7 @@ import ( type TaskCloudModel struct { Id uint `json:"id" gorm:"primarykey;not null;comment:id"` TaskId uint `json:"taskId" gorm:"not null;comment:task表id"` + AdapterName string `json:"adapterName" gorm:"not null;comment:适配器名称"` AdapterId uint `json:"adapterId" gorm:"not null;comment:适配器id"` ClusterId uint `json:"clusterId" gorm:"not null;comment:集群id"` ClusterName string `json:"clusterName" gorm:"not null;comment:集群名称"` From 3392fa01ac6eab46466110ffebe1b580e20a4c1a Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 10 May 2024 21:50:03 +0800 Subject: [PATCH 28/40] updated downloadalgorithmcode logic Former-commit-id: 75272c3ecdea08e482f3f78125ec3019a833095e --- api/desc/pcm.api | 2 +- api/desc/schedule/pcm-schedule.api | 2 +- api/internal/handler/routes.go | 4 +- .../schedule/downloadalgothmcodehandler.go | 11 ++--- .../schedule/uploadalgothmcodehandler.go | 11 ++--- .../schedule/downloadalgothmcodelogic.go | 12 ++++-- .../logic/schedule/uploadalgothmcodelogic.go | 11 +++-- api/internal/storeLink/octopus.go | 40 ++++++++++++++++++- api/internal/types/types.go | 3 +- go.mod | 2 +- go.sum | 4 +- 11 files changed, 72 insertions(+), 30 deletions(-) diff --git a/api/desc/pcm.api b/api/desc/pcm.api index 
31116667..03779d46 100644 --- a/api/desc/pcm.api +++ b/api/desc/pcm.api @@ -971,7 +971,7 @@ service pcm { post /schedule/getOverview returns (ScheduleOverviewResp) @handler DownloadAlgothmCodeHandler - get /schedule/downloadAlgorithmCodeReq (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp) + get /schedule/downloadAlgorithmCode (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp) @handler UploadAlgothmCodeHandler post /schedule/uploadAlgorithmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp) diff --git a/api/desc/schedule/pcm-schedule.api b/api/desc/schedule/pcm-schedule.api index 99574110..2fd6e4f8 100644 --- a/api/desc/schedule/pcm-schedule.api +++ b/api/desc/schedule/pcm-schedule.api @@ -112,7 +112,7 @@ type ( } DownloadAlgorithmCodeResp { - Code string `json:"algorithms"` + Code string `json:"code"` } UploadAlgorithmCodeReq { diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index 918c8057..d63d02e1 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1212,12 +1212,12 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodGet, - Path: "/schedule/getDownloadAlgothmCode", + Path: "/schedule/downloadAlgorithmCode", Handler: schedule.DownloadAlgothmCodeHandler(serverCtx), }, { Method: http.MethodPost, - Path: "/schedule/getDownloadAlgothmCode", + Path: "/schedule/uploadAlgorithmCode", Handler: schedule.UploadAlgothmCodeHandler(serverCtx), }, { diff --git a/api/internal/handler/schedule/downloadalgothmcodehandler.go b/api/internal/handler/schedule/downloadalgothmcodehandler.go index 14207bba..e70d85ea 100644 --- a/api/internal/handler/schedule/downloadalgothmcodehandler.go +++ b/api/internal/handler/schedule/downloadalgothmcodehandler.go @@ -7,22 +7,19 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" 
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" ) func DownloadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { var req types.DownloadAlgorithmCodeReq if err := httpx.Parse(r, &req); err != nil { - httpx.ErrorCtx(r.Context(), w, err) + result.ParamErrorResult(r, w, err) return } l := schedule.NewDownloadAlgothmCodeLogic(r.Context(), svcCtx) - resp, err := l.DownloadAlgothmCode(&req) - if err != nil { - httpx.ErrorCtx(r.Context(), w, err) - } else { - httpx.OkJsonCtx(r.Context(), w, resp) - } + resp, err := l.DownloadAlgorithmCode(&req) + result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/handler/schedule/uploadalgothmcodehandler.go b/api/internal/handler/schedule/uploadalgothmcodehandler.go index bfbd05d1..681715e2 100644 --- a/api/internal/handler/schedule/uploadalgothmcodehandler.go +++ b/api/internal/handler/schedule/uploadalgothmcodehandler.go @@ -7,22 +7,19 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" ) func UploadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { var req types.UploadAlgorithmCodeReq if err := httpx.Parse(r, &req); err != nil { - httpx.ErrorCtx(r.Context(), w, err) + result.ParamErrorResult(r, w, err) return } l := schedule.NewUploadAlgothmCodeLogic(r.Context(), svcCtx) - resp, err := l.UploadAlgothmCode(&req) - if err != nil { - httpx.ErrorCtx(r.Context(), w, err) - } else { - httpx.OkJsonCtx(r.Context(), w, resp) - } + resp, err := l.UploadAlgorithmCode(&req) + result.HttpResult(r, w, resp, err) } } diff --git a/api/internal/logic/schedule/downloadalgothmcodelogic.go 
b/api/internal/logic/schedule/downloadalgothmcodelogic.go index e39f7651..81b96579 100644 --- a/api/internal/logic/schedule/downloadalgothmcodelogic.go +++ b/api/internal/logic/schedule/downloadalgothmcodelogic.go @@ -23,8 +23,14 @@ func NewDownloadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext } } -func (l *DownloadAlgothmCodeLogic) DownloadAlgothmCode(req *types.DownloadAlgorithmCodeReq) (resp *types.DownloadAlgorithmCodeResp, err error) { - // todo: add your logic here and delete this line +func (l *DownloadAlgothmCodeLogic) DownloadAlgorithmCode(req *types.DownloadAlgorithmCodeReq) (resp *types.DownloadAlgorithmCodeResp, err error) { + resp = &types.DownloadAlgorithmCodeResp{} + code, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].DownloadAlgorithmCode(l.ctx, + req.ResourceType, req.Card, req.TaskType, req.Dataset, req.Algorithm) + if err != nil { + return nil, err + } + resp.Code = code - return + return resp, nil } diff --git a/api/internal/logic/schedule/uploadalgothmcodelogic.go b/api/internal/logic/schedule/uploadalgothmcodelogic.go index a0771b04..052412c4 100644 --- a/api/internal/logic/schedule/uploadalgothmcodelogic.go +++ b/api/internal/logic/schedule/uploadalgothmcodelogic.go @@ -23,8 +23,13 @@ func NewUploadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext) } } -func (l *UploadAlgothmCodeLogic) UploadAlgothmCode(req *types.UploadAlgorithmCodeReq) (resp *types.UploadAlgorithmCodeResp, err error) { - // todo: add your logic here and delete this line +func (l *UploadAlgothmCodeLogic) UploadAlgorithmCode(req *types.UploadAlgorithmCodeReq) (resp *types.UploadAlgorithmCodeResp, err error) { + resp = &types.UploadAlgorithmCodeResp{} + err = l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].UploadAlgorithmCode(l.ctx, + req.ResourceType, req.Card, req.TaskType, req.Dataset, req.Algorithm, req.Code) + if err != nil { + return nil, err + } - return + return 
resp, nil } diff --git a/api/internal/storeLink/octopus.go b/api/internal/storeLink/octopus.go index 8e1f7af0..a088e56a 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -54,6 +54,7 @@ const ( CAMBRICON = "cambricon" TRAIN_CMD = "cd /code; python train.py" VERSION = "V1" + DOMAIN = "http://192.168.242.41:8001/" ) var ( @@ -340,7 +341,44 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm } func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { - return "", nil + dcReq := &octopus.DownloadCompressReq{ + Platform: o.platform, + Version: VERSION, + AlgorithmId: "", + } + dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq) + if err != nil { + return "", err + } + + if !dcResp.Success { + return "", errors.New(dcResp.Error.Message) + } + + daReq := &octopus.DownloadAlgorithmReq{ + Platform: o.platform, + Version: VERSION, + AlgorithmId: "", + CompressAt: dcResp.Payload.CompressAt, + Domain: DOMAIN, + } + daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq) + if err != nil { + return "", err + } + if !daResp.Success { + return "", errors.New(dcResp.Error.Message) + } + urlReq := &octopus.AlgorithmUrlReq{ + Platform: o.platform, + Url: daResp.Payload.DownloadUrl, + } + urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq) + if err != nil { + return "", err + } + + return urlResp.Algorithm, nil } func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error { diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 2e9b99ac..eef38e23 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -5695,11 +5695,10 @@ type DownloadAlgorithmCodeReq struct { TaskType string `form:"taskType"` Dataset string `form:"dataset"` Algorithm string 
`form:"algorithm"` - Code string `form:"code"` } type DownloadAlgorithmCodeResp struct { - Code string `json:"algorithms"` + Code string `json:"code"` } type UploadAlgorithmCodeReq struct { diff --git a/go.mod b/go.mod index fb46a881..bf50e2ee 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,7 @@ require ( github.com/zeromicro/go-zero v1.6.3 gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c - gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 + gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d diff --git a/go.sum b/go.sum index c8224c75..e1d1ec5b 100644 --- a/go.sum +++ b/go.sum @@ -1082,8 +1082,8 @@ gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece h1:W3yBnvAVV gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8= -gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0= +gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openstack 
v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8= gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0= From 0fbbf4c454d4f1632c63385dba7c67b37d6d7ee9 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Fri, 10 May 2024 22:07:06 +0800 Subject: [PATCH 29/40] send notice in core && adapter and cluster info saved in subtask Former-commit-id: 6d85d7373c62b604c84956d0b82d51670b39b045 --- api/client/types.go | 38 ++--- api/internal/logic/core/pushtaskinfologic.go | 140 +++++++++++++----- api/internal/logic/hpc/commithpctasklogic.go | 19 ++- .../scheduler/schedulers/aiScheduler.go | 8 +- pkg/models/aimodel_gen.go | 29 ++-- pkg/models/cloud/task_cloud.go | 1 + pkg/models/taskhpcmodel_gen.go | 15 +- pkg/models/taskvmmodel_gen.go | 6 +- 8 files changed, 173 insertions(+), 83 deletions(-) diff --git a/api/client/types.go b/api/client/types.go index df8db874..32e22c1b 100644 --- a/api/client/types.go +++ b/api/client/types.go @@ -5,20 +5,12 @@ import ( "time" ) -var HpcStatusMapping = map[string][]string{ +var StatusMapping = map[string][]string{ "Running": {"RUNNING", "RUNNING", "CONFIGURING", "COMPLETING"}, "Succeeded": {"COMPLETED"}, "Failed": {"FAILED", "TIMEOUT", "DEADLINE", "OUT_OF_MEMORY", "BOOT_FAIL", "CANCELLED"}, } -var AiStatusMapping = map[string]string{ - "PENDING": "Running", -} - -var CloudStatusMapping = map[string]string{ - "PENDING": "Running", -} - type PullTaskInfoReq struct { AdapterId int64 `form:"adapterId"` } @@ -58,6 +50,7 @@ type NoticeInfo struct { ClusterId int64 `json:"clusterId"` ClusterName string `json:"clusterName"` NoticeType string `json:"noticeType"` + TaskId int64 `json:"taskId"` TaskName string `json:"taskName"` Incident string `json:"incident"` CreatedTime time.Time `json:"createdTime"` @@ -82,11 +75,13 @@ 
type PushNoticeResp struct { } type HpcInfo struct { - Id int64 `json:"id"` // id - TaskId int64 `json:"task_id"` // 任务id - JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) - AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + Id int64 `json:"id"` // id + TaskId int64 `json:"task_id"` // 任务id + JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id + AdapterName string `json:"adapterName,omitempty,optional"` + ClusterId int64 `json:"cluster_id"` // 执行任务的集群id + ClusterName string `json:"clusterName,omitempty,optional"` ClusterType string `json:"cluster_type"` // 执行任务的集群类型 Name string `json:"name"` // 名称 Status string `json:"status"` // 状态 @@ -127,8 +122,9 @@ type HpcInfo struct { type CloudInfo struct { Id uint `json:"id,omitempty,optional"` TaskId int64 `json:"taskId,omitempty,optional"` - AdapterId uint `json:"adapterId,omitempty,optional"` - ClusterId uint `json:"clusterId,omitempty,optional"` + AdapterId int64 `json:"adapterId,omitempty,optional"` + AdapterName string `json:"adapterName,omitempty,optional"` + ClusterId int64 `json:"clusterId,omitempty,optional"` ClusterName string `json:"clusterName,omitempty,optional"` Kind string `json:"kind,omitempty,optional"` Status string `json:"status,omitempty,optional"` @@ -139,9 +135,12 @@ type CloudInfo struct { } type AiInfo struct { - ParticipantId int64 `json:"participantId,omitempty"` TaskId int64 `json:"taskId,omitempty"` ProjectId string `json:"project_id,omitempty"` + AdapterId int64 `json:"adapterId,omitempty,optional"` + AdapterName string `json:"adapterName,omitempty,optional"` + ClusterId int64 `json:"clusterId,omitempty,optional"` + ClusterName string `json:"clusterName,omitempty,optional"` Name string `json:"name,omitempty"` Status string `json:"status,omitempty"` StartTime string `json:"startTime,omitempty"` @@ -157,9 +156,12 @@ type AiInfo struct { } type VmInfo struct { - ParticipantId int64 
`json:"participantId,omitempty"` TaskId int64 `json:"taskId,omitempty"` Name string `json:"name,omitempty"` + AdapterId int64 `json:"adapterId,omitempty,optional"` + AdapterName string `json:"adapterName,omitempty,optional"` + ClusterId int64 `json:"clusterId,omitempty,optional"` + ClusterName string `json:"clusterName,omitempty,optional"` FlavorRef string `json:"flavor_ref,omitempty"` ImageRef string `json:"image_ref,omitempty"` NetworkUuid string `json:"network_uuid,omitempty"` diff --git a/api/internal/logic/core/pushtaskinfologic.go b/api/internal/logic/core/pushtaskinfologic.go index d9efac1b..ff5a102f 100644 --- a/api/internal/logic/core/pushtaskinfologic.go +++ b/api/internal/logic/core/pushtaskinfologic.go @@ -41,61 +41,143 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie } l.svcCtx.DbEngin.Exec("update task_cloud set status = ?,start_time = ?,result = ? where task_id = ?", cloudInfo.Status, cloudInfo.StartTime, cloudInfo.Result, cloudInfo.TaskId) - syncTask(l.svcCtx.DbEngin, int64(taskId)) + var taskName string + l.svcCtx.DbEngin.Raw("select name as kind from task where id = ?", taskId).Scan(&taskName) + noticeInfo := clientCore.NoticeInfo{ + TaskId: cloudInfo.TaskId, + AdapterId: cloudInfo.AdapterId, + AdapterName: cloudInfo.AdapterName, + ClusterId: cloudInfo.ClusterId, + ClusterName: cloudInfo.ClusterName, + TaskName: taskName, + } + syncTask(l.svcCtx.DbEngin, noticeInfo) } case 2: for _, hpcInfo := range req.HpcInfoList { l.svcCtx.DbEngin.Exec("update task_hpc set status = ?,start_time = ?,job_id = ? where cluster_id = ? and task_id = ? 
and name = ?", hpcInfo.Status, hpcInfo.StartTime, hpcInfo.JobId, hpcInfo.ClusterId, hpcInfo.TaskId, hpcInfo.Name) - syncTask(l.svcCtx.DbEngin, hpcInfo.TaskId) + noticeInfo := clientCore.NoticeInfo{ + TaskId: hpcInfo.TaskId, + AdapterId: hpcInfo.AdapterId, + AdapterName: hpcInfo.AdapterName, + ClusterId: hpcInfo.ClusterId, + ClusterName: hpcInfo.ClusterName, + TaskName: hpcInfo.Name, + } + syncTask(l.svcCtx.DbEngin, noticeInfo) } case 1: for _, aiInfo := range req.AiInfoList { l.svcCtx.DbEngin.Exec("update task_ai set status = ?,start_time = ?,project_id = ?,job_id = ? where participant_id = ? and task_id = ? and name = ?", aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) - syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) + noticeInfo := clientCore.NoticeInfo{ + TaskId: aiInfo.TaskId, + AdapterId: aiInfo.AdapterId, + AdapterName: aiInfo.AdapterName, + ClusterId: aiInfo.ClusterId, + ClusterName: aiInfo.ClusterName, + TaskName: aiInfo.Name, + } + syncTask(l.svcCtx.DbEngin, noticeInfo) } case 3: for _, vmInfo := range req.VmInfoList { l.svcCtx.DbEngin.Exec("update task_vm set status = ?,start_time = ? where participant_id = ? and task_id = ? 
and name = ?", vmInfo.Status, vmInfo.StartTime, req.AdapterId, vmInfo.TaskId, vmInfo.Name) - syncTask(l.svcCtx.DbEngin, vmInfo.TaskId) + noticeInfo := clientCore.NoticeInfo{ + TaskId: vmInfo.TaskId, + AdapterId: vmInfo.AdapterId, + AdapterName: vmInfo.AdapterName, + ClusterId: vmInfo.ClusterId, + ClusterName: vmInfo.ClusterName, + TaskName: vmInfo.Name, + } + syncTask(l.svcCtx.DbEngin, noticeInfo) } } - return &resp, nil } -func syncTask(gorm *gorm.DB, taskId int64) { +func syncTask(gorm *gorm.DB, noticeInfo clientCore.NoticeInfo) { var allStatus string - tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join task_hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join task_ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus) + tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join task_hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join task_ai a on t.id = a.task_id where t.id = ?", noticeInfo.TaskId).Scan(&allStatus) if tx.Error != nil { logx.Error(tx.Error) } - for pcmStatus, hpcStatus := range clientCore.HpcStatusMapping { - for _, status := range hpcStatus { + for pcmStatus, ProviderStatus := range clientCore.StatusMapping { + for _, originalStatus := range ProviderStatus { // if Failed type status appears in subTask then update mainTask to Failed - if pcmStatus == "Failed" && strings.Contains(allStatus, status) { - updateTask(gorm, taskId, constants.Failed) + if pcmStatus == "Failed" && strings.Contains(allStatus, originalStatus) { + updateTask(gorm, noticeInfo.TaskId, constants.Failed) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: noticeInfo.AdapterId, + AdapterName: noticeInfo.AdapterName, + ClusterId: noticeInfo.ClusterId, + ClusterName: noticeInfo.ClusterName, + 
NoticeType: "failed", + TaskName: noticeInfo.TaskName, + Incident: "任务执行失败,请查看日志!", + CreatedTime: time.Now(), + } + gorm.Table("t_notice").Create(&noticeInfo) return // no Failed type status in subTask,if Saved type status appears in subTask then update mainTask to Saved - } else if pcmStatus == "Saved" { - if strings.Contains(allStatus, status) { - updateTask(gorm, taskId, constants.Saved) + } else if pcmStatus == "Saved" && strings.Contains(allStatus, originalStatus) { + if getTaskStatus(gorm, noticeInfo.TaskId) != "Saved" { + updateTask(gorm, noticeInfo.TaskId, constants.Saved) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: noticeInfo.AdapterId, + AdapterName: noticeInfo.AdapterName, + ClusterId: noticeInfo.ClusterId, + ClusterName: noticeInfo.ClusterName, + NoticeType: "saved", + TaskName: noticeInfo.TaskName, + Incident: "任务已处于队列中!", + CreatedTime: time.Now(), + } + gorm.Table("t_notice").Create(&noticeInfo) + return + } else { return } // no Failed and Saved type status in subTask,if Running type status appears in subTask then update mainTask to Running - } else if pcmStatus == "Running" { - if strings.Contains(allStatus, status) { - updateTask(gorm, taskId, constants.Running) + } else if pcmStatus == "Running" && strings.Contains(allStatus, originalStatus) { + if getTaskStatus(gorm, noticeInfo.TaskId) != "Running" { + updateTask(gorm, noticeInfo.TaskId, constants.Running) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: noticeInfo.AdapterId, + AdapterName: noticeInfo.AdapterName, + ClusterId: noticeInfo.ClusterId, + ClusterName: noticeInfo.ClusterName, + NoticeType: "running", + TaskName: noticeInfo.TaskName, + Incident: "任务状态切换为运行中!", + CreatedTime: time.Now(), + } + gorm.Table("t_notice").Create(&noticeInfo) + return + } else { return } + // at last, mainTask should be succeeded } else { - if strings.Contains(allStatus, status) { - updateTask(gorm, taskId, constants.Succeeded) + if strings.Contains(allStatus, originalStatus) { + updateTask(gorm, 
noticeInfo.TaskId, constants.Succeeded) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: noticeInfo.AdapterId, + AdapterName: noticeInfo.AdapterName, + ClusterId: noticeInfo.ClusterId, + ClusterName: noticeInfo.ClusterName, + NoticeType: "succeeded", + TaskName: noticeInfo.TaskName, + Incident: "任务执行完成!", + CreatedTime: time.Now(), + } + gorm.Table("t_notice").Create(&noticeInfo) return } } @@ -117,22 +199,10 @@ func updateTask(gorm *gorm.DB, taskId int64, status string) { } gorm.Updates(&task) } - } -func removeRepeatedElement(arr []string) (newArr []string) { - newArr = make([]string, 0) - for i := 0; i < len(arr); i++ { - repeat := false - for j := i + 1; j < len(arr); j++ { - if arr[i] == arr[j] { - repeat = true - break - } - } - if !repeat { - newArr = append(newArr, arr[i]) - } - } - return +func getTaskStatus(gorm *gorm.DB, taskId int64) (status string) { + var task models.Task + gorm.Where("id = ? ", taskId).Find(&task) + return task.Status } diff --git a/api/internal/logic/hpc/commithpctasklogic.go b/api/internal/logic/hpc/commithpctasklogic.go index b763a068..994f3f91 100644 --- a/api/internal/logic/hpc/commithpctasklogic.go +++ b/api/internal/logic/hpc/commithpctasklogic.go @@ -52,6 +52,13 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t var clusterIds []int64 l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? 
and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) + adapterId, _ := strconv.ParseInt(req.AdapterId, 10, 64) + var adapterName string + l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", req.AdapterId).Scan(&adapterName) + clusterId := clusterIds[rand.Intn(len(clusterIds))] + var clusterName string + l.svcCtx.DbEngin.Raw("SELECT nickname FROM `t_cluster` where id = ?", clusterId).Scan(&clusterName) + env, _ := json.Marshal(req.Environment) if len(clusterIds) == 0 || clusterIds == nil { @@ -62,7 +69,10 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t hpcInfo := models.TaskHpc{ TaskId: taskModel.Id, - ClusterId: clusterIds[rand.Intn(len(clusterIds))], + AdapterId: uint(adapterId), + AdapterName: adapterName, + ClusterId: uint(clusterId), + ClusterName: clusterName, Name: taskModel.Name, Status: "Saved", CmdScript: req.CmdScript, @@ -90,12 +100,11 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t if tx.Error != nil { return nil, tx.Error } - adapterId, _ := strconv.ParseUint(req.AdapterId, 10, 64) - adapterName := "" - tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName) noticeInfo := clientCore.NoticeInfo{ - AdapterId: int64(adapterId), + AdapterId: adapterId, AdapterName: adapterName, + ClusterId: clusterId, + ClusterName: clusterName, NoticeType: "create", TaskName: req.Name, Incident: "任务创建中", diff --git a/api/internal/scheduler/schedulers/aiScheduler.go b/api/internal/scheduler/schedulers/aiScheduler.go index 76b1a6b9..b61e03b7 100644 --- a/api/internal/scheduler/schedulers/aiScheduler.go +++ b/api/internal/scheduler/schedulers/aiScheduler.go @@ -55,10 +55,10 @@ func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Schedu func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { ai := models.Ai{ - ParticipantId: participantId, - TaskId: task.TaskId, - 
Status: "Saved", - YamlString: as.yamlString, + AdapterId: participantId, + TaskId: task.TaskId, + Status: "Saved", + YamlString: as.yamlString, } utils.Convert(task.Metadata, &ai) return ai, nil diff --git a/pkg/models/aimodel_gen.go b/pkg/models/aimodel_gen.go index 160c56fc..7736bc5f 100644 --- a/pkg/models/aimodel_gen.go +++ b/pkg/models/aimodel_gen.go @@ -36,19 +36,22 @@ type ( } Ai struct { - Id int64 `db:"id"` // id - TaskId int64 `db:"task_id"` // 任务id - ParticipantId int64 `db:"participant_id"` // 集群静态信息id - ProjectId string `db:"project_id"` // 项目id - Name string `db:"name"` // 名称 - Status string `db:"status"` // 状态 - StartTime string `db:"start_time"` // 开始时间 - RunningTime int64 `db:"running_time"` // 运行时间 - CreatedBy int64 `db:"created_by"` // 创建人 - CreatedTime sql.NullTime `db:"created_time"` // 创建时间 - UpdatedBy int64 `db:"updated_by"` // 更新人 - UpdatedTime sql.NullTime `db:"updated_time"` // 更新时间 - DeletedFlag int64 `db:"deleted_flag"` // 是否删除(0-否,1-是) + Id int64 `db:"id"` // id + TaskId int64 `db:"task_id"` // 任务id + AdapterId int64 `db:"adapter_id"` // 适配器id + AdapterName string `db:"adapter_name"` //适配器名称 + ClusterId int64 `db:"cluster_id"` //集群id + ClusterName string `db:"cluster_name"` //集群名称 + ProjectId string `db:"project_id"` // 项目id + Name string `db:"name"` // 名称 + Status string `db:"status"` // 状态 + StartTime string `db:"start_time"` // 开始时间 + RunningTime int64 `db:"running_time"` // 运行时间 + CreatedBy int64 `db:"created_by"` // 创建人 + CreatedTime sql.NullTime `db:"created_time"` // 创建时间 + UpdatedBy int64 `db:"updated_by"` // 更新人 + UpdatedTime sql.NullTime `db:"updated_time"` // 更新时间 + DeletedFlag int64 `db:"deleted_flag"` // 是否删除(0-否,1-是) Result string `db:"result"` YamlString string `db:"yaml_string"` JobId string `db:"job_id"` diff --git a/pkg/models/cloud/task_cloud.go b/pkg/models/cloud/task_cloud.go index d60c236e..5cb1e1c9 100644 --- a/pkg/models/cloud/task_cloud.go +++ b/pkg/models/cloud/task_cloud.go @@ -9,6 +9,7 @@ type TaskCloudModel 
struct { Id uint `json:"id" gorm:"primarykey;not null;comment:id"` TaskId uint `json:"taskId" gorm:"not null;comment:task表id"` AdapterId uint `json:"adapterId" gorm:"not null;comment:适配器id"` + AdapterName string `json:"adapterName" gorm:"not null;comment:适配器名称"` ClusterId uint `json:"clusterId" gorm:"not null;comment:集群id"` ClusterName string `json:"clusterName" gorm:"not null;comment:集群名称"` Kind string `json:"kind" gorm:"comment:种类"` diff --git a/pkg/models/taskhpcmodel_gen.go b/pkg/models/taskhpcmodel_gen.go index 9e2fc1cb..05b41aca 100644 --- a/pkg/models/taskhpcmodel_gen.go +++ b/pkg/models/taskhpcmodel_gen.go @@ -36,12 +36,15 @@ type ( } TaskHpc struct { - Id int64 `db:"id"` // id - TaskId int64 `db:"task_id"` // 任务id - JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id) - ClusterId int64 `db:"cluster_id"` // 执行任务的集群id - Name string `db:"name"` // 名称 - Status string `db:"status"` // 状态 + Id int64 `db:"id"` // id + TaskId int64 `db:"task_id"` // 任务id + JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id) + AdapterId uint `db:"adapter_d"` // 适配器id + AdapterName string `db:"adapter_name"` //适配器名称 + ClusterId uint `db:"cluster_id"` //集群id + ClusterName string `db:"cluster_name"` //集群名称 + Name string `db:"name"` // 名称 + Status string `db:"status"` // 状态 CmdScript string `db:"cmd_script"` StartTime string `db:"start_time"` // 开始时间 RunningTime int64 `db:"running_time"` // 运行时间 diff --git a/pkg/models/taskvmmodel_gen.go b/pkg/models/taskvmmodel_gen.go index 6a89501a..6b147b68 100644 --- a/pkg/models/taskvmmodel_gen.go +++ b/pkg/models/taskvmmodel_gen.go @@ -39,8 +39,10 @@ type ( ParticipantId int64 `db:"participant_id"` // p端id TaskId int64 `db:"task_id"` // 任务id Name string `db:"name"` // 虚拟机名称 - AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `db:"cluster_id"` // 执行任务的集群id + AdapterId int64 `db:"adapter_id"` // 适配器id + AdapterName string `db:"adapter_name"` //适配器名称 + ClusterId int64 `db:"cluster_id"` //集群id + ClusterName string `db:"cluster_name"` //集群名称 
FlavorRef string `db:"flavor_ref"` // 规格索引 ImageRef string `db:"image_ref"` // 镜像索引 Status string `db:"status"` // 状态 From 7c41605443dc6734e1b011a806b7d0230e7aaecd Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Fri, 10 May 2024 22:16:23 +0800 Subject: [PATCH 30/40] merge conflict Former-commit-id: 6bb70e13e9ec954eb128dd3d8c8201d9cbc54529 --- pkg/models/cloud/task_cloud.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/models/cloud/task_cloud.go b/pkg/models/cloud/task_cloud.go index 784196c0..5cb1e1c9 100644 --- a/pkg/models/cloud/task_cloud.go +++ b/pkg/models/cloud/task_cloud.go @@ -8,7 +8,6 @@ import ( type TaskCloudModel struct { Id uint `json:"id" gorm:"primarykey;not null;comment:id"` TaskId uint `json:"taskId" gorm:"not null;comment:task表id"` - AdapterName string `json:"adapterName" gorm:"not null;comment:适配器名称"` AdapterId uint `json:"adapterId" gorm:"not null;comment:适配器id"` AdapterName string `json:"adapterName" gorm:"not null;comment:适配器名称"` ClusterId uint `json:"clusterId" gorm:"not null;comment:集群id"` From 54dd6a7eb2788fb4a95cccaa0d62252e0d52ff29 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 08:58:57 +0800 Subject: [PATCH 31/40] =?UTF-8?q?fix=EF=BC=9Acreate=20vm=20server?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: d45c7c3228f7cedeaac94d7175d220855cf39c46 --- api/desc/core/pcm-core.api | 44 ++-- api/internal/handler/routes.go | 7 +- api/internal/logic/core/commitvmtasklogic.go | 191 ++++++++++++++---- .../scheduler/schedulers/vmScheduler.go | 2 +- api/internal/types/types.go | 19 +- pkg/models/taskvmmodel_gen.go | 27 ++- 6 files changed, 206 insertions(+), 84 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index f433416e..e0f354fa 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -159,7 +159,7 @@ type ( type ( GeneralTaskReq { Name string `json:"name"` - AdapterIds []string 
`json:"adapterIds"` + AdapterIds []string `json:"adapterIds"` ClusterIds []string `json:"clusterIds"` Strategy string `json:"strategy"` StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"` @@ -203,15 +203,30 @@ type ( type ( commitVmTaskReq { - // Name string `json:"name"` - // NsID string `json:"nsID"` + Name string `json:"name"` + AdapterIds []string `json:"adapterIds,optional"` + ClusterIds []string `json:"clusterIds"` + Strategy string `json:"strategy"` + StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"` + MinCount int64 `json:"min_count,optional"` + ImageRef int64 `json:"imageRef,optional"` + FlavorRef int64 `json:"flavorRef,optional"` + Uuid int64 `json:"uuid,optional"` + //Replicas int64 `json:"replicas,string"` + VmName string `json:"vm_name,optional"` // Replicas int64 `json:"replicas,optional"` // MatchLabels map[string]string `json:"matchLabels,optional"` - // AdapterId string `json:"adapterId,optional"` + // ClusterType string `json:"clusterType,optional"` // //Virtual Machine Section - CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"` - VmOption *VmOption `json:"vmOption,optional"` + //CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"` + //VmOption *VmOption `json:"vmOption,optional"` + } + TaskVm { + ImageRef string `json:"imageRef"` + FlavorRef string `json:"flavorRef"` + Uuid string `json:"uuid"` + Platform string `json:"platform"` } VmOption { AdapterId string `json:"adapterId"` @@ -225,23 +240,6 @@ type ( MatchLabels map[string]string `json:"matchLabels,optional"` StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"` CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"` - // Id int64 `json:"id"` - // ParticipantId int64 `json:"participantId"` - // TaskId int64 `json:"taskId"` - // AdapterId int64 `json:"adapterId"` - // ClusterId int64 `json:"clusterId"` - // FlavorRef string `json:"flavorRef"` - // ImageRef string `json:"imageRef"` - // 
Status string `json:"status"` - // Platform string `json:"platform"` - // Description string `json:"description"` - // AvailabilityZone string `json:"availabilityZone"` - // MinCount int64 `json:"minCount"` - // Uuid string `json:"uuid"` - // StartTime string `json:"startTime"` - // RunningTime string `json:"runningTime"` - // Result string `json:"result"` - // DeletedAt string `json:"deletedAt"` } CreateMulDomainServer { diff --git a/api/internal/handler/routes.go b/api/internal/handler/routes.go index bedd9c57..e55a3f79 100644 --- a/api/internal/handler/routes.go +++ b/api/internal/handler/routes.go @@ -1190,6 +1190,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { Path: "/schedule/ai/getAlgorithms/:adapterId/:resourceType/:taskType/:dataset", Handler: schedule.ScheduleGetAlgorithmsHandler(serverCtx), }, + { + Method: http.MethodGet, + Path: "/schedule/ai/getJobLog/:adapterId/:clusterId/:taskId/:instanceNum", + Handler: schedule.ScheduleGetAiJobLogLogHandler(serverCtx), + }, { Method: http.MethodPost, Path: "/schedule/submit", @@ -1294,7 +1299,7 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) { }, { Method: http.MethodPost, - Path: "/core/syncClusterAlert", + Path: "/monitoring/syncClusterAlert", Handler: monitoring.SyncClusterAlertHandler(serverCtx), }, { diff --git a/api/internal/logic/core/commitvmtasklogic.go b/api/internal/logic/core/commitvmtasklogic.go index ede94b86..6e818fcd 100644 --- a/api/internal/logic/core/commitvmtasklogic.go +++ b/api/internal/logic/core/commitvmtasklogic.go @@ -3,12 +3,14 @@ package core import ( "context" "fmt" + clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" 
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" + "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "strconv" "time" @@ -31,9 +33,27 @@ func NewCommitVmTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Comm func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *types.CommitVmTaskResp, err error) { // todo: add your logic here and delete this line + /*var ImageRef string + var FlavorRef string + var NetworkRef string*/ resp = &types.CommitVmTaskResp{} + tx := l.svcCtx.DbEngin.Begin() //Building the main task structure - opt := &option.VmOption{ + defer func() { + if p := recover(); p != nil { + tx.Rollback() + logx.Error(p) + } else if tx.Error != nil { + logx.Info("rollback, error", tx.Error) + tx.Rollback() + } else { + tx = tx.Commit() + logx.Info("commit success") + } + }() + //TODO adapter + adapterId, _ := strconv.ParseUint(req.AdapterIds[0], 10, 64) + /* opt := &option.VmOption{ AdapterId: req.VmOption.AdapterId, Replicas: req.VmOption.Replicas, Strategy: req.VmOption.Strategy, @@ -43,32 +63,21 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type StaticWeightMap: req.VmOption.StaticWeightMap, Name: req.VmOption.Name, CommitTime: time.Now(), - } - taskModel := models.Task{ - Status: constants.Saved, - Name: req.VmOption.Name, - CommitTime: time.Now(), - Description: "vm task", - } - // Save task data to database - tx := l.svcCtx.DbEngin.Create(&taskModel) - if tx.Error != nil { - return nil, tx.Error - } + }*/ - //var clusters []*models.VmModel - //err2 := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where adapter_id in ? and id in ?", req.VmOption.AdapterId, req.VmOption.VmClusterIds).Scan(&clusters).Error - //if err2 != nil { - // logx.Errorf("CommitGeneralTask() => sql execution error: %v", err) - // //return errors.Errorf("the cluster does not match the drive resources. 
Check the data"), nil - //} + var clusters []*models.VmModel + err2 := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where adapter_id in ? and id in ?", req.AdapterIds, req.ClusterIds).Scan(&clusters).Error + if err2 != nil { + logx.Errorf("CommitGeneralTask() => sql execution error: %v", err) + //return errors.Errorf("the cluster does not match the drive resources. Check the data"), nil + } taskVm := models.TaskVm{} //TODO 执行策略返回集群跟 Replica - /*opt := &option.VmOption{} - utils.Convert(&req, &opt)*/ + opt := &option.VmOption{} + utils.Convert(&req, &opt) // 2、Initialize scheduler - vmSchdl, err := schedulers.NewVmScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, l.svcCtx.DbEngin, l.svcCtx.PromClient) + vmSchdl, _ := schedulers.NewVmScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, l.svcCtx.DbEngin, l.svcCtx.PromClient) if err != nil { return nil, err } @@ -76,43 +85,139 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type // 3、Return scheduling results results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl) if err != nil { + logx.Errorf("AssignAndSchedule() => execution error: %v", err) return nil, err } rs := (results).([]*schedulers.VmResult) + var synergyStatus int64 + if len(rs) > 1 { + synergyStatus = 1 + } + + var strategy int64 + sqlStr := `select t_dict_item.item_value + from t_dict + left join t_dict_item on t_dict.id = t_dict_item.dict_id + where item_text = ? 
+ and t_dict.dict_code = 'schedule_Strategy'` + //查询调度策略 + err = tx.Raw(sqlStr, req.Strategy).Scan(&strategy).Error + taskModel := models.Task{ + Id: utils.GenSnowflakeID(), + Status: constants.Saved, + Name: req.Name, + CommitTime: time.Now(), + Description: "vm task", + AdapterTypeDict: 0, + SynergyStatus: synergyStatus, + Strategy: strategy, + } + var taskVms models.TaskVm + var VmObject types.TaskVm for _, r := range rs { - for _, CreateMulServer := range req.CreateMulServer { - if r.Replica > 0 && r.ClusterId == CreateMulServer.ClusterId { - fmt.Println("", req.CreateMulServer) - var clusterIds []int64 - l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? ", req.VmOption.AdapterId).Scan(&clusterIds) - if len(clusterIds) == 0 || clusterIds == nil { - return nil, nil + for _, clusterId := range req.ClusterIds { + if r.Replica > 0 && r.ClusterId == clusterId { + fmt.Println("", clusterId) + sql := `SELECT vi.image_id as imageRef,vf.flavor_id as flavorRef,vn.network_id as uuid,vi.cluster_name as platform FROM + vm_flavor vf + LEFT JOIN vm_image vi ON vf.cluster_id = vi.cluster_id + LEFT JOIN vm_network vn ON vf.cluster_id = vn.cluster_id + WHERE + vi.cluster_id = ? + AND vf.public_flavor_id = ? + AND vi.public_image_id = ? 
+ AND vn.public_network_id = ?` + // err2 := l.svcCtx.DbEngin.Raw(sql, clusterId, req.FlavorRef, req.ImageRef, req.Uuid).Scan(&taskVm).Error + txVm := l.svcCtx.DbEngin.Raw(sql, clusterId, req.FlavorRef, req.ImageRef, req.Uuid).Scan(&VmObject) + if txVm.Error != nil { + logx.Error(err) + return nil, txVm.Error } - adapterId, _ := strconv.ParseUint(req.VmOption.AdapterId, 10, 64) - taskVm.AdapterId = int64(adapterId) - clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64) - taskVm.ClusterId = int64(clusterId) - taskVm.Name = req.VmOption.Name - taskVm.TaskId = taskModel.Id - clusterId, _ = strconv.ParseUint(r.ClusterId, 10, 64) - taskVm.ClusterId = int64(clusterId) + if err2 != nil { + logx.Errorf("CommitGeneralTask() => sql execution error: %v", err) + //return errors.Errorf("the cluster does not match the drive resources. Check the data"), nil + } + taskVms.Name = req.Name taskVm.Status = "Saved" taskVm.StartTime = time.Now().String() - taskVm.MinCount = CreateMulServer.Min_count - taskVm.ImageRef = CreateMulServer.ImageRef - taskVm.FlavorRef = CreateMulServer.FlavorRef - taskVm.Uuid = CreateMulServer.Uuid - taskVm.Platform = CreateMulServer.Platform + taskVm.MinCount = req.MinCount + /* sqlImage := "SELECT image_id FROM `vm_image_dict` vm left join vm_image vi on vm.id=vi.public_image_id where cluster_id =? AND public_image_id = ?" + txImage := l.svcCtx.DbEngin.Raw(sqlImage, clusterId, req.ImageRef).Scan(&ImageRef) + if txImage.Error != nil { + logx.Error(err) + return nil, txImage.Error + }*/ + taskVm.ImageRef = VmObject.ImageRef + /* sqlFlavor := "SELECT * FROM `vm_flavor_dict` vm left join vm_flavor vf on vm.id=vf.public_flavor_id where cluster_id =? AND public_flavor_id = ?" 
+ txFlavor := l.svcCtx.DbEngin.Raw(sqlFlavor, clusterId, req.FlavorRef).Scan(&FlavorRef) + if txFlavor.Error != nil { + logx.Error(err) + return nil, txFlavor.Error + }*/ + taskVm.FlavorRef = VmObject.FlavorRef + /* sqlNetwork := "SELECT * FROM `vm_network_dict` vm left join vm_network vi on vm.id=vi.public_network_id where cluster_id =? AND public_network_id = ?" + txNetwork := l.svcCtx.DbEngin.Raw(sqlNetwork, clusterId, req.Uuid).Scan(&NetworkRef) + if txNetwork.Error != nil { + logx.Error(err) + return nil, txNetwork.Error + }*/ + taskVm.Uuid = VmObject.Uuid + taskVm.Platform = VmObject.Platform tx = l.svcCtx.DbEngin.Create(&taskVm) if tx.Error != nil { return nil, tx.Error } + //var clusterIds []int64 + //l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? ", req.VmOption.AdapterId).Scan(&clusterIds) + //if len(clusterIds) == 0 || clusterIds == nil { + // return nil, nil + //} + //adapterId, _ := strconv.ParseUint(req.VmOption.AdapterId, 10, 64) + //taskVm.AdapterId = int64(adapterId) + //clusterId, _ = strconv.ParseUint(r.ClusterId, 10, 64) + //taskVm.ClusterId = int64(clusterId) + //taskVm.Status = "Saved" + //taskVm.StartTime = time.Now().String() + //taskVm.ImageRef = CreateMulServer.ImageRef + //taskVm.FlavorRef = CreateMulServer.FlavorRef + //taskVm.Uuid = CreateMulServer.Uuid + //taskVm.Platform = CreateMulServer.Platform + //tx = l.svcCtx.DbEngin.Create(&taskVm) + //if tx.Error != nil { + // return nil, tx.Error + //} } } } + adapterName := "" + tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName) + noticeInfo := clientCore.NoticeInfo{ + AdapterId: int64(adapterId), + AdapterName: adapterName, + NoticeType: "create", + TaskName: req.Name, + Incident: "任务创建中", + CreatedTime: time.Now(), + } + + db := tx.Table("task").Create(&taskModel) + db = tx.Table("task_cloud").Create(&taskVm) + db = tx.Table("t_notice").Create(¬iceInfo) + if db.Error != nil { + logx.Errorf("Task creation failure, err: %v", db.Error) + } 
+ //db = tx.Table("t_notice").Create(¬iceInfo) + // Save task data to database + //tf := l.svcCtx.DbEngin.Create(&taskModel) + //if tf.Error != nil { + // return nil, tf.Error + //} + //tn := l.svcCtx.DbEngin.Create(¬iceInfo) + //if tn.Error != nil { + // return nil, tn.Error + //} resp.Code = 200 resp.Msg = "Success" - return resp, nil } diff --git a/api/internal/scheduler/schedulers/vmScheduler.go b/api/internal/scheduler/schedulers/vmScheduler.go index 7aa20f63..89d52a4d 100644 --- a/api/internal/scheduler/schedulers/vmScheduler.go +++ b/api/internal/scheduler/schedulers/vmScheduler.go @@ -83,7 +83,7 @@ func (vm *VmScheduler) PickOptimalStrategy() (strategy.Strategy, error) { return strategy, nil*/ case strategy.STATIC_WEIGHT: //todo resources should match cluster StaticWeightMap - strategy := strategy.NewStaticWeightStrategy(vm.option.ClusterToStaticWeight, 1) + strategy := strategy.NewStaticWeightStrategy(vm.option.StaticWeightMap, 1) return strategy, nil } diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 0a905601..ad83193a 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -182,8 +182,23 @@ type TaskYaml struct { } type CommitVmTaskReq struct { - CreateMulServer []CreateMulDomainServer `json:"createMulServer,optional"` - VmOption *VmOption `json:"vmOption,optional"` + Name string `json:"name"` + AdapterIds []string `json:"adapterIds,optional"` + ClusterIds []string `json:"clusterIds"` + Strategy string `json:"strategy"` + StaticWeightMap map[string]int32 `json:"staticWeightMap,optional"` + MinCount int64 `json:"min_count,optional"` + ImageRef int64 `json:"imageRef,optional"` + FlavorRef int64 `json:"flavorRef,optional"` + Uuid int64 `json:"uuid,optional"` + VmName string `json:"vm_name,optional"` +} + +type TaskVm struct { + ImageRef string `json:"imageRef"` + FlavorRef string `json:"flavorRef"` + Uuid string `json:"uuid"` + Platform string `json:"platform"` } type VmOption struct { diff --git 
a/pkg/models/taskvmmodel_gen.go b/pkg/models/taskvmmodel_gen.go index 6a89501a..3ba62297 100644 --- a/pkg/models/taskvmmodel_gen.go +++ b/pkg/models/taskvmmodel_gen.go @@ -35,17 +35,16 @@ type ( } TaskVm struct { - Id int64 `db:"id"` // id - ParticipantId int64 `db:"participant_id"` // p端id - TaskId int64 `db:"task_id"` // 任务id - Name string `db:"name"` // 虚拟机名称 - AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `db:"cluster_id"` // 执行任务的集群id - FlavorRef string `db:"flavor_ref"` // 规格索引 - ImageRef string `db:"image_ref"` // 镜像索引 - Status string `db:"status"` // 状态 - Platform string `db:"platform"` // 平台 - Description string `db:"description"` // 描述 + Id int64 `db:"id"` // id + TaskId int64 `db:"task_id"` // 任务id + Name string `db:"name"` // 虚拟机名称 + AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id + ClusterId int64 `db:"cluster_id"` // 执行任务的集群id + FlavorRef string `db:"flavor_ref"` // 规格索引 + ImageRef string `db:"image_ref"` // 镜像索引 + Status string `db:"status"` // 状态 + Platform string `db:"platform"` // 平台 + Description string `db:"description"` // 描述 AvailabilityZone string `db:"availability_zone"` MinCount int64 `db:"min_count"` // 数量 Uuid string `db:"uuid"` // 网络id @@ -91,14 +90,14 @@ func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, er } func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) { - query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) - ret, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) + query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) + ret, err := 
m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) return ret, err } func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error { query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder) - _, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) + _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) return err } From 3b59c81b2b83bc656f785efc996e753662278765 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 09:14:48 +0800 Subject: [PATCH 32/40] =?UTF-8?q?fix=EF=BC=9Acreate=20vm=20server?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: 647fd9512c10f21ce4c3797abf87b5b305b59d48 --- pkg/models/taskvmmodel_gen.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pkg/models/taskvmmodel_gen.go b/pkg/models/taskvmmodel_gen.go index 3ba62297..fa749168 100644 --- a/pkg/models/taskvmmodel_gen.go +++ b/pkg/models/taskvmmodel_gen.go @@ -35,16 +35,18 @@ type ( } TaskVm struct { - Id int64 `db:"id"` // id - TaskId int64 `db:"task_id"` // 任务id - Name string `db:"name"` // 虚拟机名称 - AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id - ClusterId int64 `db:"cluster_id"` // 执行任务的集群id - FlavorRef string 
`db:"flavor_ref"` // 规格索引 - ImageRef string `db:"image_ref"` // 镜像索引 - Status string `db:"status"` // 状态 - Platform string `db:"platform"` // 平台 - Description string `db:"description"` // 描述 + Id int64 `db:"id"` // id + TaskId int64 `db:"task_id"` // 任务id + Name string `db:"name"` // 虚拟机名称 + AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id + AdapterName string `db:"adapter_name"` // 适配器名称 + ClusterId int64 `db:"cluster_id"` // 执行任务的集群id + ClusterName string `db:"cluster_name"` // 集群名称 + FlavorRef string `db:"flavor_ref"` // 规格索引 + ImageRef string `db:"image_ref"` // 镜像索引 + Status string `db:"status"` // 状态 + Platform string `db:"platform"` // 平台 + Description string `db:"description"` // 描述 AvailabilityZone string `db:"availability_zone"` MinCount int64 `db:"min_count"` // 数量 Uuid string `db:"uuid"` // 网络id @@ -90,14 +92,14 @@ func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, er } func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) { - query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) - ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) + query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) + ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) return ret, err } func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error { 
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder) - _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) + _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) return err } From 4047bc98ddd24da518fe4e477de5c7ed922cd28d Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 10:17:06 +0800 Subject: [PATCH 33/40] =?UTF-8?q?fix=EF=BC=9Aupdate=20vm=20network?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: 3195ae6397c24bda209c7c7a3fd61ee582076f1f --- api/desc/core/pcm-core.api | 2 +- api/internal/types/types.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api/desc/core/pcm-core.api b/api/desc/core/pcm-core.api index aaf6200f..f3bab7f6 100644 --- a/api/desc/core/pcm-core.api +++ b/api/desc/core/pcm-core.api @@ -90,7 +90,7 @@ type ( } NetworkDict { Id int `json:"id"` - PublicImageName string `json:"public_image_name"` + PublicNetworkName string `json:"public_netWork_name"` } ) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index a0be926b..61596427 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -79,8 +79,8 @@ type PublicNetworkResp struct { } type NetworkDict struct { - Id int `json:"id"` - PublicImageName string `json:"public_image_name"` + Id int `json:"id"` + PublicNetworkName string `json:"public_netWork_name"` } type RemoteResp struct { From 
6e82702876266c6d91f96d210f158c1e44190d50 Mon Sep 17 00:00:00 2001 From: tzwang Date: Sat, 11 May 2024 11:22:29 +0800 Subject: [PATCH 34/40] added api getcomputecards Former-commit-id: 4514161574188e63305fe9066ef4ddad473a56b2 --- .../schedule/downloadalgothmcodelogic.go | 3 +- .../schedule/getcomputecardsbyclusterlogic.go | 9 ++- .../scheduler/service/collector/collector.go | 1 + api/internal/storeLink/modelarts.go | 4 ++ api/internal/storeLink/octopus.go | 64 +++++++++++++++---- api/internal/storeLink/shuguangai.go | 6 ++ 6 files changed, 70 insertions(+), 17 deletions(-) diff --git a/api/internal/logic/schedule/downloadalgothmcodelogic.go b/api/internal/logic/schedule/downloadalgothmcodelogic.go index 81b96579..03800fe7 100644 --- a/api/internal/logic/schedule/downloadalgothmcodelogic.go +++ b/api/internal/logic/schedule/downloadalgothmcodelogic.go @@ -2,6 +2,7 @@ package schedule import ( "context" + "strings" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types" @@ -26,7 +27,7 @@ func NewDownloadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext func (l *DownloadAlgothmCodeLogic) DownloadAlgorithmCode(req *types.DownloadAlgorithmCodeReq) (resp *types.DownloadAlgorithmCodeResp, err error) { resp = &types.DownloadAlgorithmCodeResp{} code, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].DownloadAlgorithmCode(l.ctx, - req.ResourceType, req.Card, req.TaskType, req.Dataset, req.Algorithm) + req.ResourceType, strings.ToLower(req.Card), req.TaskType, req.Dataset, req.Algorithm) if err != nil { return nil, err } diff --git a/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go b/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go index 772a5ce6..4cb94f91 100644 --- a/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go +++ b/api/internal/logic/schedule/getcomputecardsbyclusterlogic.go @@ -24,7 +24,12 @@ func 
NewGetComputeCardsByClusterLogic(ctx context.Context, svcCtx *svc.ServiceCo } func (l *GetComputeCardsByClusterLogic) GetComputeCardsByCluster(req *types.GetComputeCardsByClusterReq) (resp *types.GetComputeCardsByClusterResp, err error) { - // todo: add your logic here and delete this line + resp = &types.GetComputeCardsByClusterResp{} + cards, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].GetComputeCards(l.ctx) + if err != nil { + return nil, err + } + resp.Cards = cards - return + return resp, nil } diff --git a/api/internal/scheduler/service/collector/collector.go b/api/internal/scheduler/service/collector/collector.go index 453d710c..5e6a7940 100644 --- a/api/internal/scheduler/service/collector/collector.go +++ b/api/internal/scheduler/service/collector/collector.go @@ -10,6 +10,7 @@ type AiCollector interface { GetTrainingTask(ctx context.Context, taskId string) (*Task, error) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error + GetComputeCards(ctx context.Context) ([]string, error) } type ResourceStats struct { diff --git a/api/internal/storeLink/modelarts.go b/api/internal/storeLink/modelarts.go index 7bb6db2d..5843eeff 100644 --- a/api/internal/storeLink/modelarts.go +++ b/api/internal/storeLink/modelarts.go @@ -162,6 +162,10 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit return nil, nil } +func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) { + return nil, nil +} + func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { return "", nil } diff --git a/api/internal/storeLink/octopus.go 
b/api/internal/storeLink/octopus.go index a088e56a..3b1d2521 100644 --- a/api/internal/storeLink/octopus.go +++ b/api/internal/storeLink/octopus.go @@ -46,12 +46,14 @@ const ( SUIYUAN = "suiyuan" SAILINGSI = "sailingsi" MLU = "MLU" + BIV100 = "BI-V100" CAMBRICONMLU290 = 256 GCU = "GCU" ENFLAME = "enflame" EnflameT20 = 128 BASE_TOPS = 128 CAMBRICON = "cambricon" + TIANSHU = "天数" TRAIN_CMD = "cd /code; python train.py" VERSION = "V1" DOMAIN = "http://192.168.242.41:8001/" @@ -59,8 +61,9 @@ const ( var ( cardAliasMap = map[string]string{ - MLU: CAMBRICON, - GCU: ENFLAME, + MLU: CAMBRICON, + GCU: ENFLAME, + BIV100: TIANSHU, } cardTopsMap = map[string]float64{ MLU: CAMBRICONMLU290, @@ -340,11 +343,54 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm return algorithms, nil } +func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) { + var cards []string + for s, _ := range cardAliasMap { + cards = append(cards, s) + } + return cards, nil +} + func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { + var name string + if resourceType == CARD { + name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card + } else { + name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU + } + + req := &octopus.GetMyAlgorithmListReq{ + Platform: o.platform, + PageIndex: o.pageIndex, + PageSize: o.pageSize, + } + resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req) + if err != nil { + return "", err + } + if !resp.Success { + return "", errors.New("failed to get algorithmList") + } + + var algorithmId string + for _, a := range resp.Payload.Algorithms { + if strings.ToLower(a.FrameworkName) != taskType { + continue + } + if a.AlgorithmName == name { + algorithmId = a.AlgorithmId + break + } + } + + if algorithmId == "" { + return "", errors.New("algorithmId not found") + } + dcReq := &octopus.DownloadCompressReq{ 
Platform: o.platform, Version: VERSION, - AlgorithmId: "", + AlgorithmId: algorithmId, } dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq) if err != nil { @@ -358,7 +404,7 @@ func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType st daReq := &octopus.DownloadAlgorithmReq{ Platform: o.platform, Version: VERSION, - AlgorithmId: "", + AlgorithmId: algorithmId, CompressAt: dcResp.Payload.CompressAt, Domain: DOMAIN, } @@ -591,16 +637,6 @@ func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOpti } func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error { - // temporarily set algorithm to cnn - if option.AlgorithmName == "" { - switch option.DatasetsName { - case "cifar10": - option.AlgorithmName = "cnn" - case "mnist": - option.AlgorithmName = "fcn" - } - } - req := &octopus.GetMyAlgorithmListReq{ Platform: o.platform, PageIndex: o.pageIndex, diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index 4f783357..16811591 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -447,6 +447,12 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, return algorithms, nil } +func (s *ShuguangAi) GetComputeCards(ctx context.Context) ([]string, error) { + var cards []string + cards = append(cards, DCU) + return cards, nil +} + func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) { algoName := dataset + DASH + algorithm req := &hpcAC.GetFileReq{ From ce25b1da6bbc684c1abf67fdffc286f5b3eacd51 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 14:42:22 +0800 Subject: [PATCH 35/40] =?UTF-8?q?fix=EF=BC=9Aupdate=20vm=20network?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: 
4220ea329aea451d15df4b786bbe3403166e2ba0 --- api/internal/logic/core/commitvmtasklogic.go | 55 +++----------------- pkg/models/taskvmmodel_gen.go | 31 ++++++----- 2 files changed, 23 insertions(+), 63 deletions(-) diff --git a/api/internal/logic/core/commitvmtasklogic.go b/api/internal/logic/core/commitvmtasklogic.go index 6e818fcd..283e9bc9 100644 --- a/api/internal/logic/core/commitvmtasklogic.go +++ b/api/internal/logic/core/commitvmtasklogic.go @@ -33,9 +33,6 @@ func NewCommitVmTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Comm func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *types.CommitVmTaskResp, err error) { // todo: add your logic here and delete this line - /*var ImageRef string - var FlavorRef string - var NetworkRef string*/ resp = &types.CommitVmTaskResp{} tx := l.svcCtx.DbEngin.Begin() //Building the main task structure @@ -53,25 +50,11 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type }() //TODO adapter adapterId, _ := strconv.ParseUint(req.AdapterIds[0], 10, 64) - /* opt := &option.VmOption{ - AdapterId: req.VmOption.AdapterId, - Replicas: req.VmOption.Replicas, - Strategy: req.VmOption.Strategy, - ClusterToStaticWeight: req.VmOption.StaticWeightMap, - Status: constants.Saved, - MatchLabels: req.VmOption.MatchLabels, - StaticWeightMap: req.VmOption.StaticWeightMap, - Name: req.VmOption.Name, - CommitTime: time.Now(), - }*/ - var clusters []*models.VmModel err2 := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where adapter_id in ? and id in ?", req.AdapterIds, req.ClusterIds).Scan(&clusters).Error if err2 != nil { logx.Errorf("CommitGeneralTask() => sql execution error: %v", err) - //return errors.Errorf("the cluster does not match the drive resources. 
Check the data"), nil } - taskVm := models.TaskVm{} //TODO 执行策略返回集群跟 Replica opt := &option.VmOption{} @@ -81,20 +64,17 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type if err != nil { return nil, err } - // 3、Return scheduling results results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl) if err != nil { logx.Errorf("AssignAndSchedule() => execution error: %v", err) return nil, err } - rs := (results).([]*schedulers.VmResult) var synergyStatus int64 if len(rs) > 1 { synergyStatus = 1 } - var strategy int64 sqlStr := `select t_dict_item.item_value from t_dict @@ -113,7 +93,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type SynergyStatus: synergyStatus, Strategy: strategy, } - var taskVms models.TaskVm + //var taskVms models.TaskVm var VmObject types.TaskVm for _, r := range rs { for _, clusterId := range req.ClusterIds { @@ -138,7 +118,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type logx.Errorf("CommitGeneralTask() => sql execution error: %v", err) //return errors.Errorf("the cluster does not match the drive resources. Check the data"), nil } - taskVms.Name = req.Name + taskVm.Name = req.VmName taskVm.Status = "Saved" taskVm.StartTime = time.Now().String() taskVm.MinCount = req.MinCount @@ -168,25 +148,6 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type if tx.Error != nil { return nil, tx.Error } - //var clusterIds []int64 - //l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? 
", req.VmOption.AdapterId).Scan(&clusterIds) - //if len(clusterIds) == 0 || clusterIds == nil { - // return nil, nil - //} - //adapterId, _ := strconv.ParseUint(req.VmOption.AdapterId, 10, 64) - //taskVm.AdapterId = int64(adapterId) - //clusterId, _ = strconv.ParseUint(r.ClusterId, 10, 64) - //taskVm.ClusterId = int64(clusterId) - //taskVm.Status = "Saved" - //taskVm.StartTime = time.Now().String() - //taskVm.ImageRef = CreateMulServer.ImageRef - //taskVm.FlavorRef = CreateMulServer.FlavorRef - //taskVm.Uuid = CreateMulServer.Uuid - //taskVm.Platform = CreateMulServer.Platform - //tx = l.svcCtx.DbEngin.Create(&taskVm) - //if tx.Error != nil { - // return nil, tx.Error - //} } } } @@ -201,18 +162,18 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type CreatedTime: time.Now(), } - db := tx.Table("task").Create(&taskModel) - db = tx.Table("task_cloud").Create(&taskVm) + //db := tx.Table("task").Create(&taskModel) + db := tx.Table("task_vm").Create(&taskVm) db = tx.Table("t_notice").Create(¬iceInfo) if db.Error != nil { logx.Errorf("Task creation failure, err: %v", db.Error) } //db = tx.Table("t_notice").Create(¬iceInfo) // Save task data to database - //tf := l.svcCtx.DbEngin.Create(&taskModel) - //if tf.Error != nil { - // return nil, tf.Error - //} + tf := l.svcCtx.DbEngin.Create(&taskModel) + if tf.Error != nil { + return nil, tf.Error + } //tn := l.svcCtx.DbEngin.Create(¬iceInfo) //if tn.Error != nil { // return nil, tn.Error diff --git a/pkg/models/taskvmmodel_gen.go b/pkg/models/taskvmmodel_gen.go index 6b147b68..fa749168 100644 --- a/pkg/models/taskvmmodel_gen.go +++ b/pkg/models/taskvmmodel_gen.go @@ -35,19 +35,18 @@ type ( } TaskVm struct { - Id int64 `db:"id"` // id - ParticipantId int64 `db:"participant_id"` // p端id - TaskId int64 `db:"task_id"` // 任务id - Name string `db:"name"` // 虚拟机名称 - AdapterId int64 `db:"adapter_id"` // 适配器id - AdapterName string `db:"adapter_name"` //适配器名称 - ClusterId int64 `db:"cluster_id"` //集群id - 
ClusterName string `db:"cluster_name"` //集群名称 - FlavorRef string `db:"flavor_ref"` // 规格索引 - ImageRef string `db:"image_ref"` // 镜像索引 - Status string `db:"status"` // 状态 - Platform string `db:"platform"` // 平台 - Description string `db:"description"` // 描述 + Id int64 `db:"id"` // id + TaskId int64 `db:"task_id"` // 任务id + Name string `db:"name"` // 虚拟机名称 + AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id + AdapterName string `db:"adapter_name"` // 适配器名称 + ClusterId int64 `db:"cluster_id"` // 执行任务的集群id + ClusterName string `db:"cluster_name"` // 集群名称 + FlavorRef string `db:"flavor_ref"` // 规格索引 + ImageRef string `db:"image_ref"` // 镜像索引 + Status string `db:"status"` // 状态 + Platform string `db:"platform"` // 平台 + Description string `db:"description"` // 描述 AvailabilityZone string `db:"availability_zone"` MinCount int64 `db:"min_count"` // 数量 Uuid string `db:"uuid"` // 网络id @@ -93,14 +92,14 @@ func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, er } func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) { - query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) - ret, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) + query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) + ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) return ret, 
err } func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error { query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder) - _, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) + _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) return err } From 8660032d12a20e0dbb3c20c73dfd874778d5cf56 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 15:09:38 +0800 Subject: [PATCH 36/40] =?UTF-8?q?fix=EF=BC=9Aupdate=20vm=20network?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: ae5fdf0aa190fb666b64f97f7e77d9abb47bbe25 --- api/internal/logic/core/commitvmtasklogic.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/internal/logic/core/commitvmtasklogic.go b/api/internal/logic/core/commitvmtasklogic.go index 283e9bc9..580dbf38 100644 --- a/api/internal/logic/core/commitvmtasklogic.go +++ b/api/internal/logic/core/commitvmtasklogic.go @@ -89,7 +89,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type Name: req.Name, CommitTime: time.Now(), Description: "vm task", - AdapterTypeDict: 0, + AdapterTypeDict: 3, SynergyStatus: synergyStatus, Strategy: strategy, } From 3eb71a7bf592fa2de21222bdd25e1da6f4c6eb4c Mon Sep 17 00:00:00 2001 From: zhouqunjie Date: Sat, 11 May 2024 15:15:46 +0800 Subject: [PATCH 37/40] multiple adapters for hps task submit Former-commit-id: 
27cbca8bf1eec9807636b541ed1c68741e85e043 --- api/desc/hpc/pcm-hpc.api | 2 +- api/internal/logic/hpc/commithpctasklogic.go | 12 ++++++------ api/internal/types/types.go | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/desc/hpc/pcm-hpc.api b/api/desc/hpc/pcm-hpc.api index 8584895c..5a798e19 100644 --- a/api/desc/hpc/pcm-hpc.api +++ b/api/desc/hpc/pcm-hpc.api @@ -14,7 +14,7 @@ type ( Description string `json:"description,optional"` TenantId int64 `json:"tenantId,optional"` TaskId int64 `json:"taskId,optional"` - AdapterId string `json:"adapterId,optional"` + AdapterIds []string `json:"adapterId"` MatchLabels map[string]string `json:"matchLabels,optional"` CardCount int64 `json:"cardCount,optional"` WorkDir string `json:"workDir,optional"` //paratera:workingDir diff --git a/api/internal/logic/hpc/commithpctasklogic.go b/api/internal/logic/hpc/commithpctasklogic.go index 994f3f91..e716b2c0 100644 --- a/api/internal/logic/hpc/commithpctasklogic.go +++ b/api/internal/logic/hpc/commithpctasklogic.go @@ -7,7 +7,6 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "k8s.io/apimachinery/pkg/util/json" "math/rand" - "strconv" "time" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" @@ -50,14 +49,15 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t } var clusterIds []int64 - l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) + l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id in ? 
and label = ?", req.AdapterIds, req.ClusterType).Scan(&clusterIds) - adapterId, _ := strconv.ParseInt(req.AdapterId, 10, 64) - var adapterName string - l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", req.AdapterId).Scan(&adapterName) - clusterId := clusterIds[rand.Intn(len(clusterIds))] var clusterName string + var adapterId int64 + var adapterName string + clusterId := clusterIds[rand.Intn(len(clusterIds))] l.svcCtx.DbEngin.Raw("SELECT nickname FROM `t_cluster` where id = ?", clusterId).Scan(&clusterName) + l.svcCtx.DbEngin.Raw("SELECT adapter_id FROM `t_cluster` where id = ?", clusterId).Scan(&adapterId) + l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", adapterId).Scan(&adapterName) env, _ := json.Marshal(req.Environment) diff --git a/api/internal/types/types.go b/api/internal/types/types.go index 61596427..fa258e8a 100644 --- a/api/internal/types/types.go +++ b/api/internal/types/types.go @@ -1164,7 +1164,7 @@ type CommitHpcTaskReq struct { Description string `json:"description,optional"` TenantId int64 `json:"tenantId,optional"` TaskId int64 `json:"taskId,optional"` - AdapterId string `json:"adapterId,optional"` + AdapterIds []string `json:"adapterIds"` MatchLabels map[string]string `json:"matchLabels,optional"` CardCount int64 `json:"cardCount,optional"` WorkDir string `json:"workDir,optional"` //paratera:workingDir From 91c38d66fd00c9bbfdfb63c6f7cafff8cae01dce Mon Sep 17 00:00:00 2001 From: zhouqunjie Date: Sat, 11 May 2024 15:42:52 +0800 Subject: [PATCH 38/40] fix bug Former-commit-id: a32dab848d0e2055a62190de1461d4ba7be83ca5 --- api/internal/logic/hpc/commithpctasklogic.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/api/internal/logic/hpc/commithpctasklogic.go b/api/internal/logic/hpc/commithpctasklogic.go index e716b2c0..374d7807 100644 --- a/api/internal/logic/hpc/commithpctasklogic.go +++ b/api/internal/logic/hpc/commithpctasklogic.go @@ -50,6 +50,11 @@ func (l *CommitHpcTaskLogic) 
CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t var clusterIds []int64 l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id in ? and label = ?", req.AdapterIds, req.ClusterType).Scan(&clusterIds) + if len(clusterIds) == 0 || clusterIds == nil { + resp.Code = 400 + resp.Msg = "no cluster found" + return resp, nil + } var clusterName string var adapterId int64 @@ -61,12 +66,6 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t env, _ := json.Marshal(req.Environment) - if len(clusterIds) == 0 || clusterIds == nil { - resp.Code = 400 - resp.Msg = "no cluster found" - return resp, nil - } - hpcInfo := models.TaskHpc{ TaskId: taskModel.Id, AdapterId: uint(adapterId), From 9722801813f1269f22d21dc1fabf18f9e8891b29 Mon Sep 17 00:00:00 2001 From: jagger Date: Sat, 11 May 2024 16:18:09 +0800 Subject: [PATCH 39/40] fix bug Signed-off-by: jagger Former-commit-id: 1ef360be0c89fdf57fe54497296380fbe068cb95 --- .gitignore | 1 - Makefile | 20 +++-------- api/Dockerfile | 35 +++++++------------- api/internal/logic/core/pushtaskinfologic.go | 2 +- rpc/Dockerfile | 34 +++++++------------ 5 files changed, 29 insertions(+), 63 deletions(-) diff --git a/.gitignore b/.gitignore index 47d6e36c..94eaa49f 100644 --- a/.gitignore +++ b/.gitignore @@ -27,5 +27,4 @@ buf.lock configs/tenanter.yaml log/ -/go_build_gitlink_org_cn_JCCE_PCM /cache/ diff --git a/Makefile b/Makefile index bbbb7e0c..1d87b2d5 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,9 @@ pcm-core-api: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-core-api adaptor/PCM-CORE/api/pcm.go + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-core-api api/pcm.go pcm-core-rpc: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-core-rpc adaptor/PCM-CORE/rpc/pcmcore.go + CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-core-rpc rpc/pcmcore.go -pcm-ac-rpc: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-ac adaptor/PCM-HPC/PCM-AC/rpc/hpcac.go +all-build: 
pcm-core-rpc pcm-core-api -pcm-kubenative-rpc: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-kubenative adaptor/PCM-K8S/PCM-K8S-NATIVE/rpc/pcmkubenative.go - -pcm-modelarts-rpc: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-modelarts adaptor/PCM-AI/PCM-MODELARTS/rpc/pcmmodelarts.go - -pcm-ceph-rpc: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o pcm-ceph adaptor/PCM-STORAGE/PCM-CEPH/rpc/pcmceph.go - -all-build: pcm-core-rpc pcm-core-api pcm-ac-rpc pcm-kubenative-rpc pcm-modelarts-rpc pcm-ceph-rpc - -.PHONY: pcm-core-rpc pcm-core-api pcm-ac-rpc pcm-kubenative-rpc pcm-modelarts-rpc pcm-ceph-rpc \ No newline at end of file +.PHONY: pcm-core-rpc pcm-core-api \ No newline at end of file diff --git a/api/Dockerfile b/api/Dockerfile index 11153a75..a4372ef0 100644 --- a/api/Dockerfile +++ b/api/Dockerfile @@ -1,40 +1,29 @@ FROM golang:1.21.2-alpine3.18 AS builder -LABEL stage=gobuilder - -ENV CGO_ENABLED 0 -ENV GOARCH amd64 -ENV GOPROXY https://goproxy.cn,direct - -RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.sjtug.sjtu.edu.cn/g' /etc/apk/repositories && \ - apk update --no-cache && apk add --no-cache tzdata - WORKDIR /app -ADD go.mod . -ADD go.sum . -RUN go mod download COPY . . -COPY api/etc/ /app/ + +RUN go env -w GO111MODULE=on \ +&& go env -w GOPROXY=https://goproxy.cn,direct \ +&& go env -w CGO_ENABLED=0 \ +&& go mod download RUN go build -o pcm-coordinator-api /app/api/pcm.go +FROM alpine:3.18 -FROM alpine:3.16.2 WORKDIR /app - #修改alpine源为上海交通大学 RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.sjtug.sjtu.edu.cn/g' /etc/apk/repositories && \ - apk update && \ - apk upgrade && \ - apk add --no-cache ca-certificates && update-ca-certificates && \ - apk add --update tzdata && \ - rm -rf /var/cache/apk/* + apk add --no-cache ca-certificates tzdata && \ + update-ca-certificates && \ + rm -rf /var/cache/apk/* -COPY --from=builder /app/pcm-coordinator-api . -COPY api/etc/pcm.yaml . 
+COPY --from=builder /app/pcm-coordinator-api /app/ +COPY --from=builder /app/api/etc/pcm.yaml /app/ ENV TZ=Asia/Shanghai EXPOSE 8999 -ENTRYPOINT ./pcm-coordinator-api -f pcm.yaml \ No newline at end of file +ENTRYPOINT ["./pcm-coordinator-api", "-f", "pcm.yaml"] \ No newline at end of file diff --git a/api/internal/logic/core/pushtaskinfologic.go b/api/internal/logic/core/pushtaskinfologic.go index ff5a102f..63169a86 100644 --- a/api/internal/logic/core/pushtaskinfologic.go +++ b/api/internal/logic/core/pushtaskinfologic.go @@ -106,7 +106,7 @@ func syncTask(gorm *gorm.DB, noticeInfo clientCore.NoticeInfo) { if tx.Error != nil { logx.Error(tx.Error) } - + allStatus = strings.ToUpper(allStatus) for pcmStatus, ProviderStatus := range clientCore.StatusMapping { for _, originalStatus := range ProviderStatus { // if Failed type status appears in subTask then update mainTask to Failed diff --git a/rpc/Dockerfile b/rpc/Dockerfile index 9c211753..fb0fb43a 100644 --- a/rpc/Dockerfile +++ b/rpc/Dockerfile @@ -1,37 +1,27 @@ FROM golang:1.21.2-alpine3.18 AS builder -LABEL stage=gobuilder - -ENV CGO_ENABLED 0 -ENV GOARCH amd64 -ENV GOPROXY https://goproxy.cn,direct - -RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.sjtug.sjtu.edu.cn/g' /etc/apk/repositories && \ - apk update --no-cache && apk add --no-cache tzdata - WORKDIR /app -ADD go.mod . -ADD go.sum . -RUN go mod download COPY . . 
-COPY rpc/etc/ /app/ + +RUN go env -w GO111MODULE=on \ +&& go env -w GOPROXY=https://goproxy.cn,direct \ +&& go env -w CGO_ENABLED=0 \ +&& go mod download RUN go build -o pcm-coordinator-rpc /app/rpc/pcmcore.go -FROM alpine:3.16.2 +FROM alpine:3.18 + WORKDIR /app - #修改alpine源为上海交通大学 RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.sjtug.sjtu.edu.cn/g' /etc/apk/repositories && \ - apk update && \ - apk upgrade && \ - apk add --no-cache ca-certificates && update-ca-certificates && \ - apk add --update tzdata && \ - rm -rf /var/cache/apk/* + apk add --no-cache ca-certificates tzdata && \ + update-ca-certificates && \ + rm -rf /var/cache/apk/* -COPY --from=builder /app/pcm-coordinator-rpc . -COPY rpc/etc/pcmcore.yaml . +COPY --from=builder /app/pcm-coordinator-api /app/ +COPY --from=builder /app/api/etc/pcm.yaml /app/ ENV TZ=Asia/Shanghai From e07106841fec4bdae083bb48a2455ffa18788a8a Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Sat, 11 May 2024 16:31:42 +0800 Subject: [PATCH 40/40] fix:update create vm task Former-commit-id: f8c57df90c9d6c9992712433cc854b4b1a89da98 --- api/internal/logic/core/commitvmtasklogic.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/api/internal/logic/core/commitvmtasklogic.go b/api/internal/logic/core/commitvmtasklogic.go index 580dbf38..b3623fb3 100644 --- a/api/internal/logic/core/commitvmtasklogic.go +++ b/api/internal/logic/core/commitvmtasklogic.go @@ -161,23 +161,18 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type Incident: "任务创建中", CreatedTime: time.Now(), } - - //db := tx.Table("task").Create(&taskModel) - db := tx.Table("task_vm").Create(&taskVm) - db = tx.Table("t_notice").Create(&noticeInfo) - if db.Error != nil { - logx.Errorf("Task creation failure, err: %v", db.Error) - } - //db = tx.Table("t_notice").Create(&noticeInfo) // Save task data to database + tf := l.svcCtx.DbEngin.Create(&taskModel) if tf.Error != nil { return nil, tf.Error } - //tn
:= l.svcCtx.DbEngin.Create(&noticeInfo) - //if tn.Error != nil { - // return nil, tn.Error - //} + result := l.svcCtx.DbEngin.Table("task_vm").Create(&taskVm) + result = l.svcCtx.DbEngin.Table("t_notice").Create(&noticeInfo) + if result.Error != nil { + logx.Errorf("Task creation failure, err: %v", result.Error) + } + resp.Code = 200 resp.Msg = "Success" return resp, nil