Merge remote-tracking branch 'upstream/master' into upmaster_wq

# Conflicts:
#	pkg/models/taskvmmodel_gen.go


Former-commit-id: f857097a067622235adfccfdc5631f8efc3d2062
This commit is contained in:
qiwang 2024-05-11 09:18:02 +08:00
commit f01a0c6354
33 changed files with 775 additions and 142 deletions

View File

@ -1,4 +1,5 @@
![PCM](https://www.gitlink.org.cn/images/avatars/Organization/123822?t=1689062058) <img src="https://www.gitlink.org.cn/images/avatars/Organization/123822?t=1689062058" alt="PCM" style="float:center" />
<p align="center"> <p align="center">
<a href="https://www.gitlink.org.cn/JointCloud/pcm-coordinator/tree/master/docs">Docs</a> | <a href="https://www.gitlink.org.cn/JointCloud/pcm-coordinator/tree/master/docs">Docs</a> |
<a href="https://www.gitlink.org.cn/JointCloud/pcm-coordinator/tree/master/docs">简体中文</a> | <a href="https://www.gitlink.org.cn/JointCloud/pcm-coordinator/tree/master/docs">简体中文</a> |
@ -7,35 +8,32 @@
## What is Jointcloud and PCM ## What is Jointcloud and PCM
&emsp;&emsp;Jointcloud Cooperation Environment (JCCE) comprises Distributed Cloud Trading, Distributed Cloud Community, Distributed Cloud Supervision, and a Blockchain-based Distributed Accounting System. JCCE, with its supporting technologies related to information and value exchange, facilitates breaking the information asymmetry among participants in cloud collaboration. It creates a win-win opportunity for all involved parties and provides robust support for innovative business models in the cloud collaboration computing environment. Jointcloud Cooperation Environment (JCCE) comprises Distributed Cloud Trading, Distributed Cloud Community, Distributed Cloud Supervision, and a Blockchain-based Distributed Accounting System. JCCE, with its supporting technologies related to information and value exchange, facilitates breaking the information asymmetry among participants in cloud collaboration. It creates a win-win opportunity for all involved parties and provides robust support for innovative business models in the cloud collaboration computing environment.
&emsp;&emsp;The vision of the Peer Cooperation Mechanism (PCM) is to build a set of collaboration mechanisms in the cloud environment where all service providers and demand-side entities can autonomously participate, achieve peer-to-peer transactions, and establish efficient connections and invocations in a non-intrusive manner. PCM focuses more on collaboration among clouds rather than individual clouds themselves. The vision of the Peer Cooperation Mechanism (PCM) is to build a set of collaboration mechanisms in the cloud environment where all service providers and demand-side entities can autonomously participate, achieve peer-to-peer transactions, and establish efficient connections and invocations in a non-intrusive manner. PCM focuses more on collaboration among clouds rather than individual clouds themselves.
&emsp;&emsp;PCM is built upon a standardized software-defined framework. Cloud service providers with physical cloud resources can autonomously define PCM rule frameworks. Other entities adhering to these rule frameworks can then participate in collaborative transactions. Subsequently, tailored virtual private clouds can be created for domain-specific applications, meeting the diverse requirements of various cloud participants, such as central, edge, and terminal infrastructure, enabling them to directly participate in collaboration and transactions. PCM is built upon a standardized software-defined framework. Cloud service providers with physical cloud resources can autonomously define PCM rule frameworks. Other entities adhering to these rule frameworks can then participate in collaborative transactions. Subsequently, tailored virtual private clouds can be created for domain-specific applications, meeting the diverse requirements of various cloud participants, such as central, edge, and terminal infrastructure, enabling them to directly participate in collaboration and transactions.
## Real-world Issues Addressed by PCM: ## Real-world Issues Addressed by PCM:
- **Performance Improvement**: - **Performance Improvement**
Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency.
Developers in the cloud collaboration environment experience performance enhancement upon joining the PCM framework. They can generate code for the internal functionality structure and most specification definitions using descriptive language, allowing them to focus on business logic development without concerning themselves with underlying management functions. The framework offers features such as microservices management, multi-language code generation, and model bridging, reducing coding workload and entry barriers while improving efficiency.
- **Platform Lock-in Resolution**: - **Platform Lock-in Resolution**
The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in.
The PCM effectively addresses platform lock-in issues through multi-cloud adaptation, standardized interfaces, and abstraction layers. This enables cross-platform operations, deployment, and interaction. Standardized interfaces simplify the work for developers, lowering the risk of platform lock-in.
- **Reduced Code Development Complexity**: - **Reduced Code Development Complexity**
The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment.
The PCM development framework lowers the complexity of development by providing structured development patterns, ready-made components, and documentation support. Developers of different skill levels can collaborate more fairly. The framework can save approximately 50% of development time, with junior programmers completing tasks that originally required one person-month in about 15 person-days. Features such as automatic generation tools, code templates, and component reuse allow developers to focus more on business logic implementation. There is no need for additional training of advanced developers, saving time, and reducing labor costs while improving the return on investment.
- **Reduced Code Development Workload**: - **Reduced Code Development Workload**
   The PCM framework offers automation features and code generation tools, reducing manual code writing and improving development speed and code quality. With the framework's auto-generation tools, over half of the code is generated, achieving a low code rate of approximately 63%. Developers primarily work on writing descriptive language files to generate the basic structure and then adjust and write logic functions. This enables teams to deliver products more quickly, iterate based on business optimization and user feedback, and be more agile. The PCM framework offers automation features and code generation tools, reducing manual code writing and improving development speed and code quality. With the framework's auto-generation tools, over half of the code is generated, achieving a low code rate of approximately 63%. Developers primarily work on writing descriptive language files to generate the basic structure and then adjust and write logic functions. This enables teams to deliver products more quickly, iterate based on business optimization and user feedback, and be more agile.
## Architecture ## Architecture
&emsp;&emsp;The Coordinator is the core component of the framework, providing heterogeneous abstraction for different Participant-side technology stacks. The framework, oriented towards the user side, primarily involves two major functionalities: resource changes (task submission) and centralized display. After task submission, PCM achieves dynamic task flow through a scheduling center deployed on cloud computing, intelligent computing, and supercomputing clouds. In the centralized display section, PCM mainly collects and aggregates information about resources and tasks from multiple Participant services through Tracker and actively reported by the Participant side. It provides users with a standardized unified interface. Users can view the information provided by PCM on the frontend page or directly access data through the interface. The Coordinator is the core component of the framework, providing heterogeneous abstraction for different Participant-side technology stacks. The framework, oriented towards the user side, primarily involves two major functionalities: resource changes (task submission) and centralized display. After task submission, PCM achieves dynamic task flow through a scheduling center deployed on cloud computing, intelligent computing, and supercomputing clouds. In the centralized display section, PCM mainly collects and aggregates information about resources and tasks from multiple Participant services through Tracker and actively reported by the Participant side. It provides users with a standardized unified interface. Users can view the information provided by PCM on the frontend page or directly access data through the interface.
![PCM架构](docs/images/arch-eng.png) ![PCM架构](docs/images/arch-eng.png)
## PCM deploy ## PCM deploy
@ -43,13 +41,13 @@
The development environment for PCM requires the installation of Go version 1.18 or above. Please refer to the following instructions to locally start the Kubernetes-related services. The development environment for PCM requires the installation of Go version 1.18 or above. Please refer to the following instructions to locally start the Kubernetes-related services.
The command to fetch the project is as follows: The command to fetch the project is as follows:
``` ```bash
git clone https://gitlink.org.cn/JointCloud/pcm-coordinator.git git clone https://gitlink.org.cn/JointCloud/pcm-coordinator.git
``` ```
&emsp;&emsp;After executing the following command, the Kubernetes RPC service will be launched locally. For its specific functionalities, please refer to the description in the architecture design mentioned earlier. After executing the following command, the Kubernetes RPC service will be launched locally. For its specific functionalities, please refer to the description in the architecture design mentioned earlier.
``` ``` bash
# get required packages # get required packages
go mod tidy go mod tidy
@ -57,8 +55,8 @@ go mod tidy
go run pcm-coordinator/rpc/pcmcore.go go run pcm-coordinator/rpc/pcmcore.go
``` ```
#### coordinator-api #### coordinator-api
&emsp;&emsp;The template for the configuration content of the API service is as follows, where the config information for each Participant (P-side) can be configured as needed based on the actual situation. The template for the configuration content of the API service is as follows, where the config information for each Participant (P-side) can be configured as needed based on the actual situation.
``` ``` bash
Name: pcm.core.api Name: pcm.core.api
Host: 0.0.0.0 Host: 0.0.0.0
Port: 8999 Port: 8999
@ -78,8 +76,8 @@ THRpcConf:
ModelArtsRpcConf: ModelArtsRpcConf:
Endpoints: - 127.0.0.1:2002NonBlock: true Endpoints: - 127.0.0.1:2002NonBlock: true
``` ```
&emsp;&emsp;After executing the following command, the Kubernetes API service will be launched locally. Once the service is started, users can make HTTP requests to its interfaces for various functional calls. After executing the following command, the Kubernetes API service will be launched locally. Once the service is started, users can make HTTP requests to its interfaces for various functional calls.
``` ``` bash
# get required packages # get required packages
go mod tidy go mod tidy
@ -89,10 +87,10 @@ go run pcm-coordinator/api/pcm.go
## Upcoming Plans ## Upcoming Plans
- Pluginization of Scheduling Algorithms and Definition of Basic Resource Templates - Pluginization of Scheduling Algorithms and Definition of Basic Resource Templates.
- Fundamental Definition of Resource Operation Types - Fundamental Definition of Resource Operation Types.
- Rapid Development Mode - Rapid Development Mode.
- Implementation of First-level Scheduling - Implementation of First-level Scheduling.
## JoinContribute ## JoinContribute
&emsp;&emsp;We look forward to your opinions and contributions. Welcome all friends to provide corrections and improvements to the project, collectively building an efficient and stable cloud collaboration mechanism. We look forward to your opinions and contributions. Welcome all friends to provide corrections and improvements to the project, collectively building an efficient and stable cloud collaboration mechanism.

View File

@ -5,6 +5,12 @@ import (
"time" "time"
) )
var StatusMapping = map[string][]string{
"Running": {"RUNNING", "RUNNING", "CONFIGURING", "COMPLETING"},
"Succeeded": {"COMPLETED"},
"Failed": {"FAILED", "TIMEOUT", "DEADLINE", "OUT_OF_MEMORY", "BOOT_FAIL", "CANCELLED"},
}
type PullTaskInfoReq struct { type PullTaskInfoReq struct {
AdapterId int64 `form:"adapterId"` AdapterId int64 `form:"adapterId"`
} }
@ -44,6 +50,7 @@ type NoticeInfo struct {
ClusterId int64 `json:"clusterId"` ClusterId int64 `json:"clusterId"`
ClusterName string `json:"clusterName"` ClusterName string `json:"clusterName"`
NoticeType string `json:"noticeType"` NoticeType string `json:"noticeType"`
TaskId int64 `json:"taskId"`
TaskName string `json:"taskName"` TaskName string `json:"taskName"`
Incident string `json:"incident"` Incident string `json:"incident"`
CreatedTime time.Time `json:"createdTime"` CreatedTime time.Time `json:"createdTime"`
@ -72,7 +79,9 @@ type HpcInfo struct {
TaskId int64 `json:"task_id"` // 任务id TaskId int64 `json:"task_id"` // 任务id
JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id) JobId string `json:"job_id"` // 作业id(在第三方系统中的作业id)
AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id AdapterId int64 `json:"adapter_id"` // 执行任务的适配器id
AdapterName string `json:"adapterName,omitempty,optional"`
ClusterId int64 `json:"cluster_id"` // 执行任务的集群id ClusterId int64 `json:"cluster_id"` // 执行任务的集群id
ClusterName string `json:"clusterName,omitempty,optional"`
ClusterType string `json:"cluster_type"` // 执行任务的集群类型 ClusterType string `json:"cluster_type"` // 执行任务的集群类型
Name string `json:"name"` // 名称 Name string `json:"name"` // 名称
Status string `json:"status"` // 状态 Status string `json:"status"` // 状态
@ -113,8 +122,9 @@ type HpcInfo struct {
type CloudInfo struct { type CloudInfo struct {
Id uint `json:"id,omitempty,optional"` Id uint `json:"id,omitempty,optional"`
TaskId int64 `json:"taskId,omitempty,optional"` TaskId int64 `json:"taskId,omitempty,optional"`
AdapterId uint `json:"adapterId,omitempty,optional"` AdapterId int64 `json:"adapterId,omitempty,optional"`
ClusterId uint `json:"clusterId,omitempty,optional"` AdapterName string `json:"adapterName,omitempty,optional"`
ClusterId int64 `json:"clusterId,omitempty,optional"`
ClusterName string `json:"clusterName,omitempty,optional"` ClusterName string `json:"clusterName,omitempty,optional"`
Kind string `json:"kind,omitempty,optional"` Kind string `json:"kind,omitempty,optional"`
Status string `json:"status,omitempty,optional"` Status string `json:"status,omitempty,optional"`
@ -125,9 +135,12 @@ type CloudInfo struct {
} }
type AiInfo struct { type AiInfo struct {
ParticipantId int64 `json:"participantId,omitempty"`
TaskId int64 `json:"taskId,omitempty"` TaskId int64 `json:"taskId,omitempty"`
ProjectId string `json:"project_id,omitempty"` ProjectId string `json:"project_id,omitempty"`
AdapterId int64 `json:"adapterId,omitempty,optional"`
AdapterName string `json:"adapterName,omitempty,optional"`
ClusterId int64 `json:"clusterId,omitempty,optional"`
ClusterName string `json:"clusterName,omitempty,optional"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
Status string `json:"status,omitempty"` Status string `json:"status,omitempty"`
StartTime string `json:"startTime,omitempty"` StartTime string `json:"startTime,omitempty"`
@ -143,9 +156,12 @@ type AiInfo struct {
} }
type VmInfo struct { type VmInfo struct {
ParticipantId int64 `json:"participantId,omitempty"`
TaskId int64 `json:"taskId,omitempty"` TaskId int64 `json:"taskId,omitempty"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
AdapterId int64 `json:"adapterId,omitempty,optional"`
AdapterName string `json:"adapterName,omitempty,optional"`
ClusterId int64 `json:"clusterId,omitempty,optional"`
ClusterName string `json:"clusterName,omitempty,optional"`
FlavorRef string `json:"flavor_ref,omitempty"` FlavorRef string `json:"flavor_ref,omitempty"`
ImageRef string `json:"image_ref,omitempty"` ImageRef string `json:"image_ref,omitempty"`
NetworkUuid string `json:"network_uuid,omitempty"` NetworkUuid string `json:"network_uuid,omitempty"`

View File

@ -166,6 +166,18 @@ type (
ReqBody []string `json:"reqBody"` ReqBody []string `json:"reqBody"`
Replicas int64 `json:"replicas,string"` Replicas int64 `json:"replicas,string"`
} }
PodLogsReq {
TaskId string `json:"taskId"`
TaskName string `json:"taskName"`
ClusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`
AdapterId string `json:"adapterId"`
AdapterName string `json:"adapterName"`
PodName string `json:"podName"`
stream bool `json:"stream"`
}
) )
type deleteTaskReq { type deleteTaskReq {

View File

@ -231,6 +231,9 @@ service pcm {
@doc "Create cloud computing common tasks" @doc "Create cloud computing common tasks"
@handler commitGeneralTask @handler commitGeneralTask
post /cloud/task/create (GeneralTaskReq) returns () post /cloud/task/create (GeneralTaskReq) returns ()
@handler podLogs
post /cloud/pod/logs (PodLogsReq) returns (string)
} }
//智算二级接口 //智算二级接口
@ -966,6 +969,15 @@ service pcm {
@handler ScheduleGetOverviewHandler @handler ScheduleGetOverviewHandler
post /schedule/getOverview returns (ScheduleOverviewResp) post /schedule/getOverview returns (ScheduleOverviewResp)
@handler DownloadAlgothmCodeHandler
get /schedule/downloadAlgorithmCode (DownloadAlgorithmCodeReq) returns (DownloadAlgorithmCodeResp)
@handler UploadAlgothmCodeHandler
post /schedule/uploadAlgorithmCode (UploadAlgorithmCodeReq) returns (UploadAlgorithmCodeResp)
@handler GetComputeCardsByClusterHandler
get /schedule/getComputeCardsByCluster/:adapterId/:clusterId (GetComputeCardsByClusterReq) returns (GetComputeCardsByClusterResp)
} }
@server( @server(

View File

@ -100,4 +100,41 @@ type (
StartTime string `json:"startTime,omitempty" db:"start_time"` StartTime string `json:"startTime,omitempty" db:"start_time"`
EndTime string `json:"endTime,omitempty" db:"end_time"` EndTime string `json:"endTime,omitempty" db:"end_time"`
} }
DownloadAlgorithmCodeReq {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
ResourceType string `form:"resourceType"`
Card string `form:"card"`
TaskType string `form:"taskType"`
Dataset string `form:"dataset"`
Algorithm string `form:"algorithm"`
}
DownloadAlgorithmCodeResp {
Code string `json:"code"`
}
UploadAlgorithmCodeReq {
AdapterId string `json:"adapterId"`
ClusterId string `json:"clusterId"`
ResourceType string `json:"resourceType"`
Card string `json:"card"`
TaskType string `json:"taskType"`
Dataset string `json:"dataset"`
Algorithm string `json:"algorithm"`
Code string `json:"code"`
}
UploadAlgorithmCodeResp {
}
GetComputeCardsByClusterReq {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
}
GetComputeCardsByClusterResp {
Cards []string `json:"cards"`
}
) )

View File

@ -0,0 +1,24 @@
package cloud
import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/cloud"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
func PodLogsHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.PodLogsReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := cloud.NewPodLogsLogic(r.Context(), svcCtx, w)
resp, err := l.PodLogs(&req, w)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -277,6 +277,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/cloud/task/create", Path: "/cloud/task/create",
Handler: cloud.CommitGeneralTaskHandler(serverCtx), Handler: cloud.CommitGeneralTaskHandler(serverCtx),
}, },
{
Method: http.MethodPost,
Path: "/cloud/pod/logs",
Handler: cloud.PodLogsHandler(serverCtx),
},
}, },
rest.WithPrefix("/pcm/v1"), rest.WithPrefix("/pcm/v1"),
) )
@ -1205,6 +1210,21 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/schedule/getOverview", Path: "/schedule/getOverview",
Handler: schedule.ScheduleGetOverviewHandler(serverCtx), Handler: schedule.ScheduleGetOverviewHandler(serverCtx),
}, },
{
Method: http.MethodGet,
Path: "/schedule/downloadAlgorithmCode",
Handler: schedule.DownloadAlgothmCodeHandler(serverCtx),
},
{
Method: http.MethodPost,
Path: "/schedule/uploadAlgorithmCode",
Handler: schedule.UploadAlgothmCodeHandler(serverCtx),
},
{
Method: http.MethodGet,
Path: "/schedule/getComputeCardsByCluster/:adapterId/:clusterId",
Handler: schedule.GetComputeCardsByClusterHandler(serverCtx),
},
}, },
rest.WithPrefix("/pcm/v1"), rest.WithPrefix("/pcm/v1"),
) )

View File

@ -0,0 +1,25 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
)
func DownloadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.DownloadAlgorithmCodeReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := schedule.NewDownloadAlgothmCodeLogic(r.Context(), svcCtx)
resp, err := l.DownloadAlgorithmCode(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -0,0 +1,28 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
)
func GetComputeCardsByClusterHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.GetComputeCardsByClusterReq
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
return
}
l := schedule.NewGetComputeCardsByClusterLogic(r.Context(), svcCtx)
resp, err := l.GetComputeCardsByCluster(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
}
}

View File

@ -0,0 +1,25 @@
package schedule
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/schedule"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
)
func UploadAlgothmCodeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.UploadAlgorithmCodeReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := schedule.NewUploadAlgothmCodeLogic(r.Context(), svcCtx)
resp, err := l.UploadAlgorithmCode(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -8,6 +8,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"strconv" "strconv"
"sync" "sync"
"time"
) )
type GetCenterOverviewLogic struct { type GetCenterOverviewLogic struct {
@ -71,9 +72,14 @@ func (l *GetCenterOverviewLogic) GetCenterOverview() (resp *types.CenterOverview
} }
resp.CardNum = cardNum resp.CardNum = cardNum
resp.PowerInTops = totalTops resp.PowerInTops = totalTops
<-ch
select {
case _ = <-ch:
return resp, nil return resp, nil
case <-time.After(2 * time.Second):
return resp, nil
}
} }
func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {

View File

@ -73,9 +73,14 @@ func (l *GetCenterTaskListLogic) GetCenterTaskList() (resp *types.CenterTaskList
resp.List = append(resp.List, t) resp.List = append(resp.List, t)
} }
} }
<-ch
select {
case _ = <-ch:
return resp, nil return resp, nil
case <-time.After(2 * time.Second):
return resp, nil
}
} }
func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) { func (l *GetCenterTaskListLogic) updateAiTaskStatus(mu *sync.RWMutex, ch chan<- struct{}, list []*types.AdapterInfo) {

View File

@ -99,6 +99,8 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
Strategy: strategy, Strategy: strategy,
} }
var taskClouds []cloud.TaskCloudModel var taskClouds []cloud.TaskCloudModel
adapterName := ""
tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName)
for _, r := range rs { for _, r := range rs {
for _, s := range req.ReqBody { for _, s := range req.ReqBody {
sStruct := UnMarshalK8sStruct(s, int64(r.Replica)) sStruct := UnMarshalK8sStruct(s, int64(r.Replica))
@ -107,6 +109,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
taskCloud.TaskId = uint(taskModel.Id) taskCloud.TaskId = uint(taskModel.Id)
clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64) clusterId, _ := strconv.ParseUint(r.ClusterId, 10, 64)
taskCloud.AdapterId = uint(adapterId) taskCloud.AdapterId = uint(adapterId)
taskCloud.AdapterName = adapterName
taskCloud.ClusterId = uint(clusterId) taskCloud.ClusterId = uint(clusterId)
taskCloud.ClusterName = r.ClusterName taskCloud.ClusterName = r.ClusterName
taskCloud.Status = constants.Saved taskCloud.Status = constants.Saved
@ -116,8 +119,6 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
taskClouds = append(taskClouds, taskCloud) taskClouds = append(taskClouds, taskCloud)
} }
} }
adapterName := ""
tx.Table("t_adapter").Select("name").Where("id=?", adapterId).Find(&adapterName)
noticeInfo := clientCore.NoticeInfo{ noticeInfo := clientCore.NoticeInfo{
AdapterId: int64(adapterId), AdapterId: int64(adapterId),
AdapterName: adapterName, AdapterName: adapterName,

View File

@ -0,0 +1,30 @@
package cloud
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"net/http"
)
type PodLogsLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
w http.ResponseWriter
}
func NewPodLogsLogic(ctx context.Context, svcCtx *svc.ServiceContext, w http.ResponseWriter) *PodLogsLogic {
return &PodLogsLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
w: w,
}
}
func (l *PodLogsLogic) PodLogs(req *types.PodLogsReq, w http.ResponseWriter) (resp string, err error) {
// todo: add your logic here and delete this line
return
}

View File

@ -2,13 +2,12 @@ package core
import ( import (
"context" "context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
"time" "time"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
) )
@ -51,15 +50,15 @@ func (l *PageListTaskLogic) PageListTask(req *types.PageTaskReq) (resp *types.Pa
return nil, result.NewDefaultError(err.Error()) return nil, result.NewDefaultError(err.Error())
} }
for _, model := range list { for _, model := range list {
if model.EndTime != "" && model.StartTime != "" { if model.StartTime != "" && model.EndTime == "" {
startTime := timeutils.TimeStringToGoTime(model.StartTime)
model.RunningTime = int64(time.Now().Sub(startTime).Seconds())
}
if model.StartTime != "" && model.EndTime != "" {
startTime := timeutils.TimeStringToGoTime(model.StartTime) startTime := timeutils.TimeStringToGoTime(model.StartTime)
endTime := timeutils.TimeStringToGoTime(model.EndTime) endTime := timeutils.TimeStringToGoTime(model.EndTime)
model.RunningTime = int64(endTime.Sub(startTime).Seconds()) model.RunningTime = int64(endTime.Sub(startTime).Seconds())
} }
if model.StartTime != "" {
startTime := timeutils.TimeStringToGoTime(model.StartTime)
model.RunningTime = int64(time.Now().Sub(startTime).Seconds())
}
} }
resp.List = &list resp.List = &list
resp.PageSize = req.PageSize resp.PageSize = req.PageSize

View File

@ -41,52 +41,148 @@ func (l *PushTaskInfoLogic) PushTaskInfo(req *clientCore.PushTaskInfoReq) (*clie
} }
l.svcCtx.DbEngin.Exec("update task_cloud set status = ?,start_time = ?,result = ? where task_id = ?", l.svcCtx.DbEngin.Exec("update task_cloud set status = ?,start_time = ?,result = ? where task_id = ?",
cloudInfo.Status, cloudInfo.StartTime, cloudInfo.Result, cloudInfo.TaskId) cloudInfo.Status, cloudInfo.StartTime, cloudInfo.Result, cloudInfo.TaskId)
syncTask(l.svcCtx.DbEngin, int64(taskId)) var taskName string
l.svcCtx.DbEngin.Raw("select name as kind from task where id = ?", taskId).Scan(&taskName)
noticeInfo := clientCore.NoticeInfo{
TaskId: cloudInfo.TaskId,
AdapterId: cloudInfo.AdapterId,
AdapterName: cloudInfo.AdapterName,
ClusterId: cloudInfo.ClusterId,
ClusterName: cloudInfo.ClusterName,
TaskName: taskName,
}
syncTask(l.svcCtx.DbEngin, noticeInfo)
} }
case 2: case 2:
for _, hpcInfo := range req.HpcInfoList { for _, hpcInfo := range req.HpcInfoList {
l.svcCtx.DbEngin.Exec("update task_hpc set status = ?,start_time = ?,job_id = ? where cluster_id = ? and task_id = ? and name = ?", l.svcCtx.DbEngin.Exec("update task_hpc set status = ?,start_time = ?,job_id = ? where cluster_id = ? and task_id = ? and name = ?",
hpcInfo.Status, hpcInfo.StartTime, hpcInfo.JobId, hpcInfo.ClusterId, hpcInfo.TaskId, hpcInfo.Name) hpcInfo.Status, hpcInfo.StartTime, hpcInfo.JobId, hpcInfo.ClusterId, hpcInfo.TaskId, hpcInfo.Name)
syncTask(l.svcCtx.DbEngin, hpcInfo.TaskId) noticeInfo := clientCore.NoticeInfo{
TaskId: hpcInfo.TaskId,
AdapterId: hpcInfo.AdapterId,
AdapterName: hpcInfo.AdapterName,
ClusterId: hpcInfo.ClusterId,
ClusterName: hpcInfo.ClusterName,
TaskName: hpcInfo.Name,
}
syncTask(l.svcCtx.DbEngin, noticeInfo)
} }
case 1: case 1:
for _, aiInfo := range req.AiInfoList { for _, aiInfo := range req.AiInfoList {
l.svcCtx.DbEngin.Exec("update ai set status = ?,start_time = ?,project_id = ?,job_id = ? where participant_id = ? and task_id = ? and name = ?", l.svcCtx.DbEngin.Exec("update task_ai set status = ?,start_time = ?,project_id = ?,job_id = ? where participant_id = ? and task_id = ? and name = ?",
aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name) aiInfo.Status, aiInfo.StartTime, aiInfo.ProjectId, aiInfo.JobId, req.AdapterId, aiInfo.TaskId, aiInfo.Name)
syncTask(l.svcCtx.DbEngin, aiInfo.TaskId) noticeInfo := clientCore.NoticeInfo{
TaskId: aiInfo.TaskId,
AdapterId: aiInfo.AdapterId,
AdapterName: aiInfo.AdapterName,
ClusterId: aiInfo.ClusterId,
ClusterName: aiInfo.ClusterName,
TaskName: aiInfo.Name,
}
syncTask(l.svcCtx.DbEngin, noticeInfo)
} }
case 3: case 3:
for _, vmInfo := range req.VmInfoList { for _, vmInfo := range req.VmInfoList {
l.svcCtx.DbEngin.Exec("update task_vm set status = ?,start_time = ? where participant_id = ? and task_id = ? and name = ?", l.svcCtx.DbEngin.Exec("update task_vm set status = ?,start_time = ? where participant_id = ? and task_id = ? and name = ?",
vmInfo.Status, vmInfo.StartTime, req.AdapterId, vmInfo.TaskId, vmInfo.Name) vmInfo.Status, vmInfo.StartTime, req.AdapterId, vmInfo.TaskId, vmInfo.Name)
syncTask(l.svcCtx.DbEngin, vmInfo.TaskId) noticeInfo := clientCore.NoticeInfo{
TaskId: vmInfo.TaskId,
AdapterId: vmInfo.AdapterId,
AdapterName: vmInfo.AdapterName,
ClusterId: vmInfo.ClusterId,
ClusterName: vmInfo.ClusterName,
TaskName: vmInfo.Name,
}
syncTask(l.svcCtx.DbEngin, noticeInfo)
} }
} }
return &resp, nil return &resp, nil
} }
func syncTask(gorm *gorm.DB, taskId int64) { func syncTask(gorm *gorm.DB, noticeInfo clientCore.NoticeInfo) {
var allStatus string var allStatus string
tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?", taskId).Scan(&allStatus) tx := gorm.Raw("SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.status) ,GROUP_CONCAT(DISTINCT a.status) ,GROUP_CONCAT(DISTINCT c.status))as status from task t left join task_hpc h on t.id = h.task_id left join task_cloud c on t.id = c.task_id left join task_ai a on t.id = a.task_id where t.id = ?", noticeInfo.TaskId).Scan(&allStatus)
if tx.Error != nil { if tx.Error != nil {
logx.Error(tx.Error) logx.Error(tx.Error)
} }
// 子状态统一则修改主任务状态
statusArray := strings.Split(allStatus, ",")
if len(removeRepeatedElement(statusArray)) == 1 {
updateTask(gorm, taskId, statusArray[0])
}
// 子任务包含失败状态 主任务则失败
if strings.Contains(allStatus, constants.Failed) {
updateTask(gorm, taskId, constants.Failed)
for pcmStatus, ProviderStatus := range clientCore.StatusMapping {
for _, originalStatus := range ProviderStatus {
// if Failed type status appears in subTask then update mainTask to Failed
if pcmStatus == "Failed" && strings.Contains(allStatus, originalStatus) {
updateTask(gorm, noticeInfo.TaskId, constants.Failed)
noticeInfo := clientCore.NoticeInfo{
AdapterId: noticeInfo.AdapterId,
AdapterName: noticeInfo.AdapterName,
ClusterId: noticeInfo.ClusterId,
ClusterName: noticeInfo.ClusterName,
NoticeType: "failed",
TaskName: noticeInfo.TaskName,
Incident: "任务执行失败,请查看日志!",
CreatedTime: time.Now(),
} }
if strings.Contains(allStatus, constants.Running) { gorm.Table("t_notice").Create(&noticeInfo)
updateTask(gorm, taskId, constants.Running) return
// no Failed type status in subTask,if Saved type status appears in subTask then update mainTask to Saved
} else if pcmStatus == "Saved" && strings.Contains(allStatus, originalStatus) {
if getTaskStatus(gorm, noticeInfo.TaskId) != "Saved" {
updateTask(gorm, noticeInfo.TaskId, constants.Saved)
noticeInfo := clientCore.NoticeInfo{
AdapterId: noticeInfo.AdapterId,
AdapterName: noticeInfo.AdapterName,
ClusterId: noticeInfo.ClusterId,
ClusterName: noticeInfo.ClusterName,
NoticeType: "saved",
TaskName: noticeInfo.TaskName,
Incident: "任务已处于队列中!",
CreatedTime: time.Now(),
}
gorm.Table("t_notice").Create(&noticeInfo)
return
} else {
return
}
// no Failed and Saved type status in subTask,if Running type status appears in subTask then update mainTask to Running
} else if pcmStatus == "Running" && strings.Contains(allStatus, originalStatus) {
if getTaskStatus(gorm, noticeInfo.TaskId) != "Running" {
updateTask(gorm, noticeInfo.TaskId, constants.Running)
noticeInfo := clientCore.NoticeInfo{
AdapterId: noticeInfo.AdapterId,
AdapterName: noticeInfo.AdapterName,
ClusterId: noticeInfo.ClusterId,
ClusterName: noticeInfo.ClusterName,
NoticeType: "running",
TaskName: noticeInfo.TaskName,
Incident: "任务状态切换为运行中!",
CreatedTime: time.Now(),
}
gorm.Table("t_notice").Create(&noticeInfo)
return
} else {
return
} }
// at last, mainTask should be succeeded
} else {
if strings.Contains(allStatus, originalStatus) {
updateTask(gorm, noticeInfo.TaskId, constants.Succeeded)
noticeInfo := clientCore.NoticeInfo{
AdapterId: noticeInfo.AdapterId,
AdapterName: noticeInfo.AdapterName,
ClusterId: noticeInfo.ClusterId,
ClusterName: noticeInfo.ClusterName,
NoticeType: "succeeded",
TaskName: noticeInfo.TaskName,
Incident: "任务执行完成!",
CreatedTime: time.Now(),
}
gorm.Table("t_notice").Create(&noticeInfo)
return
}
}
}
}
} }
func updateTask(gorm *gorm.DB, taskId int64, status string) { func updateTask(gorm *gorm.DB, taskId int64, status string) {
@ -98,23 +194,15 @@ func updateTask(gorm *gorm.DB, taskId int64, status string) {
if status == constants.Running { if status == constants.Running {
task.StartTime = &now task.StartTime = &now
} }
if task.Status == constants.Failed || task.Status == constants.Succeeded {
task.EndTime = &now
}
gorm.Updates(&task) gorm.Updates(&task)
} }
} }
func removeRepeatedElement(arr []string) (newArr []string) { func getTaskStatus(gorm *gorm.DB, taskId int64) (status string) {
newArr = make([]string, 0) var task models.Task
for i := 0; i < len(arr); i++ { gorm.Where("id = ? ", taskId).Find(&task)
repeat := false return task.Status
for j := i + 1; j < len(arr); j++ {
if arr[i] == arr[j] {
repeat = true
break
}
}
if !repeat {
newArr = append(newArr, arr[i])
}
}
return
} }

View File

@ -17,6 +17,7 @@ package core
import ( import (
"context" "context"
"fmt" "fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"strconv" "strconv"
"time" "time"
@ -55,6 +56,11 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe
if len(tasks) == 0 { if len(tasks) == 0 {
return nil, nil return nil, nil
} }
// 更新智算任务状态
var ch = make(chan struct{})
go l.updateAitaskStatus(tasks, ch)
// 查询任务总数 // 查询任务总数
l.svcCtx.DbEngin.Model(&models.Task{}).Count(&resp.TotalCount) l.svcCtx.DbEngin.Model(&models.Task{}).Count(&resp.TotalCount)
@ -106,5 +112,64 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe
} }
return select {
case _ = <-ch:
return resp, nil
case <-time.After(1 * time.Second):
return resp, nil
}
}
func (l *TaskListLogic) updateAitaskStatus(tasks []models.Task, ch chan<- struct{}) {
for _, task := range tasks {
if task.AdapterTypeDict != 1 {
continue
}
if task.Status == constants.Succeeded {
continue
}
var aiTask []*models.TaskAi
tx := l.svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", task.Id).Scan(&aiTask)
if tx.Error != nil {
logx.Errorf(tx.Error.Error())
return
}
start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
var status = constants.Succeeded
for _, a := range aiTask {
s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
if s.Before(start) {
start = s
}
if e.After(end) {
end = e
}
if a.Status == constants.Failed {
status = a.Status
break
}
if a.Status == constants.Running {
status = a.Status
continue
}
}
task.Status = status
task.StartTime = &start
task.EndTime = &end
tx = l.svcCtx.DbEngin.Updates(task)
if tx.Error != nil {
return
}
}
ch <- struct{}{}
} }

View File

@ -2,10 +2,12 @@ package hpc
import ( import (
"context" "context"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/json"
"math/rand" "math/rand"
"strconv"
"time" "time"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
@ -50,6 +52,13 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
var clusterIds []int64 var clusterIds []int64
l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds) l.svcCtx.DbEngin.Raw("SELECT id FROM `t_cluster` where adapter_id = ? and label = ?", req.AdapterId, req.ClusterType).Scan(&clusterIds)
adapterId, _ := strconv.ParseInt(req.AdapterId, 10, 64)
var adapterName string
l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", req.AdapterId).Scan(&adapterName)
clusterId := clusterIds[rand.Intn(len(clusterIds))]
var clusterName string
l.svcCtx.DbEngin.Raw("SELECT nickname FROM `t_cluster` where id = ?", clusterId).Scan(&clusterName)
env, _ := json.Marshal(req.Environment) env, _ := json.Marshal(req.Environment)
if len(clusterIds) == 0 || clusterIds == nil { if len(clusterIds) == 0 || clusterIds == nil {
@ -60,7 +69,10 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
hpcInfo := models.TaskHpc{ hpcInfo := models.TaskHpc{
TaskId: taskModel.Id, TaskId: taskModel.Id,
ClusterId: clusterIds[rand.Intn(len(clusterIds))], AdapterId: uint(adapterId),
AdapterName: adapterName,
ClusterId: uint(clusterId),
ClusterName: clusterName,
Name: taskModel.Name, Name: taskModel.Name,
Status: "Saved", Status: "Saved",
CmdScript: req.CmdScript, CmdScript: req.CmdScript,
@ -88,6 +100,20 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
if tx.Error != nil { if tx.Error != nil {
return nil, tx.Error return nil, tx.Error
} }
noticeInfo := clientCore.NoticeInfo{
AdapterId: adapterId,
AdapterName: adapterName,
ClusterId: clusterId,
ClusterName: clusterName,
NoticeType: "create",
TaskName: req.Name,
Incident: "任务创建中",
CreatedTime: time.Now(),
}
result := l.svcCtx.DbEngin.Table("t_notice").Create(&noticeInfo)
if result.Error != nil {
logx.Errorf("Task creation failure, err: %v", result.Error)
}
// todo mq task manage // todo mq task manage
//reqMessage, err := json.Marshal(mqInfo) //reqMessage, err := json.Marshal(mqInfo)
//if err != nil { //if err != nil {

View File

@ -0,0 +1,36 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type DownloadAlgothmCodeLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewDownloadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *DownloadAlgothmCodeLogic {
return &DownloadAlgothmCodeLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *DownloadAlgothmCodeLogic) DownloadAlgorithmCode(req *types.DownloadAlgorithmCodeReq) (resp *types.DownloadAlgorithmCodeResp, err error) {
resp = &types.DownloadAlgorithmCodeResp{}
code, err := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].DownloadAlgorithmCode(l.ctx,
req.ResourceType, req.Card, req.TaskType, req.Dataset, req.Algorithm)
if err != nil {
return nil, err
}
resp.Code = code
return resp, nil
}

View File

@ -0,0 +1,30 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetComputeCardsByClusterLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetComputeCardsByClusterLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetComputeCardsByClusterLogic {
return &GetComputeCardsByClusterLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetComputeCardsByClusterLogic) GetComputeCardsByCluster(req *types.GetComputeCardsByClusterReq) (resp *types.GetComputeCardsByClusterResp, err error) {
// todo: add your logic here and delete this line
return
}

View File

@ -0,0 +1,35 @@
package schedule
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type UploadAlgothmCodeLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewUploadAlgothmCodeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *UploadAlgothmCodeLogic {
return &UploadAlgothmCodeLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *UploadAlgothmCodeLogic) UploadAlgorithmCode(req *types.UploadAlgorithmCodeReq) (resp *types.UploadAlgorithmCodeResp, err error) {
resp = &types.UploadAlgorithmCodeResp{}
err = l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[req.AdapterId][req.ClusterId].UploadAlgorithmCode(l.ctx,
req.ResourceType, req.Card, req.TaskType, req.Dataset, req.Algorithm, req.Code)
if err != nil {
return nil, err
}
return resp, nil
}

View File

@ -55,7 +55,7 @@ func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Schedu
func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) { func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
ai := models.Ai{ ai := models.Ai{
ParticipantId: participantId, AdapterId: participantId,
TaskId: task.TaskId, TaskId: task.TaskId,
Status: "Saved", Status: "Saved",
YamlString: as.yamlString, YamlString: as.yamlString,

View File

@ -8,8 +8,8 @@ type AiCollector interface {
GetAlgorithms(ctx context.Context) ([]*Algorithm, error) GetAlgorithms(ctx context.Context) ([]*Algorithm, error)
GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error)
GetTrainingTask(ctx context.Context, taskId string) (*Task, error) GetTrainingTask(ctx context.Context, taskId string) (*Task, error)
DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error)
UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error
} }
type ResourceStats struct { type ResourceStats struct {

View File

@ -162,11 +162,11 @@ func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorit
return nil, nil return nil, nil
} }
func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil return "", nil
} }
func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error { func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
return nil return nil
} }

View File

@ -54,6 +54,7 @@ const (
CAMBRICON = "cambricon" CAMBRICON = "cambricon"
TRAIN_CMD = "cd /code; python train.py" TRAIN_CMD = "cd /code; python train.py"
VERSION = "V1" VERSION = "V1"
DOMAIN = "http://192.168.242.41:8001/"
) )
var ( var (
@ -339,11 +340,48 @@ func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm
return algorithms, nil return algorithms, nil
} }
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil dcReq := &octopus.DownloadCompressReq{
Platform: o.platform,
Version: VERSION,
AlgorithmId: "",
}
dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
if err != nil {
return "", err
}
if !dcResp.Success {
return "", errors.New(dcResp.Error.Message)
}
daReq := &octopus.DownloadAlgorithmReq{
Platform: o.platform,
Version: VERSION,
AlgorithmId: "",
CompressAt: dcResp.Payload.CompressAt,
Domain: DOMAIN,
}
daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
if err != nil {
return "", err
}
if !daResp.Success {
return "", errors.New(dcResp.Error.Message)
}
urlReq := &octopus.AlgorithmUrlReq{
Platform: o.platform,
Url: daResp.Payload.DownloadUrl,
}
urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
if err != nil {
return "", err
}
return urlResp.Algorithm, nil
} }
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error { func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
return nil return nil
} }

View File

@ -447,11 +447,32 @@ func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm,
return algorithms, nil return algorithms, nil
} }
func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string) (string, error) { func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
return "", nil algoName := dataset + DASH + algorithm
req := &hpcAC.GetFileReq{
Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH + TRAIN_FILE,
}
resp, err := s.aCRpc.GetFile(ctx, req)
if err != nil {
return "", err
}
return resp.Content, nil
} }
func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, taskType string, dataset string, algorithm string, code string) error { func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
algoName := dataset + DASH + algorithm
req := &hpcAC.UploadFileReq{
Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH,
Cover: "cover",
File: code,
}
_, err := s.aCRpc.UploadFile(ctx, req)
if err != nil {
return err
}
return nil return nil
} }

View File

@ -150,6 +150,17 @@ type GeneralTaskReq struct {
Replicas int64 `json:"replicas,string"` Replicas int64 `json:"replicas,string"`
} }
type PodLogsReq struct {
TaskId string `json:"taskId"`
TaskName string `json:"taskName"`
ClusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"`
AdapterId string `json:"adapterId"`
AdapterName string `json:"adapterName"`
PodName string `json:"podName"`
Stream bool `json:"stream"`
}
type DeleteTaskReq struct { type DeleteTaskReq struct {
Id int64 `path:"id"` Id int64 `path:"id"`
} }
@ -5691,6 +5702,43 @@ type AiTaskDb struct {
EndTime string `json:"endTime,omitempty" db:"end_time"` EndTime string `json:"endTime,omitempty" db:"end_time"`
} }
type DownloadAlgorithmCodeReq struct {
AdapterId string `form:"adapterId"`
ClusterId string `form:"clusterId"`
ResourceType string `form:"resourceType"`
Card string `form:"card"`
TaskType string `form:"taskType"`
Dataset string `form:"dataset"`
Algorithm string `form:"algorithm"`
}
type DownloadAlgorithmCodeResp struct {
Code string `json:"code"`
}
type UploadAlgorithmCodeReq struct {
AdapterId string `json:"adapterId"`
ClusterId string `json:"clusterId"`
ResourceType string `json:"resourceType"`
Card string `json:"card"`
TaskType string `json:"taskType"`
Dataset string `json:"dataset"`
Algorithm string `json:"algorithm"`
Code string `json:"code"`
}
type UploadAlgorithmCodeResp struct {
}
type GetComputeCardsByClusterReq struct {
AdapterId string `path:"adapterId"`
ClusterId string `path:"clusterId"`
}
type GetComputeCardsByClusterResp struct {
Cards []string `json:"cards"`
}
type CreateAlertRuleReq struct { type CreateAlertRuleReq struct {
CLusterId string `json:"clusterId"` CLusterId string `json:"clusterId"`
ClusterName string `json:"clusterName"` ClusterName string `json:"clusterName"`

2
go.mod
View File

@ -26,7 +26,7 @@ require (
github.com/zeromicro/go-zero v1.6.3 github.com/zeromicro/go-zero v1.6.3
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230904090036-24fc730ec87d

4
go.sum
View File

@ -1082,8 +1082,8 @@ gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece h1:W3yBnvAVV
gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo= gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20240426095603-549fefd8bece/go.mod h1:w3Nb5TNymCItQ7K3x4Q0JLuoq9OerwAzAWT2zsPE9Xo=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c h1:2Wl/hvaSFjh6fmCSIQhjkr9llMRREQeqcXNLZ/HPY18=
gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw= gitlink.org.cn/JointCloud/pcm-kubernetes v0.0.0-20240301071143-347480abff2c/go.mod h1:lSRfGs+PxFvw7CcndHWRd6UlLlGrZn0b0hp5cfaMNGw=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142 h1:+po0nesBDSWsgCySBG7eEXk7i9Ytd58wqvjL1M9y6d8= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35 h1:E2QfpS3Y0FjR8Zyv5l2Ti/2NetQFqHG66c8+T/+J1u0=
gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240424085753-6899615e9142/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240510133934-6a5526289b35/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 h1:s6PsZ1+bev294IWdZRlV7mnOwI1+UzFcldVW/BqhQzI=
gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8= gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203/go.mod h1:i2rrbMQ+Fve345BY9Heh4MUqVTAimZQElQhzzRee5B8=
gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0= gitlink.org.cn/JointCloud/pcm-slurm v0.0.0-20240301080743-8b94bbaf57f5 h1:+/5vnzkJBfMRnya1NrhOzlroUtRa5ePiYbPKlHLoLV0=

View File

@ -38,7 +38,10 @@ type (
Ai struct { Ai struct {
Id int64 `db:"id"` // id Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id TaskId int64 `db:"task_id"` // 任务id
ParticipantId int64 `db:"participant_id"` // 集群静态信息id AdapterId int64 `db:"adapter_id"` // 适配器id
AdapterName string `db:"adapter_name"` //适配器名称
ClusterId int64 `db:"cluster_id"` //集群id
ClusterName string `db:"cluster_name"` //集群名称
ProjectId string `db:"project_id"` // 项目id ProjectId string `db:"project_id"` // 项目id
Name string `db:"name"` // 名称 Name string `db:"name"` // 名称
Status string `db:"status"` // 状态 Status string `db:"status"` // 状态

View File

@ -9,6 +9,7 @@ type TaskCloudModel struct {
Id uint `json:"id" gorm:"primarykey;not null;comment:id"` Id uint `json:"id" gorm:"primarykey;not null;comment:id"`
TaskId uint `json:"taskId" gorm:"not null;comment:task表id"` TaskId uint `json:"taskId" gorm:"not null;comment:task表id"`
AdapterId uint `json:"adapterId" gorm:"not null;comment:适配器id"` AdapterId uint `json:"adapterId" gorm:"not null;comment:适配器id"`
AdapterName string `json:"adapterName" gorm:"not null;comment:适配器名称"`
ClusterId uint `json:"clusterId" gorm:"not null;comment:集群id"` ClusterId uint `json:"clusterId" gorm:"not null;comment:集群id"`
ClusterName string `json:"clusterName" gorm:"not null;comment:集群名称"` ClusterName string `json:"clusterName" gorm:"not null;comment:集群名称"`
Kind string `json:"kind" gorm:"comment:种类"` Kind string `json:"kind" gorm:"comment:种类"`

View File

@ -39,7 +39,10 @@ type (
Id int64 `db:"id"` // id Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // 任务id TaskId int64 `db:"task_id"` // 任务id
JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id) JobId string `db:"job_id"` // 作业id(在第三方系统中的作业id)
ClusterId int64 `db:"cluster_id"` // 执行任务的集群id AdapterId uint `db:"adapter_d"` // 适配器id
AdapterName string `db:"adapter_name"` //适配器名称
ClusterId uint `db:"cluster_id"` //集群id
ClusterName string `db:"cluster_name"` //集群名称
Name string `db:"name"` // 名称 Name string `db:"name"` // 名称
Status string `db:"status"` // 状态 Status string `db:"status"` // 状态
CmdScript string `db:"cmd_script"` CmdScript string `db:"cmd_script"`

View File

@ -36,12 +36,13 @@ type (
TaskVm struct { TaskVm struct {
Id int64 `db:"id"` // id Id int64 `db:"id"` // id
ParticipantId int64 `db:"participant_id"` // p端id
TaskId int64 `db:"task_id"` // 任务id TaskId int64 `db:"task_id"` // 任务id
Name string `db:"name"` // 虚拟机名称 Name string `db:"name"` // 虚拟机名称
AdapterId int64 `db:"adapter_id"` // 执行任务的适配器id AdapterId int64 `db:"adapter_id"` // 适配器id
AdapterName string `db:"adapter_name"` // 适配器名称 AdapterName string `db:"adapter_name"` //适配器名称
ClusterId int64 `db:"cluster_id"` // 执行任务的集群id ClusterId int64 `db:"cluster_id"` //集群id
ClusterName string `db:"cluster_name"` // 集群名称 ClusterName string `db:"cluster_name"` //集群名称
FlavorRef string `db:"flavor_ref"` // 规格索引 FlavorRef string `db:"flavor_ref"` // 规格索引
ImageRef string `db:"image_ref"` // 镜像索引 ImageRef string `db:"image_ref"` // 镜像索引
Status string `db:"status"` // 状态 Status string `db:"status"` // 状态
@ -92,14 +93,14 @@ func (m *defaultTaskVmModel) FindOne(ctx context.Context, id int64) (*TaskVm, er
} }
func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) { func (m *defaultTaskVmModel) Insert(ctx context.Context, data *TaskVm) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet) query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskVmRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt) ret, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt)
return ret, err return ret, err
} }
func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error { func (m *defaultTaskVmModel) Update(ctx context.Context, data *TaskVm) error {
query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder) query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskVmRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.Name, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id) _, err := m.conn.ExecCtx(ctx, query, data.ParticipantId, data.TaskId, data.Name, data.AdapterId, data.ClusterId, data.FlavorRef, data.ImageRef, data.Status, data.Platform, data.Description, data.AvailabilityZone, data.MinCount, data.Uuid, data.StartTime, data.RunningTime, data.Result, data.DeletedAt, data.Id)
return err return err
} }