Merge pull request 'updated pcm.api' (#97) from tzwang/pcm-coordinator:master into master
Former-commit-id: 7a56f0e4d078c97ac30e882a84a9c6c1d58f36f1
This commit is contained in:
commit
72fbb7c787
|
@ -13,8 +13,14 @@ type (
|
||||||
}
|
}
|
||||||
|
|
||||||
ScheduleResp {
|
ScheduleResp {
|
||||||
Success bool `json:"success"`
|
Results []*ScheduleResult `json:"results"`
|
||||||
ErrorMsg string `json:"errorMsg"`
|
}
|
||||||
|
|
||||||
|
ScheduleResult {
|
||||||
|
ClusterId string `json:"clusterId"`
|
||||||
|
TaskId string `json:"taskId"`
|
||||||
|
Replica int32 `json:"replica"`
|
||||||
|
Msg string `json:"msg"`
|
||||||
}
|
}
|
||||||
|
|
||||||
AiOption {
|
AiOption {
|
||||||
|
|
|
@ -4,7 +4,6 @@ import (
|
||||||
"context"
|
"context"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
|
||||||
|
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
|
||||||
|
|
||||||
|
@ -28,26 +27,43 @@ func NewScheduleSubmitLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Sc
|
||||||
func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) {
|
func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *types.ScheduleResp, err error) {
|
||||||
resp = &types.ScheduleResp{}
|
resp = &types.ScheduleResp{}
|
||||||
opt := &option.AiOption{
|
opt := &option.AiOption{
|
||||||
ResourceType: req.AiOption.ResourceType,
|
ResourceType: req.AiOption.ResourceType,
|
||||||
Tops: 0,
|
Tops: req.AiOption.Tops,
|
||||||
TaskType: req.AiOption.TaskType,
|
TaskType: req.AiOption.TaskType,
|
||||||
DatasetsName: req.AiOption.Datasets,
|
DatasetsName: req.AiOption.Datasets,
|
||||||
//AlgorithmName: "cnn",
|
AlgorithmName: req.AiOption.Algorithm,
|
||||||
StrategyName: req.AiOption.Strategy,
|
StrategyName: req.AiOption.Strategy,
|
||||||
ClusterToStaticWeight: nil,
|
ClusterToStaticWeight: req.AiOption.StaticWeightMap,
|
||||||
Params: []string{
|
Params: req.AiOption.Params,
|
||||||
"epoch,1",
|
Envs: req.AiOption.Envs,
|
||||||
},
|
Cmd: req.AiOption.Cmd,
|
||||||
}
|
}
|
||||||
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl)
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
switch opt.GetOptionType() {
|
||||||
|
case option.AI:
|
||||||
|
for _, result := range results {
|
||||||
|
_ = (result).(*schedulers.AiResult)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if resp.ErrorMsg == "" {
|
||||||
|
resp.Success = true
|
||||||
|
err = l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
resp.Success = false
|
||||||
|
}
|
||||||
|
|
||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@ package schedulers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"encoding/json"
|
||||||
"errors"
|
"errors"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
|
||||||
|
@ -25,6 +26,9 @@ import (
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
|
||||||
|
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcAC"
|
||||||
|
"gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus"
|
||||||
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -36,6 +40,13 @@ type AiScheduler struct {
|
||||||
ctx context.Context
|
ctx context.Context
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type AiResult struct {
|
||||||
|
taskId string
|
||||||
|
clusterId string
|
||||||
|
replica int32
|
||||||
|
msg string
|
||||||
|
}
|
||||||
|
|
||||||
func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.AiOption) (*AiScheduler, error) {
|
func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.AiOption) (*AiScheduler, error) {
|
||||||
return &AiScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option}, nil
|
return &AiScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option}, nil
|
||||||
}
|
}
|
||||||
|
@ -100,12 +111,10 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
|
||||||
return nil, errors.New("clusters is nil")
|
return nil, errors.New("clusters is nil")
|
||||||
}
|
}
|
||||||
|
|
||||||
//res := struct {
|
|
||||||
//}{}
|
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
var result []interface{}
|
var results []interface{}
|
||||||
var errs []error
|
var errs []error
|
||||||
var ch = make(chan interface{}, len(clusters))
|
var ch = make(chan *AiResult, len(clusters))
|
||||||
var errCh = make(chan error, len(clusters))
|
var errCh = make(chan error, len(clusters))
|
||||||
|
|
||||||
executorMap := *as.AiExecutor
|
executorMap := *as.AiExecutor
|
||||||
|
@ -116,23 +125,20 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
|
||||||
}
|
}
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
resp, err := executorMap[c.Name].Execute(as.ctx, as.option)
|
opt, _ := cloneAiOption(as.option)
|
||||||
|
resp, err := executorMap[c.Name].Execute(as.ctx, opt)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// TODO: database operation
|
|
||||||
errCh <- err
|
errCh <- err
|
||||||
wg.Done()
|
wg.Done()
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
// TODO: database operation
|
|
||||||
data := struct {
|
result, _ := convertType(resp)
|
||||||
Resp interface{}
|
result.replica = c.Replicas
|
||||||
ClusterId int64
|
result.clusterId = strconv.FormatInt(c.ParticipantId, 10)
|
||||||
}{
|
|
||||||
Resp: resp,
|
ch <- result
|
||||||
ClusterId: c.ParticipantId,
|
|
||||||
}
|
|
||||||
ch <- data
|
|
||||||
wg.Done()
|
wg.Done()
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
@ -149,19 +155,11 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) ([]inter
|
||||||
}
|
}
|
||||||
|
|
||||||
for s := range ch {
|
for s := range ch {
|
||||||
data := (s).(struct {
|
// TODO: database operation
|
||||||
Resp interface{}
|
results = append(results, s)
|
||||||
ClusterId int64
|
|
||||||
})
|
|
||||||
|
|
||||||
result = append(result, data.Resp)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
err := as.AiStorages.SaveTask(as.option.TaskName)
|
return results, nil
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return result, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
|
func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
|
||||||
|
@ -207,3 +205,43 @@ func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats,
|
||||||
}
|
}
|
||||||
return resourceSpecs, nil
|
return resourceSpecs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func convertType(in interface{}) (*AiResult, error) {
|
||||||
|
var result AiResult
|
||||||
|
switch (in).(type) {
|
||||||
|
case *hpcAC.SubmitTaskAiResp:
|
||||||
|
resp := (in).(*hpcAC.SubmitTaskAiResp)
|
||||||
|
if resp.Code == "0" {
|
||||||
|
result.taskId = resp.Data
|
||||||
|
} else {
|
||||||
|
result.msg = resp.Msg
|
||||||
|
}
|
||||||
|
return &result, nil
|
||||||
|
case *octopus.CreateTrainJobResp:
|
||||||
|
resp := (in).(*octopus.CreateTrainJobResp)
|
||||||
|
|
||||||
|
if resp.Success {
|
||||||
|
result.taskId = resp.Payload.JobId
|
||||||
|
} else {
|
||||||
|
result.msg = resp.Error.Message
|
||||||
|
}
|
||||||
|
|
||||||
|
return &result, nil
|
||||||
|
default:
|
||||||
|
return nil, errors.New("ai task response failed")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func cloneAiOption(opt *option.AiOption) (*option.AiOption, error) {
|
||||||
|
origJSON, err := json.Marshal(opt)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
clone := option.AiOption{}
|
||||||
|
if err = json.Unmarshal(origJSON, &clone); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &clone, nil
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
package option
|
||||||
|
|
||||||
|
type CloudOption struct {
|
||||||
|
task interface{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c CloudOption) GetOptionType() string {
|
||||||
|
return CLOUD
|
||||||
|
}
|
|
@ -64,6 +64,8 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
|
||||||
}
|
}
|
||||||
results = append(results, &assignedCluster)
|
results = append(results, &assignedCluster)
|
||||||
return results, nil
|
return results, nil
|
||||||
|
case option.CLOUD:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, errors.New("failed to apply DynamicResourcesStrategy")
|
return nil, errors.New("failed to apply DynamicResourcesStrategy")
|
||||||
|
|
|
@ -370,6 +370,14 @@ func ConvertType(in interface{}, out interface{}, participant *models.StorelinkC
|
||||||
|
|
||||||
resp.TaskId = inresp.Payload.JobId
|
resp.TaskId = inresp.Payload.JobId
|
||||||
|
|
||||||
|
return resp, nil
|
||||||
|
case *types.ScheduleResp:
|
||||||
|
resp := (interface{})(out).(*types.ScheduleResp)
|
||||||
|
resp.Success = inresp.Success
|
||||||
|
if !resp.Success {
|
||||||
|
resp.ErrorMsg = inresp.Error.Message
|
||||||
|
return resp, nil
|
||||||
|
}
|
||||||
return resp, nil
|
return resp, nil
|
||||||
}
|
}
|
||||||
return nil, nil
|
return nil, nil
|
||||||
|
@ -404,6 +412,15 @@ func ConvertType(in interface{}, out interface{}, participant *models.StorelinkC
|
||||||
resp.ErrorMsg = inresp.Msg
|
resp.ErrorMsg = inresp.Msg
|
||||||
}
|
}
|
||||||
return resp, nil
|
return resp, nil
|
||||||
|
case *types.ScheduleResp:
|
||||||
|
resp := (interface{})(out).(*types.ScheduleResp)
|
||||||
|
if inresp.Code == "0" {
|
||||||
|
resp.Success = true
|
||||||
|
} else {
|
||||||
|
resp.Success = false
|
||||||
|
resp.ErrorMsg = inresp.Msg
|
||||||
|
}
|
||||||
|
return resp, nil
|
||||||
}
|
}
|
||||||
return nil, nil
|
return nil, nil
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue