Fix AiScheduler bugs: run one task per replica and honor the requested compute card
Former-commit-id: 887857a40bd3492ac3a412902893cb3990048677
This commit is contained in:
parent
32b4c340be
commit
62ecd4e2e4
|
@ -130,39 +130,44 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
var results []*AiResult
|
var results []*AiResult
|
||||||
var errs []interface{}
|
var errs []interface{}
|
||||||
var ch = make(chan *AiResult, len(clusters))
|
var taskNum int32
|
||||||
var errCh = make(chan interface{}, len(clusters))
|
for _, cluster := range clusters {
|
||||||
|
taskNum += cluster.Replicas
|
||||||
|
}
|
||||||
|
var ch = make(chan *AiResult, taskNum)
|
||||||
|
var errCh = make(chan interface{}, taskNum)
|
||||||
|
|
||||||
executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
|
executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
|
||||||
for _, cluster := range clusters {
|
for _, cluster := range clusters {
|
||||||
c := cluster
|
c := cluster
|
||||||
wg.Add(1)
|
for i := 0; i < int(c.Replicas); i++ {
|
||||||
go func() {
|
wg.Add(1)
|
||||||
opt, _ := cloneAiOption(as.option)
|
go func() {
|
||||||
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt)
|
opt, _ := cloneAiOption(as.option)
|
||||||
|
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
e := struct {
|
e := struct {
|
||||||
err error
|
err error
|
||||||
clusterId string
|
clusterId string
|
||||||
}{
|
}{
|
||||||
err: err,
|
err: err,
|
||||||
clusterId: c.ClusterId,
|
clusterId: c.ClusterId,
|
||||||
|
}
|
||||||
|
errCh <- e
|
||||||
|
wg.Done()
|
||||||
|
return
|
||||||
}
|
}
|
||||||
errCh <- e
|
|
||||||
|
result, _ := convertType(resp)
|
||||||
|
result.Replica = c.Replicas
|
||||||
|
result.ClusterId = c.ClusterId
|
||||||
|
result.Strategy = as.option.StrategyName
|
||||||
|
result.Card = opt.ComputeCard
|
||||||
|
|
||||||
|
ch <- result
|
||||||
wg.Done()
|
wg.Done()
|
||||||
return
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
result, _ := convertType(resp)
|
|
||||||
result.Replica = c.Replicas
|
|
||||||
result.ClusterId = c.ClusterId
|
|
||||||
result.Strategy = as.option.StrategyName
|
|
||||||
result.Card = opt.ComputeCard
|
|
||||||
|
|
||||||
ch <- result
|
|
||||||
wg.Done()
|
|
||||||
}()
|
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
close(ch)
|
close(ch)
|
||||||
|
|
|
@ -591,7 +591,10 @@ func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiO
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.ResourceType == CARD {
|
if option.ResourceType == CARD {
|
||||||
err = setResourceIdByCard(option, specResp, GCU)
|
if option.ComputeCard == "" {
|
||||||
|
option.ComputeCard = GCU
|
||||||
|
}
|
||||||
|
err = setResourceIdByCard(option, specResp, option.ComputeCard)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -742,6 +745,8 @@ func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||||
switch option.ComputeCard {
|
switch option.ComputeCard {
|
||||||
case GCU:
|
case GCU:
|
||||||
option.Cmd = "cd /code; python3 train.py"
|
option.Cmd = "cd /code; python3 train.py"
|
||||||
|
case MLU:
|
||||||
|
option.Cmd = "su root; cd /torch/venv3/pytorch/bin; source activate; cd /code; python train.py"
|
||||||
default:
|
default:
|
||||||
option.Cmd = TRAIN_CMD
|
option.Cmd = TRAIN_CMD
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,7 +33,7 @@ import (
|
||||||
const (
|
const (
|
||||||
RAM_SIZE_1G = 1024 // 1G
|
RAM_SIZE_1G = 1024 // 1G
|
||||||
WORKER_NUMBER = 1
|
WORKER_NUMBER = 1
|
||||||
DCU = "dcu"
|
DCU = "DCU"
|
||||||
DCU_TOPS = 24.5
|
DCU_TOPS = 24.5
|
||||||
PYTORCH = "Pytorch"
|
PYTORCH = "Pytorch"
|
||||||
TASK_PYTORCH_PREFIX = "PytorchTask"
|
TASK_PYTORCH_PREFIX = "PytorchTask"
|
||||||
|
@ -570,7 +570,13 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.ResourceType == CARD {
|
if option.ResourceType == CARD {
|
||||||
|
if option.ComputeCard == "" {
|
||||||
|
option.ComputeCard = DCU
|
||||||
|
}
|
||||||
|
|
||||||
|
if strings.ToUpper(option.ComputeCard) != DCU {
|
||||||
|
return errors.New("computeCard not found")
|
||||||
|
}
|
||||||
option.ComputeCard = DCU
|
option.ComputeCard = DCU
|
||||||
|
|
||||||
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
|
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
|
||||||
|
|
Loading…
Reference in New Issue