Merge pull request 'fix aischeduler bugs' (#180) from tzwang/pcm-coordinator:master into master

Former-commit-id: 5d2c35a4825c2828486adfd178bc86b70f0b42e4
This commit is contained in:
tzwang 2024-05-16 16:59:09 +08:00
commit ba367102ac
3 changed files with 45 additions and 29 deletions

View File

@ -130,17 +130,21 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
var wg sync.WaitGroup var wg sync.WaitGroup
var results []*AiResult var results []*AiResult
var errs []interface{} var errs []interface{}
var ch = make(chan *AiResult, len(clusters)) var taskNum int32
var errCh = make(chan interface{}, len(clusters)) for _, cluster := range clusters {
taskNum += cluster.Replicas
}
var ch = make(chan *AiResult, taskNum)
var errCh = make(chan interface{}, taskNum)
executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId] executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
for _, cluster := range clusters { for _, cluster := range clusters {
c := cluster c := cluster
for i := 0; i < int(c.Replicas); i++ {
wg.Add(1) wg.Add(1)
go func() { go func() {
opt, _ := cloneAiOption(as.option) opt, _ := cloneAiOption(as.option)
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt) resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt)
if err != nil { if err != nil {
e := struct { e := struct {
err error err error
@ -164,6 +168,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
wg.Done() wg.Done()
}() }()
} }
}
wg.Wait() wg.Wait()
close(ch) close(ch)
close(errCh) close(errCh)

View File

@ -591,7 +591,10 @@ func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiO
} }
if option.ResourceType == CARD { if option.ResourceType == CARD {
err = setResourceIdByCard(option, specResp, GCU) if option.ComputeCard == "" {
option.ComputeCard = GCU
}
err = setResourceIdByCard(option, specResp, option.ComputeCard)
if err != nil { if err != nil {
return err return err
} }
@ -742,6 +745,8 @@ func (o *OctopusLink) generateCmd(option *option.AiOption) error {
switch option.ComputeCard { switch option.ComputeCard {
case GCU: case GCU:
option.Cmd = "cd /code; python3 train.py" option.Cmd = "cd /code; python3 train.py"
case MLU:
option.Cmd = "su root; cd /torch/venv3/pytorch/bin; source activate; cd /code; python train.py"
default: default:
option.Cmd = TRAIN_CMD option.Cmd = TRAIN_CMD
} }

View File

@ -33,7 +33,7 @@ import (
const ( const (
RAM_SIZE_1G = 1024 // 1G RAM_SIZE_1G = 1024 // 1G
WORKER_NUMBER = 1 WORKER_NUMBER = 1
DCU = "dcu" DCU = "DCU"
DCU_TOPS = 24.5 DCU_TOPS = 24.5
PYTORCH = "Pytorch" PYTORCH = "Pytorch"
TASK_PYTORCH_PREFIX = "PytorchTask" TASK_PYTORCH_PREFIX = "PytorchTask"
@ -570,7 +570,13 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
} }
if option.ResourceType == CARD { if option.ResourceType == CARD {
if option.ComputeCard == "" {
option.ComputeCard = DCU
}
if strings.ToUpper(option.ComputeCard) != DCU {
return errors.New("computeCard not found")
}
option.ComputeCard = DCU option.ComputeCard = DCU
if 0 <= option.Tops && option.Tops <= DCU_TOPS { if 0 <= option.Tops && option.Tops <= DCU_TOPS {