Merge pull request 'updated acquire resources concurrently' (#86) from tzwang/pcm-coordinator:master into master

Former-commit-id: 2559e61a8deed328a47f5e640d2096ce11e8be9f
This commit is contained in:
tzwang 2024-03-28 17:37:49 +08:00
commit 3065d16756
2 changed files with 144 additions and 58 deletions

View File

@@ -24,6 +24,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/pkg/response"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"sync"
)
type AiScheduler struct {
@@ -98,25 +99,60 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
executorMap := *as.AiExecutor
for _, cluster := range clusters {
_, err := executorMap[cluster.Name].Execute(as.option)
if err != nil {
// TODO: database operation
c := cluster
if cluster.Replicas == 0 {
continue
}
// TODO: database operation
go func() {
_, err := executorMap[c.Name].Execute(as.option)
if err != nil {
// TODO: database operation
}
// TODO: database operation
}()
}
return nil
}
func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
var wg sync.WaitGroup
var ch = make(chan *collector.ResourceStats, len(*as.ResourceCollector))
var errCh = make(chan error, len(*as.ResourceCollector))
var resourceSpecs []*collector.ResourceStats
var errs []error
for _, resourceCollector := range *as.ResourceCollector {
spec, err := resourceCollector.GetResourceStats()
if err != nil {
continue
}
resourceSpecs = append(resourceSpecs, spec)
wg.Add(1)
rc := resourceCollector
go func() {
spec, err := rc.GetResourceStats()
if err != nil {
errCh <- err
wg.Done()
return
}
ch <- spec
wg.Done()
}()
}
wg.Wait()
close(ch)
close(errCh)
for s := range ch {
resourceSpecs = append(resourceSpecs, s)
}
for e := range errCh {
errs = append(errs, e)
}
if len(errs) != 0 {
return nil, errors.New("get resources failed")
}
if len(resourceSpecs) == 0 {
return nil, errors.New("no resource found")
}

View File

@@ -28,6 +28,7 @@ import (
"gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus"
"gorm.io/gorm"
"strings"
"sync"
)
type Linkage interface {
@@ -124,73 +125,122 @@ func GetResourceTypes() []string {
}
// GetDatasetsNames gathers dataset names from every collector in collectorMap,
// narrows them with common.IntersectString, and passes the result through
// common.RemoveDuplicates before returning it.
//
// It returns a nil slice and a non-nil error when a collector's
// GetDatasetsSpecs call fails.
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// lost: an inline loop body and a goroutine-based loop body appear back to
// back inside the same for loop. Presumably the inline statements are the
// removed version and the goroutine is the replacement; confirm against the
// repository before treating this text as compilable source.
func GetDatasetsNames(collectorMap *map[string]collector.AiCollector) ([]string, error) {
var wg sync.WaitGroup
// Buffered with one slot per collector; each goroutine below sends at most one
// error, so the send never blocks.
var errCh = make(chan error, len(*collectorMap))
var errs []error
var names []string
//errCount := 0
var mu sync.Mutex
colMap := *collectorMap
for _, col := range colMap {
// Inline variant: query the collector directly and abort the whole call on
// the first error.
var ns []string
specs, err := col.GetDatasetsSpecs()
if err != nil {
return nil, errors.New("failed to acquire datasets list")
}
for _, spec := range specs {
ns = append(ns, spec.Name)
}
if len(ns) == 0 {
continue
}
if len(names) == 0 {
names = ns
continue
}
names = common.IntersectString(names, ns)
// Goroutine variant: one goroutine per collector, tracked by wg.
wg.Add(1)
// The closure captures c rather than the loop variable col (presumably to
// avoid loop-variable capture on Go versions before 1.22).
c := col
go func() {
var ns []string
specs, err := c.GetDatasetsSpecs()
if err != nil {
errCh <- err
wg.Done()
return
}
for _, spec := range specs {
ns = append(ns, spec.Name)
}
// A collector that reports no datasets does not take part in the intersection.
if len(ns) == 0 {
wg.Done()
return
}
// names is shared by all goroutines; mu guards every access from here on.
mu.Lock()
// An empty names is treated as "not seeded yet", so this collector's list is
// adopted as-is.
// NOTE(review): if IntersectString can return an empty slice, that state is
// indistinguishable from "not seeded", and a later collector would re-seed
// names; the result could then depend on goroutine completion order. Confirm
// whether this is intended.
if len(names) == 0 {
names = ns
wg.Done()
mu.Unlock()
return
}
names = common.IntersectString(names, ns)
wg.Done()
mu.Unlock()
}()
}
//if (len(*collectorMap) - errCount) < 2 {
//
//}
// Wait for every goroutine to finish, then close errCh so the range below
// terminates.
wg.Wait()
close(errCh)
for e := range errCh {
errs = append(errs, e)
}
// Any collector failure fails the whole call; the collected errors themselves
// are not included in the returned error.
if len(errs) != 0 {
return nil, errors.New("get DatasetsNames failed")
}
names = common.RemoveDuplicates(names)
return names, nil
}
// GetAlgorithms gathers algorithm names from every collector in collectorMap,
// keeps those whose TaskType equals taskType and whose name prefix equals
// dataset, narrows the per-collector lists with common.IntersectString, and
// passes the result through common.RemoveDuplicates before returning it.
//
// resourceType is not referenced anywhere in the visible body.
//
// It returns a nil slice and a non-nil error when a collector's GetAlgorithms
// call fails.
//
// NOTE(review): this listing looks like a rendered diff whose +/- markers were
// lost: statements of an inline loop body and of a goroutine-based loop body
// are interleaved line by line, so the braces below do not balance as written.
// Presumably the statements that use col directly are the removed version;
// confirm against the repository before treating this text as compilable
// source.
func GetAlgorithms(collectorMap *map[string]collector.AiCollector, resourceType string, taskType string, dataset string) ([]string, error) {
var names []string
var wg sync.WaitGroup
// Buffered with one slot per collector; each goroutine below sends at most one
// error, so the send never blocks.
var errCh = make(chan error, len(*collectorMap))
var errs []error
var mu sync.Mutex
colMap := *collectorMap
for _, col := range colMap {
var ns []string
algorithms, err := col.GetAlgorithms()
if err != nil {
return nil, err
}
for _, algorithm := range algorithms {
if algorithm.TaskType != taskType {
continue
// Goroutine variant: one goroutine per collector, tracked by wg. The closure
// captures c rather than the loop variable col (presumably to avoid
// loop-variable capture on Go versions before 1.22).
wg.Add(1)
c := col
go func() {
var ns []string
algorithms, err := c.GetAlgorithms()
if err != nil {
errCh <- err
wg.Done()
return
}
switch algorithm.Platform {
case OCTOPUS:
splitns := strings.Split(algorithm.Name, UNDERSCORE)
if dataset != splitns[0] || len(splitns) == 1 {
for _, algorithm := range algorithms {
if algorithm.TaskType != taskType {
continue
}
ns = append(ns, splitns[1])
case SHUGUANGAI:
splitns := strings.Split(algorithm.Name, DASH)
if dataset != splitns[0] || len(splitns) == 1 {
continue
// Only the OCTOPUS and SHUGUANGAI platforms contribute names. The algorithm
// name is split on the platform's separator, the first segment must equal
// dataset, and the second segment is collected. The len(splitns) == 1 check
// keeps the splitns[1] access in range.
switch algorithm.Platform {
case OCTOPUS:
splitns := strings.Split(algorithm.Name, UNDERSCORE)
if dataset != splitns[0] || len(splitns) == 1 {
continue
}
ns = append(ns, splitns[1])
case SHUGUANGAI:
splitns := strings.Split(algorithm.Name, DASH)
if dataset != splitns[0] || len(splitns) == 1 {
continue
}
ns = append(ns, splitns[1])
}
ns = append(ns, splitns[1])
}
}
if len(ns) == 0 {
continue
}
if len(names) == 0 {
names = ns
continue
}
names = common.IntersectString(names, ns)
if len(ns) == 0 {
wg.Done()
return
}
// names is shared by all goroutines; mu guards every access from here on.
// An empty names is treated as "not seeded yet", so this collector's list is
// adopted as-is.
// NOTE(review): if IntersectString can return an empty slice, that state is
// indistinguishable from "not seeded", and a later collector would re-seed
// names; the result could then depend on goroutine completion order. Confirm
// whether this is intended.
mu.Lock()
if len(names) == 0 {
names = ns
wg.Done()
mu.Unlock()
return
}
names = common.IntersectString(names, ns)
wg.Done()
mu.Unlock()
}()
}
// Wait for every goroutine to finish, then close errCh so the range below
// terminates.
wg.Wait()
close(errCh)
for e := range errCh {
errs = append(errs, e)
}
// Any collector failure fails the whole call; the collected errors themselves
// are not included in the returned error.
if len(errs) != 0 {
return nil, errors.New("get Algorithms failed")
}
names = common.RemoveDuplicates(names)
return names, nil
}