modified shuguangai getResourceStats
Former-commit-id: a108e00353ec13e300772ebbaa2a40a46e376377
This commit is contained in:
parent
e40a9bb2d0
commit
1cb0a39320
|
@ -61,12 +61,17 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||||
if len(resources) == 0 {
|
if len(resources) == 0 {
|
||||||
return nil, errors.New("no cluster has resources")
|
return nil, errors.New("no cluster has resources")
|
||||||
}
|
}
|
||||||
params := &param.Params{Resources: resources}
|
|
||||||
|
|
||||||
if len(resources) == 1 {
|
if len(resources) == 1 {
|
||||||
return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ParticipantId: 0, Name: "", Replicas: 1}}, nil
|
var cluster strategy.AssignedCluster
|
||||||
|
cluster.ParticipantId = resources[0].ParticipantId
|
||||||
|
cluster.Name = resources[0].Name
|
||||||
|
cluster.Replicas = 1
|
||||||
|
return &strategy.SingleAssignment{Cluster: &cluster}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
params := &param.Params{Resources: resources}
|
||||||
|
|
||||||
switch as.option.StrategyName {
|
switch as.option.StrategyName {
|
||||||
case strategy.REPLICATION:
|
case strategy.REPLICATION:
|
||||||
strategy := strategy.NewReplicationStrategy(&param.ReplicationParams{Params: params, Replicas: 1})
|
strategy := strategy.NewReplicationStrategy(&param.ReplicationParams{Params: params, Replicas: 1})
|
||||||
|
@ -75,7 +80,7 @@ func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
|
||||||
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
|
strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
|
||||||
return strategy, nil
|
return strategy, nil
|
||||||
case strategy.DYNAMIC_RESOURCES:
|
case strategy.DYNAMIC_RESOURCES:
|
||||||
strategy := strategy.NewDynamicResourcesStrategy(resources, as.option, 1)
|
strategy := strategy.NewDynamicResourcesStrategy(params.Resources, as.option, 1)
|
||||||
return strategy, nil
|
return strategy, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,14 +92,14 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) error {
|
||||||
return errors.New("clusters is nil")
|
return errors.New("clusters is nil")
|
||||||
}
|
}
|
||||||
|
|
||||||
executorMap := *as.AiExecutor
|
//executorMap := *as.AiExecutor
|
||||||
for _, cluster := range clusters {
|
//for _, cluster := range clusters {
|
||||||
_, err := executorMap[cluster.Name].Execute(as.option)
|
// _, err := executorMap[cluster.Name].Execute(as.option)
|
||||||
if err != nil {
|
// if err != nil {
|
||||||
// TODO: database operation
|
// // TODO: database operation
|
||||||
}
|
// }
|
||||||
// TODO: database operation
|
// // TODO: database operation
|
||||||
}
|
//}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,8 +9,11 @@ type ResourceStats struct {
|
||||||
ParticipantId int64
|
ParticipantId int64
|
||||||
Name string
|
Name string
|
||||||
CpuCoreAvail int64
|
CpuCoreAvail int64
|
||||||
|
CpuCoreTotal int64
|
||||||
MemAvail float64
|
MemAvail float64
|
||||||
|
MemTotal float64
|
||||||
DiskAvail float64
|
DiskAvail float64
|
||||||
|
DiskTotal float64
|
||||||
GpuAvail int64
|
GpuAvail int64
|
||||||
CardsAvail []*Card
|
CardsAvail []*Card
|
||||||
CpuCoreHours float64
|
CpuCoreHours float64
|
||||||
|
@ -23,7 +26,7 @@ type Card struct {
|
||||||
Name string
|
Name string
|
||||||
TOpsAtFp16 float64
|
TOpsAtFp16 float64
|
||||||
CardHours float64
|
CardHours float64
|
||||||
Num int32
|
CardNum int32
|
||||||
}
|
}
|
||||||
|
|
||||||
type DatasetsSpecs struct {
|
type DatasetsSpecs struct {
|
||||||
|
|
|
@ -28,10 +28,10 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
|
||||||
|
|
||||||
var maxCardHoursAvailable float64
|
var maxCardHoursAvailable float64
|
||||||
var maxCpuCoreHoursAvailable float64
|
var maxCpuCoreHoursAvailable float64
|
||||||
var assignedCluster *AssignedCluster
|
var assignedCluster AssignedCluster
|
||||||
var results []*AssignedCluster
|
var results []*AssignedCluster
|
||||||
for _, res := range ps.resources {
|
for _, res := range ps.resources {
|
||||||
if opt.ResourceType == "" {
|
if opt.ResourceType == "cpu" {
|
||||||
if res.CpuCoreHours <= 0 {
|
if res.CpuCoreHours <= 0 {
|
||||||
cluster := &AssignedCluster{ParticipantId: res.ParticipantId, Name: res.Name, Replicas: ps.replicas}
|
cluster := &AssignedCluster{ParticipantId: res.ParticipantId, Name: res.Name, Replicas: ps.replicas}
|
||||||
results = append(results, cluster)
|
results = append(results, cluster)
|
||||||
|
@ -46,7 +46,7 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if opt.ResourceType == "" {
|
if opt.ResourceType == "computeCard" {
|
||||||
var maxCurrentCardHours float64
|
var maxCurrentCardHours float64
|
||||||
for _, card := range res.CardsAvail {
|
for _, card := range res.CardsAvail {
|
||||||
cardHours := common.RoundFloat(card.TOpsAtFp16*card.CardHours, 3)
|
cardHours := common.RoundFloat(card.TOpsAtFp16*card.CardHours, 3)
|
||||||
|
@ -62,7 +62,7 @@ func (ps *DynamicResourcesStrategy) Schedule() ([]*AssignedCluster, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
results = append(results, assignedCluster)
|
results = append(results, &assignedCluster)
|
||||||
return results, nil
|
return results, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,7 @@ const (
|
||||||
TRAIN_FILE = "train.py"
|
TRAIN_FILE = "train.py"
|
||||||
CPUCOREPRICEPERHOUR = 0.09
|
CPUCOREPRICEPERHOUR = 0.09
|
||||||
DCUPRICEPERHOUR = 2.0
|
DCUPRICEPERHOUR = 2.0
|
||||||
|
KB = 1024
|
||||||
)
|
)
|
||||||
|
|
||||||
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||||
|
@ -272,17 +273,25 @@ func (s *ShuguangAi) GetResourceStats() (*collector.ResourceStats, error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
//limitReq := &hpcAC.QueueReq{}
|
limitReq := &hpcAC.QueueReq{}
|
||||||
//limitResp, err := s.svcCtx.ACRpc.QueryUserQuotasLimit(s.ctx, limitReq)
|
limitResp, err := s.svcCtx.ACRpc.QueryUserQuotasLimit(s.ctx, limitReq)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// return nil, err
|
return nil, err
|
||||||
//}
|
}
|
||||||
|
totalCpu := limitResp.Data.AccountMaxCpu
|
||||||
|
totalDcu := limitResp.Data.AccountMaxDcu
|
||||||
|
|
||||||
//diskReq := &hpcAC.ParaStorQuotaReq{}
|
diskReq := &hpcAC.ParaStorQuotaReq{}
|
||||||
//diskResp, err := s.svcCtx.ACRpc.ParaStorQuota(s.ctx, diskReq)
|
diskResp, err := s.svcCtx.ACRpc.ParaStorQuota(s.ctx, diskReq)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// return nil, err
|
return nil, err
|
||||||
//}
|
}
|
||||||
|
|
||||||
|
totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB, 3)
|
||||||
|
availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB, 3)
|
||||||
|
|
||||||
|
generalInfo, err := s.svcCtx.ACRpc.GetGeneralInfo(s.ctx, nil)
|
||||||
|
memSize := common.RoundFloat(float64(generalInfo.MemoryInGib)*KB*KB, 3)
|
||||||
|
|
||||||
var cards []*collector.Card
|
var cards []*collector.Card
|
||||||
balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
|
balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
|
||||||
|
@ -295,14 +304,21 @@ func (s *ShuguangAi) GetResourceStats() (*collector.ResourceStats, error) {
|
||||||
Name: DCU,
|
Name: DCU,
|
||||||
TOpsAtFp16: DCU_TOPS,
|
TOpsAtFp16: DCU_TOPS,
|
||||||
CardHours: cardHours,
|
CardHours: cardHours,
|
||||||
|
CardNum: int32(totalDcu),
|
||||||
}
|
}
|
||||||
cards = append(cards, dcu)
|
cards = append(cards, dcu)
|
||||||
resourceStats := &collector.ResourceStats{
|
resourceStats := &collector.ResourceStats{
|
||||||
ParticipantId: s.participantId,
|
ParticipantId: s.participantId,
|
||||||
Name: s.platform,
|
Name: s.platform,
|
||||||
Balance: balance,
|
Balance: balance,
|
||||||
CardsAvail: cards,
|
CpuCoreTotal: totalCpu,
|
||||||
|
CpuCoreAvail: 0,
|
||||||
|
DiskTotal: totalDisk,
|
||||||
|
DiskAvail: availDisk,
|
||||||
|
MemTotal: memSize,
|
||||||
|
MemAvail: 0,
|
||||||
CpuCoreHours: cpuHours,
|
CpuCoreHours: cpuHours,
|
||||||
|
CardsAvail: cards,
|
||||||
}
|
}
|
||||||
|
|
||||||
return resourceStats, nil
|
return resourceStats, nil
|
||||||
|
|
Loading…
Reference in New Issue