Merge pull request 'updated aitask submit implementations' (#48) from tzwang/pcm-coordinator:master into master

Former-commit-id: d9cd1d41ac76d168a46afa70a289e7dc748a34da
This commit is contained in:
tzwang 2024-03-13 17:37:34 +08:00
commit 05cfa23329
3 changed files with 82 additions and 25 deletions

View File

@ -46,7 +46,8 @@ const (
SAILINGSI = "sailingsi" SAILINGSI = "sailingsi"
MLU = "MLU" MLU = "MLU"
CAMBRICONMLU290 = 256 CAMBRICONMLU290 = 256
GCU = "enflame" GCU = "GCU"
ENFLAME = "enflame"
EnflameT20 = 128 EnflameT20 = 128
BASE_TOPS = 128 BASE_TOPS = 128
CAMBRICON = "cambricon" CAMBRICON = "cambricon"
@ -57,7 +58,7 @@ const (
var ( var (
cardAliasMap = map[string]string{ cardAliasMap = map[string]string{
MLU: CAMBRICON, MLU: CAMBRICON,
GCU: GCU, GCU: ENFLAME,
} }
) )
@ -341,11 +342,10 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error {
return nil return nil
} }
} }
} }
if option.ResourceType == CARD { if option.ResourceType == CARD {
err = setResourceIdByCard(option, specResp, MLU) err = setResourceIdByCard(option, specResp, GCU)
if err != nil { if err != nil {
return err return err
} }
@ -418,16 +418,23 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error {
if !preImgResp.Success { if !preImgResp.Success {
return errors.New("failed to get PresetImages") return errors.New("failed to get PresetImages")
} }
for _, image := range preImgResp.Payload.Images {
if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) { if option.ResourceType == CARD {
option.ImageId = image.Id for _, image := range preImgResp.Payload.Images {
return nil if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
option.ImageId = image.Id
return nil
}
} }
} }
return errors.New("failed to get ImageId") return errors.New("failed to get ImageId")
} }
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error { func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
// temporarily set algorithm to cnn
option.AlgorithmName = "cnn"
req := &octopus.GetMyAlgorithmListReq{ req := &octopus.GetMyAlgorithmListReq{
Platform: o.platform, Platform: o.platform,
PageIndex: o.pageIndex, PageIndex: o.pageIndex,
@ -442,18 +449,33 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
} }
for _, algorithm := range resp.Payload.Algorithms { for _, algorithm := range resp.Payload.Algorithms {
if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) { if algorithm.FrameworkName == strings.Title(option.TaskType) {
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
if ns[0] != option.DatasetsName {
continue
}
if ns[1] != option.AlgorithmName {
continue
}
if ns[2] != option.ResourceType {
continue
}
option.AlgorithmId = algorithm.AlgorithmId option.AlgorithmId = algorithm.AlgorithmId
return nil return nil
} }
} }
return nil return errors.New("failed to get AlgorithmId")
} }
func (o *OctopusLink) generateCmd(option *option.AiOption) error { func (o *OctopusLink) generateCmd(option *option.AiOption) error {
if option.Cmd == "" { if option.Cmd == "" {
option.Cmd = TRAIN_CMD switch option.ComputeCard {
case GCU:
option.Cmd = "cd /code; python3 train.py"
default:
option.Cmd = TRAIN_CMD
}
} }
return nil return nil

View File

@ -57,17 +57,17 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
}, },
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": { "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
CPU: 2, CPU: 2,
GPU: 1, GPU: 3,
RAM: 4 * RAM_SIZE_1G, RAM: 4 * RAM_SIZE_1G,
}, },
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": { "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
CPU: 5, CPU: 4,
GPU: 1, GPU: 4,
RAM: 10 * RAM_SIZE_1G, RAM: 8 * RAM_SIZE_1G,
}, },
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": { "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
CPU: 5, CPU: 5,
GPU: 2, GPU: 5,
RAM: 10 * RAM_SIZE_1G, RAM: 10 * RAM_SIZE_1G,
}, },
} }
@ -75,9 +75,9 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
var RESOURCESPECSAI = map[string]string{ var RESOURCESPECSAI = map[string]string{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G", "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G", "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:1, RAM:4G", "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:5, DCU:1, RAM:10G", "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G", "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
} }
type ResourceSpecSGAI struct { type ResourceSpecSGAI struct {
@ -352,18 +352,29 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
if option.ResourceType == CPU { if option.ResourceType == CPU {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
return nil
} }
if option.ResourceType == CARD { if option.ResourceType == CARD {
if option.Tops == 0 { if 0 <= option.Tops && option.Tops <= DCU_TOPS {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
return nil
} }
if option.Tops > DCU_TOPS { cardNum := 5
for k, v := range RESOURCESGAIMAP {
for i := 1; i <= cardNum; i++ {
if float64(i)*DCU_TOPS <= option.Tops && option.Tops <= float64(v.GPU)*DCU_TOPS {
option.ResourceId = k
return nil
}
}
}
if option.Tops > float64(cardNum)*DCU_TOPS {
option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2" option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
return nil
} }
//Todo add more dcu specs
} }
return errors.New("failed to get ResourceId") return errors.New("failed to get ResourceId")
@ -386,7 +397,12 @@ func (s *ShuguangAi) generateImageId(option *option.AiOption) error {
return errors.New("failed to get imageId") return errors.New("failed to get imageId")
} }
if option.ResourceType == CPU { for _, datum := range resp.Data {
ns := strings.Split(datum.Version, COLON)
if ns[0] == "jupyterlab-pytorch" {
option.ImageId = datum.ImageId
return nil
}
} }
@ -412,6 +428,7 @@ func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {
if ns[0] == option.DatasetsName { if ns[0] == option.DatasetsName {
algorithmId = option.TaskType + DASH + file.Name algorithmId = option.TaskType + DASH + file.Name
option.AlgorithmId = algorithmId option.AlgorithmId = algorithmId
option.AlgorithmName = ns[1]
return nil return nil
} }
} }
@ -430,6 +447,23 @@ func (s *ShuguangAi) generateEnv(option *option.AiOption) error {
} }
func (s *ShuguangAi) generateParams(option *option.AiOption) error { func (s *ShuguangAi) generateParams(option *option.AiOption) error {
if option.ResourceType == "" {
return errors.New("ResourceType not set")
}
return nil //epoch := "epoch" + COMMA + "1"
//option.Params = append(option.Params, epoch)
switch option.ResourceType {
case CPU:
card := "card" + COMMA + CPU
option.Params = append(option.Params, card)
return nil
case CARD:
card := "card" + COMMA + "cuda:0"
option.Params = append(option.Params, card)
return nil
}
return errors.New("failed to set params")
} }

View File

@ -40,6 +40,7 @@ type Linkage interface {
} }
const ( const (
COLON = ":"
PY_PARAM_PREFIX = "--" PY_PARAM_PREFIX = "--"
SPACE = " " SPACE = " "
UNDERSCORE = "_" UNDERSCORE = "_"