Merge pull request 'updated aitask submit implementations' (#48) from tzwang/pcm-coordinator:master into master

Former-commit-id: d9cd1d41ac76d168a46afa70a289e7dc748a34da
This commit is contained in:
tzwang 2024-03-13 17:37:34 +08:00
commit 05cfa23329
3 changed files with 82 additions and 25 deletions

View File

@ -46,7 +46,8 @@ const (
SAILINGSI = "sailingsi"
MLU = "MLU"
CAMBRICONMLU290 = 256
GCU = "enflame"
GCU = "GCU"
ENFLAME = "enflame"
EnflameT20 = 128
BASE_TOPS = 128
CAMBRICON = "cambricon"
@ -57,7 +58,7 @@ const (
// cardAliasMap translates a compute-card identifier (the value looked up via
// option.ComputeCard) into the alias string that is searched for inside
// preset image names when an image is selected.
var (
cardAliasMap = map[string]string{
MLU: CAMBRICON,
// NOTE(review): GCU is keyed twice below. This listing looks like a diff
// rendering with the +/- markers stripped, so the first entry is presumably
// the removed line and the second its replacement - TODO confirm against the
// real file. A Go map literal cannot contain the same constant key twice.
GCU: GCU,
GCU: ENFLAME,
}
)
@ -341,11 +342,10 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error {
return nil
}
}
}
if option.ResourceType == CARD {
err = setResourceIdByCard(option, specResp, MLU)
err = setResourceIdByCard(option, specResp, GCU)
if err != nil {
return err
}
@ -418,16 +418,23 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error {
if !preImgResp.Success {
return errors.New("failed to get PresetImages")
}
for _, image := range preImgResp.Payload.Images {
if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
option.ImageId = image.Id
return nil
if option.ResourceType == CARD {
for _, image := range preImgResp.Payload.Images {
if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
option.ImageId = image.Id
return nil
}
}
}
return errors.New("failed to get ImageId")
}
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
// temporarily set algorithm to cnn
option.AlgorithmName = "cnn"
req := &octopus.GetMyAlgorithmListReq{
Platform: o.platform,
PageIndex: o.pageIndex,
@ -442,18 +449,33 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
}
for _, algorithm := range resp.Payload.Algorithms {
if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) {
if algorithm.FrameworkName == strings.Title(option.TaskType) {
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
if ns[0] != option.DatasetsName {
continue
}
if ns[1] != option.AlgorithmName {
continue
}
if ns[2] != option.ResourceType {
continue
}
option.AlgorithmId = algorithm.AlgorithmId
return nil
}
}
return nil
return errors.New("failed to get AlgorithmId")
}
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
if option.Cmd == "" {
option.Cmd = TRAIN_CMD
switch option.ComputeCard {
case GCU:
option.Cmd = "cd /code; python3 train.py"
default:
option.Cmd = TRAIN_CMD
}
}
return nil

View File

@ -57,17 +57,17 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
},
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
CPU: 2,
GPU: 1,
GPU: 3,
RAM: 4 * RAM_SIZE_1G,
},
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
CPU: 5,
GPU: 1,
RAM: 10 * RAM_SIZE_1G,
CPU: 4,
GPU: 4,
RAM: 8 * RAM_SIZE_1G,
},
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
CPU: 5,
GPU: 2,
GPU: 5,
RAM: 10 * RAM_SIZE_1G,
},
}
@ -75,9 +75,9 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
// RESOURCESPECSAI maps a resource spec ID to a human-readable summary of the
// CPU, DCU and RAM figures associated with that spec.
//
// NOTE(review): the last three IDs each appear twice with different
// summaries. This listing looks like a diff rendering with the +/- markers
// stripped, so the first occurrence of each is presumably the removed line
// and the second its replacement - TODO confirm against the real file. A Go
// map literal cannot contain the same constant key twice.
var RESOURCESPECSAI = map[string]string{
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:1, RAM:4G",
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:5, DCU:1, RAM:10G",
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
}
type ResourceSpecSGAI struct {
@ -352,18 +352,29 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
if option.ResourceType == CPU {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
return nil
}
if option.ResourceType == CARD {
if option.Tops == 0 {
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
return nil
}
if option.Tops > DCU_TOPS {
cardNum := 5
for k, v := range RESOURCESGAIMAP {
for i := 1; i <= cardNum; i++ {
if float64(i)*DCU_TOPS <= option.Tops && option.Tops <= float64(v.GPU)*DCU_TOPS {
option.ResourceId = k
return nil
}
}
}
if option.Tops > float64(cardNum)*DCU_TOPS {
option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
return nil
}
//Todo add more dcu specs
}
return errors.New("failed to get ResourceId")
@ -386,7 +397,12 @@ func (s *ShuguangAi) generateImageId(option *option.AiOption) error {
return errors.New("failed to get imageId")
}
if option.ResourceType == CPU {
for _, datum := range resp.Data {
ns := strings.Split(datum.Version, COLON)
if ns[0] == "jupyterlab-pytorch" {
option.ImageId = datum.ImageId
return nil
}
}
@ -412,6 +428,7 @@ func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {
if ns[0] == option.DatasetsName {
algorithmId = option.TaskType + DASH + file.Name
option.AlgorithmId = algorithmId
option.AlgorithmName = ns[1]
return nil
}
}
@ -430,6 +447,23 @@ func (s *ShuguangAi) generateEnv(option *option.AiOption) error {
}
// generateParams validates option.ResourceType and contains logic that
// appends a "card" entry to option.Params based on that resource type.
//
// It returns an error when ResourceType is empty. The switch further down
// appends "card" + COMMA + CPU for the CPU resource type and
// "card" + COMMA + "cuda:0" for the CARD resource type, and the final
// statement reports "failed to set params" for any other value.
//
// NOTE(review): as listed, the bare `return nil` right after the emptiness
// check returns unconditionally, so the switch and the final error are
// unreachable. This listing looks like a diff rendering with the +/- markers
// stripped, so that `return nil` is presumably the removed pre-change body -
// TODO confirm against the real file.
func (s *ShuguangAi) generateParams(option *option.AiOption) error {
// Reject options whose resource type was never set.
if option.ResourceType == "" {
return errors.New("ResourceType not set")
}
// NOTE(review): presumably the removed line of the diff; see the function comment.
return nil
//epoch := "epoch" + COMMA + "1"
//option.Params = append(option.Params, epoch)
switch option.ResourceType {
case CPU:
// The value part of the entry is the CPU constant itself.
card := "card" + COMMA + CPU
option.Params = append(option.Params, card)
return nil
case CARD:
// The value part of the entry is the fixed string "cuda:0".
card := "card" + COMMA + "cuda:0"
option.Params = append(option.Params, card)
return nil
}
// Reached only when ResourceType is neither CPU nor CARD.
return errors.New("failed to set params")
}

View File

@ -40,6 +40,7 @@ type Linkage interface {
}
const (
COLON = ":"
PY_PARAM_PREFIX = "--"
SPACE = " "
UNDERSCORE = "_"