Merge pull request 'updated aitask submit implementations' (#48) from tzwang/pcm-coordinator:master into master
Former-commit-id: d9cd1d41ac76d168a46afa70a289e7dc748a34da
This commit is contained in:
commit
05cfa23329
|
@ -46,7 +46,8 @@ const (
|
|||
SAILINGSI = "sailingsi"
|
||||
MLU = "MLU"
|
||||
CAMBRICONMLU290 = 256
|
||||
GCU = "enflame"
|
||||
GCU = "GCU"
|
||||
ENFLAME = "enflame"
|
||||
EnflameT20 = 128
|
||||
BASE_TOPS = 128
|
||||
CAMBRICON = "cambricon"
|
||||
|
@ -57,7 +58,7 @@ const (
|
|||
var (
|
||||
cardAliasMap = map[string]string{
|
||||
MLU: CAMBRICON,
|
||||
GCU: GCU,
|
||||
GCU: ENFLAME,
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -341,11 +342,10 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error {
|
|||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if option.ResourceType == CARD {
|
||||
err = setResourceIdByCard(option, specResp, MLU)
|
||||
err = setResourceIdByCard(option, specResp, GCU)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -418,16 +418,23 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error {
|
|||
if !preImgResp.Success {
|
||||
return errors.New("failed to get PresetImages")
|
||||
}
|
||||
for _, image := range preImgResp.Payload.Images {
|
||||
if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
|
||||
option.ImageId = image.Id
|
||||
return nil
|
||||
|
||||
if option.ResourceType == CARD {
|
||||
for _, image := range preImgResp.Payload.Images {
|
||||
if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
|
||||
option.ImageId = image.Id
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return errors.New("failed to get ImageId")
|
||||
}
|
||||
|
||||
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
||||
// temporarily set algorithm to cnn
|
||||
option.AlgorithmName = "cnn"
|
||||
|
||||
req := &octopus.GetMyAlgorithmListReq{
|
||||
Platform: o.platform,
|
||||
PageIndex: o.pageIndex,
|
||||
|
@ -442,18 +449,33 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
|||
}
|
||||
|
||||
for _, algorithm := range resp.Payload.Algorithms {
|
||||
if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) {
|
||||
if algorithm.FrameworkName == strings.Title(option.TaskType) {
|
||||
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
|
||||
if ns[0] != option.DatasetsName {
|
||||
continue
|
||||
}
|
||||
if ns[1] != option.AlgorithmName {
|
||||
continue
|
||||
}
|
||||
if ns[2] != option.ResourceType {
|
||||
continue
|
||||
}
|
||||
option.AlgorithmId = algorithm.AlgorithmId
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
return errors.New("failed to get AlgorithmId")
|
||||
}
|
||||
|
||||
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||
if option.Cmd == "" {
|
||||
option.Cmd = TRAIN_CMD
|
||||
switch option.ComputeCard {
|
||||
case GCU:
|
||||
option.Cmd = "cd /code; python3 train.py"
|
||||
default:
|
||||
option.Cmd = TRAIN_CMD
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
|
|
|
@ -57,17 +57,17 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
|||
},
|
||||
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
|
||||
CPU: 2,
|
||||
GPU: 1,
|
||||
GPU: 3,
|
||||
RAM: 4 * RAM_SIZE_1G,
|
||||
},
|
||||
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
|
||||
CPU: 5,
|
||||
GPU: 1,
|
||||
RAM: 10 * RAM_SIZE_1G,
|
||||
CPU: 4,
|
||||
GPU: 4,
|
||||
RAM: 8 * RAM_SIZE_1G,
|
||||
},
|
||||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
|
||||
CPU: 5,
|
||||
GPU: 2,
|
||||
GPU: 5,
|
||||
RAM: 10 * RAM_SIZE_1G,
|
||||
},
|
||||
}
|
||||
|
@ -75,9 +75,9 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
|||
// RESOURCESPECSAI maps a resource spec ID to a human-readable summary of the
// CPU, DCU and RAM that the spec provides.
//
// Every ID is listed exactly once; repeating a constant string key in a map
// literal is a compile error, so only the current description of each spec
// is kept.
var RESOURCESPECSAI = map[string]string{
	"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
	"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
	"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
	"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
	"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
}
|
||||
|
||||
type ResourceSpecSGAI struct {
|
||||
|
@ -352,18 +352,29 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
|
|||
|
||||
if option.ResourceType == CPU {
|
||||
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
||||
return nil
|
||||
}
|
||||
|
||||
if option.ResourceType == CARD {
|
||||
if option.Tops == 0 {
|
||||
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
|
||||
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
||||
return nil
|
||||
}
|
||||
|
||||
if option.Tops > DCU_TOPS {
|
||||
cardNum := 5
|
||||
for k, v := range RESOURCESGAIMAP {
|
||||
for i := 1; i <= cardNum; i++ {
|
||||
if float64(i)*DCU_TOPS <= option.Tops && option.Tops <= float64(v.GPU)*DCU_TOPS {
|
||||
option.ResourceId = k
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if option.Tops > float64(cardNum)*DCU_TOPS {
|
||||
option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
|
||||
return nil
|
||||
}
|
||||
|
||||
//Todo add more dcu specs
|
||||
}
|
||||
|
||||
return errors.New("failed to get ResourceId")
|
||||
|
@ -386,7 +397,12 @@ func (s *ShuguangAi) generateImageId(option *option.AiOption) error {
|
|||
return errors.New("failed to get imageId")
|
||||
}
|
||||
|
||||
if option.ResourceType == CPU {
|
||||
for _, datum := range resp.Data {
|
||||
ns := strings.Split(datum.Version, COLON)
|
||||
if ns[0] == "jupyterlab-pytorch" {
|
||||
option.ImageId = datum.ImageId
|
||||
return nil
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -412,6 +428,7 @@ func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {
|
|||
if ns[0] == option.DatasetsName {
|
||||
algorithmId = option.TaskType + DASH + file.Name
|
||||
option.AlgorithmId = algorithmId
|
||||
option.AlgorithmName = ns[1]
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -430,6 +447,23 @@ func (s *ShuguangAi) generateEnv(option *option.AiOption) error {
|
|||
}
|
||||
|
||||
func (s *ShuguangAi) generateParams(option *option.AiOption) error {
|
||||
if option.ResourceType == "" {
|
||||
return errors.New("ResourceType not set")
|
||||
}
|
||||
|
||||
return nil
|
||||
//epoch := "epoch" + COMMA + "1"
|
||||
//option.Params = append(option.Params, epoch)
|
||||
|
||||
switch option.ResourceType {
|
||||
case CPU:
|
||||
card := "card" + COMMA + CPU
|
||||
option.Params = append(option.Params, card)
|
||||
return nil
|
||||
case CARD:
|
||||
card := "card" + COMMA + "cuda:0"
|
||||
option.Params = append(option.Params, card)
|
||||
return nil
|
||||
}
|
||||
|
||||
return errors.New("failed to set params")
|
||||
}
|
||||
|
|
|
@ -40,6 +40,7 @@ type Linkage interface {
|
|||
}
|
||||
|
||||
const (
|
||||
COLON = ":"
|
||||
PY_PARAM_PREFIX = "--"
|
||||
SPACE = " "
|
||||
UNDERSCORE = "_"
|
||||
|
|
Loading…
Reference in New Issue