Merge pull request 'updated aitask submit implementations' (#48) from tzwang/pcm-coordinator:master into master
Former-commit-id: d9cd1d41ac76d168a46afa70a289e7dc748a34da
This commit is contained in:
commit
05cfa23329
|
@ -46,7 +46,8 @@ const (
|
||||||
SAILINGSI = "sailingsi"
|
SAILINGSI = "sailingsi"
|
||||||
MLU = "MLU"
|
MLU = "MLU"
|
||||||
CAMBRICONMLU290 = 256
|
CAMBRICONMLU290 = 256
|
||||||
GCU = "enflame"
|
GCU = "GCU"
|
||||||
|
ENFLAME = "enflame"
|
||||||
EnflameT20 = 128
|
EnflameT20 = 128
|
||||||
BASE_TOPS = 128
|
BASE_TOPS = 128
|
||||||
CAMBRICON = "cambricon"
|
CAMBRICON = "cambricon"
|
||||||
|
@ -57,7 +58,7 @@ const (
|
||||||
var (
|
var (
|
||||||
cardAliasMap = map[string]string{
|
cardAliasMap = map[string]string{
|
||||||
MLU: CAMBRICON,
|
MLU: CAMBRICON,
|
||||||
GCU: GCU,
|
GCU: ENFLAME,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -341,11 +342,10 @@ func (o *OctopusLink) generateResourceId(option *option.AiOption) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.ResourceType == CARD {
|
if option.ResourceType == CARD {
|
||||||
err = setResourceIdByCard(option, specResp, MLU)
|
err = setResourceIdByCard(option, specResp, GCU)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -418,16 +418,23 @@ func (o *OctopusLink) generateImageId(option *option.AiOption) error {
|
||||||
if !preImgResp.Success {
|
if !preImgResp.Success {
|
||||||
return errors.New("failed to get PresetImages")
|
return errors.New("failed to get PresetImages")
|
||||||
}
|
}
|
||||||
for _, image := range preImgResp.Payload.Images {
|
|
||||||
if strings.Contains(image.ImageName, option.TaskType) && strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
|
if option.ResourceType == CARD {
|
||||||
option.ImageId = image.Id
|
for _, image := range preImgResp.Payload.Images {
|
||||||
return nil
|
if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
|
||||||
|
option.ImageId = image.Id
|
||||||
|
return nil
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return errors.New("failed to get ImageId")
|
return errors.New("failed to get ImageId")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
||||||
|
// temporarily set algorithm to cnn
|
||||||
|
option.AlgorithmName = "cnn"
|
||||||
|
|
||||||
req := &octopus.GetMyAlgorithmListReq{
|
req := &octopus.GetMyAlgorithmListReq{
|
||||||
Platform: o.platform,
|
Platform: o.platform,
|
||||||
PageIndex: o.pageIndex,
|
PageIndex: o.pageIndex,
|
||||||
|
@ -442,18 +449,33 @@ func (o *OctopusLink) generateAlgorithmId(option *option.AiOption) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, algorithm := range resp.Payload.Algorithms {
|
for _, algorithm := range resp.Payload.Algorithms {
|
||||||
if algorithm.FrameworkName == strings.Title(option.TaskType) && strings.Contains(algorithm.AlgorithmName, option.DatasetsName) {
|
if algorithm.FrameworkName == strings.Title(option.TaskType) {
|
||||||
|
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
|
||||||
|
if ns[0] != option.DatasetsName {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if ns[1] != option.AlgorithmName {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if ns[2] != option.ResourceType {
|
||||||
|
continue
|
||||||
|
}
|
||||||
option.AlgorithmId = algorithm.AlgorithmId
|
option.AlgorithmId = algorithm.AlgorithmId
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return errors.New("failed to get AlgorithmId")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
||||||
if option.Cmd == "" {
|
if option.Cmd == "" {
|
||||||
option.Cmd = TRAIN_CMD
|
switch option.ComputeCard {
|
||||||
|
case GCU:
|
||||||
|
option.Cmd = "cd /code; python3 train.py"
|
||||||
|
default:
|
||||||
|
option.Cmd = TRAIN_CMD
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
|
|
@ -57,17 +57,17 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||||
},
|
},
|
||||||
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
|
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
|
||||||
CPU: 2,
|
CPU: 2,
|
||||||
GPU: 1,
|
GPU: 3,
|
||||||
RAM: 4 * RAM_SIZE_1G,
|
RAM: 4 * RAM_SIZE_1G,
|
||||||
},
|
},
|
||||||
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
|
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
|
||||||
CPU: 5,
|
CPU: 4,
|
||||||
GPU: 1,
|
GPU: 4,
|
||||||
RAM: 10 * RAM_SIZE_1G,
|
RAM: 8 * RAM_SIZE_1G,
|
||||||
},
|
},
|
||||||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
|
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
|
||||||
CPU: 5,
|
CPU: 5,
|
||||||
GPU: 2,
|
GPU: 5,
|
||||||
RAM: 10 * RAM_SIZE_1G,
|
RAM: 10 * RAM_SIZE_1G,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -75,9 +75,9 @@ var RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
|
||||||
var RESOURCESPECSAI = map[string]string{
|
var RESOURCESPECSAI = map[string]string{
|
||||||
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
|
"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
|
||||||
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
|
"6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
|
||||||
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:1, RAM:4G",
|
"OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
|
||||||
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:5, DCU:1, RAM:10G",
|
"sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
|
||||||
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:2, RAM:10G",
|
"jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
|
||||||
}
|
}
|
||||||
|
|
||||||
type ResourceSpecSGAI struct {
|
type ResourceSpecSGAI struct {
|
||||||
|
@ -352,18 +352,29 @@ func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
|
||||||
|
|
||||||
if option.ResourceType == CPU {
|
if option.ResourceType == CPU {
|
||||||
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.ResourceType == CARD {
|
if option.ResourceType == CARD {
|
||||||
if option.Tops == 0 {
|
if 0 <= option.Tops && option.Tops <= DCU_TOPS {
|
||||||
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.Tops > DCU_TOPS {
|
cardNum := 5
|
||||||
|
for k, v := range RESOURCESGAIMAP {
|
||||||
|
for i := 1; i <= cardNum; i++ {
|
||||||
|
if float64(i)*DCU_TOPS <= option.Tops && option.Tops <= float64(v.GPU)*DCU_TOPS {
|
||||||
|
option.ResourceId = k
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if option.Tops > float64(cardNum)*DCU_TOPS {
|
||||||
option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
|
option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//Todo add more dcu specs
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return errors.New("failed to get ResourceId")
|
return errors.New("failed to get ResourceId")
|
||||||
|
@ -386,7 +397,12 @@ func (s *ShuguangAi) generateImageId(option *option.AiOption) error {
|
||||||
return errors.New("failed to get imageId")
|
return errors.New("failed to get imageId")
|
||||||
}
|
}
|
||||||
|
|
||||||
if option.ResourceType == CPU {
|
for _, datum := range resp.Data {
|
||||||
|
ns := strings.Split(datum.Version, COLON)
|
||||||
|
if ns[0] == "jupyterlab-pytorch" {
|
||||||
|
option.ImageId = datum.ImageId
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -412,6 +428,7 @@ func (s *ShuguangAi) generateAlgorithmId(option *option.AiOption) error {
|
||||||
if ns[0] == option.DatasetsName {
|
if ns[0] == option.DatasetsName {
|
||||||
algorithmId = option.TaskType + DASH + file.Name
|
algorithmId = option.TaskType + DASH + file.Name
|
||||||
option.AlgorithmId = algorithmId
|
option.AlgorithmId = algorithmId
|
||||||
|
option.AlgorithmName = ns[1]
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -430,6 +447,23 @@ func (s *ShuguangAi) generateEnv(option *option.AiOption) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *ShuguangAi) generateParams(option *option.AiOption) error {
|
func (s *ShuguangAi) generateParams(option *option.AiOption) error {
|
||||||
|
if option.ResourceType == "" {
|
||||||
|
return errors.New("ResourceType not set")
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
//epoch := "epoch" + COMMA + "1"
|
||||||
|
//option.Params = append(option.Params, epoch)
|
||||||
|
|
||||||
|
switch option.ResourceType {
|
||||||
|
case CPU:
|
||||||
|
card := "card" + COMMA + CPU
|
||||||
|
option.Params = append(option.Params, card)
|
||||||
|
return nil
|
||||||
|
case CARD:
|
||||||
|
card := "card" + COMMA + "cuda:0"
|
||||||
|
option.Params = append(option.Params, card)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return errors.New("failed to set params")
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,7 @@ type Linkage interface {
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
COLON = ":"
|
||||||
PY_PARAM_PREFIX = "--"
|
PY_PARAM_PREFIX = "--"
|
||||||
SPACE = " "
|
SPACE = " "
|
||||||
UNDERSCORE = "_"
|
UNDERSCORE = "_"
|
||||||
|
|
Loading…
Reference in New Issue