Merge pull request '[migration] index migration skip retrying scroll if possible' (#70) from feature/migration into master

This commit is contained in:
silenceqi 2023-04-24 14:37:22 +08:00
commit aa9a8718d0
4 changed files with 146 additions and 54 deletions

View File

@ -7,15 +7,15 @@ package model
import (
"context"
"fmt"
"net/http"
"time"
"infini.sh/framework/core/agent"
"infini.sh/framework/core/orm"
"infini.sh/framework/core/util"
"infini.sh/framework/modules/pipeline"
"net/http"
"time"
)
type Instance struct {
orm.ORMObjectBase
@ -61,7 +61,6 @@ func (inst *Instance) StartPipeline(pipelineID string) error {
return inst.doRequest(req, nil)
}
func (inst *Instance) DeletePipeline(pipelineID string) error {
req := &util.Request{
Method: http.MethodDelete,
@ -98,6 +97,17 @@ func (inst *Instance) DeleteQueueBySelector(selector util.MapStr) error {
return inst.doRequest(req, nil)
}
func (inst *Instance) DeleteQueueConsumersBySelector(selector util.MapStr) error {
req := &util.Request{
Method: http.MethodDelete,
Url: fmt.Sprintf("%s/queue/consumer/_search", inst.Endpoint),
Body: util.MustToJSONBytes(util.MapStr{
"selector": selector,
}),
}
return inst.doRequest(req, nil)
}
func (inst *Instance) TryConnect(ctx context.Context) error {
req := &util.Request{
Method: http.MethodGet,

View File

@ -105,10 +105,16 @@ type IndexStateInfo struct {
IndexDocs float64
}
const (
IndexMigrationV0 = 0
IndexMigrationV1 = 1
)
type IndexMigrationTaskConfig struct {
Source IndexMigrationSourceConfig `json:"source"`
Target IndexMigrationTargetConfig `json:"target"`
Execution ExecutionConfig `json:"execution"`
Version int `json:"version"`
}
type IndexMigrationSourceConfig struct {

View File

@ -356,9 +356,9 @@ func (p *DispatcherProcessor) handleRunningSubTask(taskItem *task2.Task) error {
p.saveTaskAndWriteLog(taskItem, &task2.TaskResult{
Success: true,
}, "index migration completed")
}
// clean disk queue if bulk failed/completed
// clean disk queue if bulk completed
p.cleanGatewayQueue(taskItem)
}
p.decrInstanceJobs(instanceID)
return nil
@ -446,8 +446,6 @@ func (p *DispatcherProcessor) handlePendingStopSubTask(taskItem *task2.Task) err
if len(tasks) == 0 {
taskItem.Status = task2.StatusStopped
p.saveTaskAndWriteLog(taskItem, nil, fmt.Sprintf("index migration task [%s] stopped", taskItem.ID))
// clean disk queue if manually stopped
p.cleanGatewayQueue(taskItem)
}
return nil
}
@ -619,6 +617,13 @@ func (p *DispatcherProcessor) handleSplitSubTask(taskItem *task2.Task) error {
}
func (p *DispatcherProcessor) handleScheduleSubTask(taskItem *task2.Task) error {
cfg := migration_model.IndexMigrationTaskConfig{}
err := migration_util.GetTaskConfig(taskItem, &cfg)
if err != nil {
log.Errorf("failed to get task config, err: %v", err)
return fmt.Errorf("got wrong config of task [%s]", taskItem.ID)
}
scrollTask, bulkTask, err := p.getScrollBulkPipelineTasks(taskItem)
if err != nil {
log.Errorf("failed to get pipeline tasks, err: %v", err)
@ -630,7 +635,39 @@ func (p *DispatcherProcessor) handleScheduleSubTask(taskItem *task2.Task) error
return nil
}
instance, err := p.getPreferenceInstance(taskItem.ParentId[0])
instanceID, _ := util.ExtractString(taskItem.Metadata.Labels["execution_instance_id"])
totalDocs := cfg.Source.DocCount
scrolled, _, err := p.checkScrollPipelineTaskStatus(scrollTask, totalDocs)
redoScroll := true
if cfg.Version >= migration_model.IndexMigrationV1 {
// skip scroll if possible
if scrolled && err == nil {
redoScroll = false
// reset queue consumer offset
err = p.resetGatewayQueue(taskItem)
if err != nil {
log.Infof("task [%s] failed to reset gateway queue, redo scroll", taskItem.ID)
redoScroll = true
}
}
}
if !redoScroll {
migration_util.WriteLog(taskItem, nil, fmt.Sprintf("task [%s] skiping scroll", taskItem.ID))
} else {
var executionConfig migration_model.ExecutionConfig
if cfg.Version >= migration_model.IndexMigrationV1 {
executionConfig = cfg.Execution
} else {
executionConfig, err = p.getExecutionConfigFromMajorTask(taskItem)
if err != nil {
return fmt.Errorf("get execution config from parent task failed, err: %v", err)
}
}
// get a new instanceID
instance, err := p.getPreferenceInstance(executionConfig)
if err != nil {
return fmt.Errorf("get preference intance error: %w", err)
}
@ -638,20 +675,22 @@ func (p *DispatcherProcessor) handleScheduleSubTask(taskItem *task2.Task) error
log.Debugf("hit max tasks per instance with %d, skip dispatch", p.config.MaxTasksPerInstance)
return nil
}
instanceID = instance.ID
// update instance info first
scrollTask.Metadata.Labels["execution_instance_id"] = instanceID
// try to clear disk queue before running es_scroll
p.cleanGatewayQueue(taskItem)
// update scroll task to ready
scrollTask.Metadata.Labels["execution_instance_id"] = instance.ID
scrollTask.Status = task2.StatusReady
err = orm.Update(nil, scrollTask)
if err != nil {
return fmt.Errorf("update scroll pipeline task error: %w", err)
}
}
// update bulk task to init
bulkTask.Metadata.Labels["execution_instance_id"] = instance.ID
bulkTask.Metadata.Labels["execution_instance_id"] = instanceID
bulkTask.Status = task2.StatusInit
err = orm.Update(nil, bulkTask)
if err != nil {
@ -660,7 +699,7 @@ func (p *DispatcherProcessor) handleScheduleSubTask(taskItem *task2.Task) error
// update sub migration task status to running and save task log
taskItem.RetryTimes++
taskItem.Metadata.Labels["execution_instance_id"] = instance.ID
taskItem.Metadata.Labels["execution_instance_id"] = instanceID
taskItem.Metadata.Labels["index_docs"] = 0
taskItem.Metadata.Labels["scrolled_docs"] = 0
taskItem.Status = task2.StatusRunning
@ -670,11 +709,12 @@ func (p *DispatcherProcessor) handleScheduleSubTask(taskItem *task2.Task) error
Success: true,
}, fmt.Sprintf("task [%s] started", taskItem.ID))
// update dispatcher state
p.incrInstanceJobs(instance.ID)
p.incrInstanceJobs(instanceID)
return nil
}
func (p *DispatcherProcessor) getPreferenceInstance(majorTaskID string) (instance model.Instance, err error) {
func (p *DispatcherProcessor) getExecutionConfigFromMajorTask(taskItem *task2.Task) (config migration_model.ExecutionConfig, err error) {
majorTaskID := taskItem.ParentId[0]
majorTask := task2.Task{}
majorTask.ID = majorTaskID
_, err = orm.Get(&majorTask)
@ -688,11 +728,16 @@ func (p *DispatcherProcessor) getPreferenceInstance(majorTaskID string) (instanc
log.Errorf("failed to get task config, err: %v", err)
return
}
config = cfg.Settings.Execution
return
}
func (p *DispatcherProcessor) getPreferenceInstance(config migration_model.ExecutionConfig) (instance model.Instance, err error) {
var (
total = math.MaxInt
tempInst = model.Instance{}
)
for _, node := range cfg.Settings.Execution.Nodes.Permit {
for _, node := range config.Nodes.Permit {
instanceTotal := p.getInstanceState(node.ID).Total
if instanceTotal < total {
if p.config.CheckInstanceAvailable {
@ -917,6 +962,7 @@ func (p *DispatcherProcessor) splitMajorMigrationTask(taskItem *task2.Task) erro
Source: partitionSource,
Target: target,
Execution: clusterMigrationTask.Settings.Execution,
Version: migration_model.IndexMigrationV1,
}),
}
partitionMigrationTask.ID = util.GetUUID()
@ -932,6 +978,8 @@ func (p *DispatcherProcessor) splitMajorMigrationTask(taskItem *task2.Task) erro
indexParameters := migration_model.IndexMigrationTaskConfig{
Source: source,
Target: target,
Execution: clusterMigrationTask.Settings.Execution,
Version: migration_model.IndexMigrationV1,
}
indexMigrationTask := task2.Task{
ParentId: []string{taskItem.ID},
@ -1315,6 +1363,8 @@ func (p *DispatcherProcessor) refreshInstanceJobsFromES() error {
}
func (p *DispatcherProcessor) cleanGatewayQueue(taskItem *task2.Task) {
log.Debugf("cleaning gateway queue for task [%s]", taskItem.ID)
var err error
instance := model.Instance{}
instance.ID, _ = util.ExtractString(taskItem.Metadata.Labels["execution_instance_id"])
@ -1338,3 +1388,29 @@ func (p *DispatcherProcessor) cleanGatewayQueue(taskItem *task2.Task) {
log.Errorf("failed to delete queue, err: %v", err)
}
}
func (p *DispatcherProcessor) resetGatewayQueue(taskItem *task2.Task) error {
log.Debugf("resetting gateway queue offset for task [%s]", taskItem.ID)
var err error
instance := model.Instance{}
instance.ID, _ = util.ExtractString(taskItem.Metadata.Labels["execution_instance_id"])
_, err = orm.Get(&instance)
if err != nil {
log.Errorf("failed to get instance, err: %v", err)
return err
}
selector := util.MapStr{
"labels": util.MapStr{
"migration_task_id": taskItem.ID,
},
}
err = instance.DeleteQueueConsumersBySelector(selector)
if err != nil {
log.Errorf("failed to delete queue consumers, err: %v", err)
return err
}
return nil
}

View File

@ -92,7 +92,7 @@ func (p *processor) handleReadyPipelineTask(taskItem *task.Task) error {
taskItem.StartTimeInMillis = time.Now().UnixMilli()
p.saveTaskAndWriteLog(taskItem, &task.TaskResult{
Success: true,
}, fmt.Sprintf("pipeline task [%s] started", taskItem.ID))
}, fmt.Sprintf("[%v] pipeline task [%s] started", taskItem.Metadata.Labels["pipeline_id"], taskItem.ID))
return nil
}
@ -138,7 +138,7 @@ func (p *processor) handleRunningEsScrollPipelineTask(taskItem *task.Task) error
p.saveTaskAndWriteLog(taskItem, &task.TaskResult{
Success: errMsg == "",
Error: errMsg,
}, fmt.Sprintf("pipeline task [%s] completed", taskItem.ID))
}, fmt.Sprintf("[es_scroll] pipeline task [%s] completed", taskItem.ID))
p.cleanGatewayPipeline(taskItem)
return nil
}
@ -171,7 +171,7 @@ func (p *processor) handleRunningBulkIndexingPipelineTask(taskItem *task.Task) e
p.saveTaskAndWriteLog(taskItem, &task.TaskResult{
Success: errMsg == "",
Error: errMsg,
}, fmt.Sprintf("pipeline task [%s] completed", taskItem.ID))
}, fmt.Sprintf("[bulk_indexing] pipeline task [%s] completed", taskItem.ID))
p.cleanGatewayPipeline(taskItem)
return nil
}
@ -194,7 +194,7 @@ func (p *processor) handlePendingStopPipelineTask(taskItem *task.Task) error {
if stopped {
taskItem.Status = task.StatusStopped
p.saveTaskAndWriteLog(taskItem, nil, fmt.Sprintf("task [%s] stopped", taskItem.ID))
p.saveTaskAndWriteLog(taskItem, nil, fmt.Sprintf("[%v] task [%s] stopped", taskItem.Metadata.Labels["pipeline_id"], taskItem.ID))
p.cleanGatewayPipeline(taskItem)
return nil
}