WIP: 拆解 TiDB Parser & Planner 篇

关于 TiDB 的序

Protocol Layer

Session Context

*clientConn.handleQuery

编译原理的序

Lexer

NFA/DFA

定义有限状态机 M 描述为 $M = (S, \Sigma, \delta, S_0, F)$

  1. $S$ 为有限状态集(如下图的 $\{0,1,2,3\}$ 状态);
  2. $\Sigma$ 为输入的字符集,空字符(通常用 $\epsilon$ 表示)不归属于 $\Sigma$ 中;
  3. $\delta$ 为状态转移函数,描述由某一状态接受到某字符后的状态变更;
  4. $S_0$ 为起始状态集,$start$ 箭头所指向的状态 $0$;
  5. $F$ 为终止状态集,用双圈包裹标记的状态。

而 NFA 与 DFA 的区别为,

  1. NFA 的转移函数 $\delta = S \times ( \Sigma \cup \epsilon ) $ 的结果集范围等于有限状态集的幂集 $2^{S}$;
  2. DFA 的转移函数 $\delta = S \times \Sigma $ 结果集范围等于有限状态集;
  3. 在实际意义上,DFA 状态转移的目标状态只有一种(有限状态集的一个状态点),而 NFA 的目标状态是有多种(幂集中的一个状态集合),故此 NFA 匹配失败时,需要回溯。

TiDB 采用 Tire 进行 token 识别,关键字词汇辅以 HashMap 识别。

Grammar

二义性文法

CYK & Earley

Parser

Lexer

yylval

yyval

ruleTable & tokenMap & windowFuncTokenMap

Yacc

%union 制定 Lexer 的 yylval

%union {
	offset int // offset
	item interface{}
	ident string
	expr ast.ExprNode
	statement ast.StmtNode
}

https://github.com/mysql/mysql-server/blob/8.0/sql/sql_yacc.yy

https://github.com/pingcap/tidb/blob/master/parser/parser.y

*session.Parse

Planner

Logical Plan

*clientConn.handleStmt -> *TiDBContext.ExecuteStmt -> *session.ExecuteStmt

func (s *session) ExecuteStmt(ctx context.Context, stmtNode ast.StmtNode) (sqlexec.RecordSet, error) {
	if err := s.PrepareTxnCtx(ctx); err != nil {
		return nil, err
	}
	compiler := executor.Compiler{Ctx: s}
	stmt, err := compiler.Compile(ctx, stmtNode)
	if err != nil {
		return nil, err
	}
	s.currentPlan = stmt.Plan

	// Execute the physical plan.
	return runStmt(ctx, s, stmt)
}

*Compiler.Compile

// Compile compiles an ast.StmtNode to a physical plan.
func (c *Compiler) Compile(ctx context.Context, stmtNode ast.StmtNode) (_ *ExecStmt, err error) {
	ret := &plannercore.PreprocessorReturn{}
	err = plannercore.Preprocess(ctx, c.Ctx,
		stmtNode,
		plannercore.WithPreprocessorReturn(ret),
		plannercore.InitTxnContextProvider,
	)
	if err != nil {
		return nil, err
	}
	
	is := sessiontxn.GetTxnManager(c.Ctx).GetTxnInfoSchema()
	sessVars := c.Ctx.GetSessionVars()
	stmtCtx := sessVars.StmtCtx
	
	// handle the execute statement
	finalPlan, names, err := planner.Optimize(ctx, c.Ctx, stmtNode, is)
	if err != nil {
		return nil, err
	}
	stmtCtx.SetPlan(finalPlan)
	stmt := &ExecStmt{
		GoCtx:         ctx,
		InfoSchema:    is,
		Plan:          finalPlan,
		Text:          stmtNode.Text(),
		StmtNode:      stmtNode,
		Ctx:           c.Ctx,
		OutputNames:   names,
		Ti:            &TelemetryInfo{},
	}
	return stmt, nil
}

planner.Optimize->planner.optimize

func optimize(ctx context.Context, sctx sessionctx.Context, node ast.Node, is infoschema.InfoSchema) (core.Plan, types.NameSlice, float64, error) {
	// build logical plan
	hintProcessor := &hint.BlockHintProcessor{Ctx: sctx}
	node.Accept(hintProcessor)
	defer hintProcessor.HandleUnusedViewHints()
	builder := planBuilderPool.Get().(*core.PlanBuilder)
	defer planBuilderPool.Put(builder.ResetForReuse())
	builder.Init(sctx, is, hintProcessor)
	p, err := buildLogicalPlan(ctx, sctx, node, builder)
	if err != nil {
		return nil, nil, 0, err
	}

	names := p.OutputNames()

	// Handle the non-logical plan statement.
	logic, isLogicalPlan := p.(core.LogicalPlan)
	if !isLogicalPlan {
		return p, names, 0, nil
	}

	// Handle the logical plan statement, use cascades planner if enabled.
	if sctx.GetSessionVars().GetEnableCascadesPlanner() {
		finalPlan, cost, err := cascades.DefaultOptimizer.FindBestPlan(sctx, logic)
		return finalPlan, names, cost, err
	}

	finalPlan, cost, err := core.DoOptimize(ctx, sctx, builder.GetOptFlag(), logic)
	return finalPlan, names, cost, err
}

planner.buildLogicalPlan

func buildLogicalPlan(ctx context.Context, sctx sessionctx.Context, node ast.Node, builder *core.PlanBuilder) (core.Plan, error) {
	// reset fields about rewrite
	sctx.GetSessionVars().RewritePhaseInfo.Reset()
	p, err := builder.Build(ctx, node)
	if err != nil {
		return nil, err
	}
	return p, nil
}

Physical Plan

// DoOptimize optimizes a logical plan to a physical plan.
func DoOptimize(ctx context.Context, sctx sessionctx.Context, flag uint64, logic LogicalPlan) (PhysicalPlan, float64, error) {
	// if there is something after flagPrunColumns, do flagPrunColumnsAgain
	if flag&flagPrunColumns > 0 && flag-flagPrunColumns > flagPrunColumns {
		flag |= flagPrunColumnsAgain
	}
	if checkStableResultMode(sctx) {
		flag |= flagStabilizeResults
	}
	if sctx.GetSessionVars().StmtCtx.StraightJoinOrder {
		// When we use the straight Join Order hint, we should disable the join reorder optimization.
		flag &= ^flagJoinReOrder
	}
	flag |= flagCollectPredicateColumnsPoint
	flag |= flagSyncWaitStatsLoadPoint
	logic, err := logicalOptimize(ctx, flag, logic)
	if err != nil {
		return nil, 0, err
	}
	if !AllowCartesianProduct.Load() && existsCartesianProduct(logic) {
		return nil, 0, errors.Trace(ErrCartesianProductUnsupported)
	}
	planCounter := PlanCounterTp(sctx.GetSessionVars().StmtCtx.StmtHints.ForceNthPlan)
	if planCounter == 0 {
		planCounter = -1
	}
	physical, cost, err := physicalOptimize(logic, &planCounter)
	if err != nil {
		return nil, 0, err
	}
	finalPlan, err := postOptimize(ctx, sctx, physical)
	if err != nil {
		return nil, 0, err
	}
	return finalPlan, cost, nil
}

Cascaded Physical Plan

运行执行器

// runStmt executes the sqlexec.Statement and commit or rollback the current transaction.
func runStmt(ctx context.Context, se *session, s sqlexec.Statement) (rs sqlexec.RecordSet, err error) {
	rs, err = s.Exec(ctx)
	if rs != nil {
		return &execStmtResult{
			RecordSet: rs,
			sql:       s,
			se:        se,
		}, err
	}

	err = finishStmt(ctx, se, err, s)
	return nil, err
}

func finishStmt(ctx context.Context, se *session, meetsErr error, sql sqlexec.Statement) error {
	sessVars := se.sessionVars
	if !sql.IsReadOnly(sessVars) {
		// Handle the stmt commit/rollback.
		if se.txn.Valid() {
			if meetsErr != nil {
				se.StmtRollback(ctx, false)
			} else {
				se.StmtCommit(ctx)
			}
		}
	}
	err := autoCommitAfterStmt(ctx, se, meetsErr, sql)
	if se.txn.pending() {
		// After run statement finish, txn state is still pending means the
		// statement never need a Txn(), such as:
		//
		// set @@tidb_general_log = 1
		// set @@autocommit = 0
		// select 1
		//
		// Reset txn state to invalid to dispose the pending start ts.
		se.txn.changeToInvalid()
	}
	if err != nil {
		return err
	}
	return checkStmtLimit(ctx, se)
}

参考资料

  1. https://github.com/pingcap/tidb
  2. https://www.udacity.com/course/compilers-theory-and-practice–ud168
  3. https://www.icourse163.org/course/HIT-1002123007
  4. https://courses.cs.washington.edu/courses/cse401/16wi/lectures/09-AG-SDT-wi16.pdf
  5. https://courses.cs.washington.edu/courses/cse401/16wi/lectures/10-CYK-Earley-Disambig-wi16.pdf
  6. https://nlogn.art/wp-content/uploads/2023/04/Language-Implementation-Patterns.pdf
  7. https://gist.github.com/tkrs/23ab96cf84284ec6848c1c0a16611e21
  8. https://www.bilibili.com/video/BV1ea41187KK

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注