WIP: 拆解 TiDB Parser & Planner 篇
关于 TiDB 的序

Protocol Layer
Session Context
*clientConn.handleQuery
编译原理的序
Lexer
NFA/DFA
定义有限状态机 M 描述为 $M = (S, \Sigma, \delta, S_0, F)$
- $S$ 为有限状态集(如下图的 $\{0,1,2,3\}$ 状态);
- $\Sigma$ 为输入的字符集,空字符(通常用 $\epsilon$ 表示)不归属于 $\Sigma$ 中;
- $\delta$ 为状态转移函数,描述由某一状态接受到某字符后的状态变更;
- $S_0$ 为起始状态集,$start$ 箭头所指向的状态 $0$;
- $F$ 为终止状态集,用双圈包裹标记的状态。
而 NFA 与 DFA 的区别为,
- NFA 的转移函数 $\delta = S \times ( \Sigma \cup \epsilon ) $ 的结果集范围等于有限状态集的幂集 $2^{S}$;
- DFA 的转移函数 $\delta = S \times \Sigma $ 结果集范围等于有限状态集;
- 在实际意义上,DFA 状态转移的目标状态只有一种(有限状态集的一个状态点),而 NFA 的目标状态是有多种(幂集中的一个状态集合),故此 NFA 匹配失败时,需要回溯。

TiDB 采用 Tire 进行 token 识别,关键字词汇辅以 HashMap 识别。
Grammar
二义性文法
CYK & Earley

Parser
Lexer
yylval
yyval
ruleTable & tokenMap & windowFuncTokenMap
Yacc
%union 制定 Lexer 的 yylval
%union {
offset int // offset
item interface{}
ident string
expr ast.ExprNode
statement ast.StmtNode
}
https://github.com/mysql/mysql-server/blob/8.0/sql/sql_yacc.yy
https://github.com/pingcap/tidb/blob/master/parser/parser.y
*session.Parse
Planner
Logical Plan
*clientConn.handleStmt -> *TiDBContext.ExecuteStmt -> *session.ExecuteStmt
func (s *session) ExecuteStmt(ctx context.Context, stmtNode ast.StmtNode) (sqlexec.RecordSet, error) {
if err := s.PrepareTxnCtx(ctx); err != nil {
return nil, err
}
compiler := executor.Compiler{Ctx: s}
stmt, err := compiler.Compile(ctx, stmtNode)
if err != nil {
return nil, err
}
s.currentPlan = stmt.Plan
// Execute the physical plan.
return runStmt(ctx, s, stmt)
}
*Compiler.Compile
// Compile compiles an ast.StmtNode to a physical plan.
func (c *Compiler) Compile(ctx context.Context, stmtNode ast.StmtNode) (_ *ExecStmt, err error) {
ret := &plannercore.PreprocessorReturn{}
err = plannercore.Preprocess(ctx, c.Ctx,
stmtNode,
plannercore.WithPreprocessorReturn(ret),
plannercore.InitTxnContextProvider,
)
if err != nil {
return nil, err
}
is := sessiontxn.GetTxnManager(c.Ctx).GetTxnInfoSchema()
sessVars := c.Ctx.GetSessionVars()
stmtCtx := sessVars.StmtCtx
// handle the execute statement
finalPlan, names, err := planner.Optimize(ctx, c.Ctx, stmtNode, is)
if err != nil {
return nil, err
}
stmtCtx.SetPlan(finalPlan)
stmt := &ExecStmt{
GoCtx: ctx,
InfoSchema: is,
Plan: finalPlan,
Text: stmtNode.Text(),
StmtNode: stmtNode,
Ctx: c.Ctx,
OutputNames: names,
Ti: &TelemetryInfo{},
}
return stmt, nil
}
planner.Optimize->planner.optimize
func optimize(ctx context.Context, sctx sessionctx.Context, node ast.Node, is infoschema.InfoSchema) (core.Plan, types.NameSlice, float64, error) {
// build logical plan
hintProcessor := &hint.BlockHintProcessor{Ctx: sctx}
node.Accept(hintProcessor)
defer hintProcessor.HandleUnusedViewHints()
builder := planBuilderPool.Get().(*core.PlanBuilder)
defer planBuilderPool.Put(builder.ResetForReuse())
builder.Init(sctx, is, hintProcessor)
p, err := buildLogicalPlan(ctx, sctx, node, builder)
if err != nil {
return nil, nil, 0, err
}
names := p.OutputNames()
// Handle the non-logical plan statement.
logic, isLogicalPlan := p.(core.LogicalPlan)
if !isLogicalPlan {
return p, names, 0, nil
}
// Handle the logical plan statement, use cascades planner if enabled.
if sctx.GetSessionVars().GetEnableCascadesPlanner() {
finalPlan, cost, err := cascades.DefaultOptimizer.FindBestPlan(sctx, logic)
return finalPlan, names, cost, err
}
finalPlan, cost, err := core.DoOptimize(ctx, sctx, builder.GetOptFlag(), logic)
return finalPlan, names, cost, err
}
planner.buildLogicalPlan
func buildLogicalPlan(ctx context.Context, sctx sessionctx.Context, node ast.Node, builder *core.PlanBuilder) (core.Plan, error) {
// reset fields about rewrite
sctx.GetSessionVars().RewritePhaseInfo.Reset()
p, err := builder.Build(ctx, node)
if err != nil {
return nil, err
}
return p, nil
}
Physical Plan
// DoOptimize optimizes a logical plan to a physical plan.
func DoOptimize(ctx context.Context, sctx sessionctx.Context, flag uint64, logic LogicalPlan) (PhysicalPlan, float64, error) {
// if there is something after flagPrunColumns, do flagPrunColumnsAgain
if flag&flagPrunColumns > 0 && flag-flagPrunColumns > flagPrunColumns {
flag |= flagPrunColumnsAgain
}
if checkStableResultMode(sctx) {
flag |= flagStabilizeResults
}
if sctx.GetSessionVars().StmtCtx.StraightJoinOrder {
// When we use the straight Join Order hint, we should disable the join reorder optimization.
flag &= ^flagJoinReOrder
}
flag |= flagCollectPredicateColumnsPoint
flag |= flagSyncWaitStatsLoadPoint
logic, err := logicalOptimize(ctx, flag, logic)
if err != nil {
return nil, 0, err
}
if !AllowCartesianProduct.Load() && existsCartesianProduct(logic) {
return nil, 0, errors.Trace(ErrCartesianProductUnsupported)
}
planCounter := PlanCounterTp(sctx.GetSessionVars().StmtCtx.StmtHints.ForceNthPlan)
if planCounter == 0 {
planCounter = -1
}
physical, cost, err := physicalOptimize(logic, &planCounter)
if err != nil {
return nil, 0, err
}
finalPlan, err := postOptimize(ctx, sctx, physical)
if err != nil {
return nil, 0, err
}
return finalPlan, cost, nil
}
Cascaded Physical Plan
运行执行器
// runStmt executes the sqlexec.Statement and commit or rollback the current transaction.
func runStmt(ctx context.Context, se *session, s sqlexec.Statement) (rs sqlexec.RecordSet, err error) {
rs, err = s.Exec(ctx)
if rs != nil {
return &execStmtResult{
RecordSet: rs,
sql: s,
se: se,
}, err
}
err = finishStmt(ctx, se, err, s)
return nil, err
}
func finishStmt(ctx context.Context, se *session, meetsErr error, sql sqlexec.Statement) error {
sessVars := se.sessionVars
if !sql.IsReadOnly(sessVars) {
// Handle the stmt commit/rollback.
if se.txn.Valid() {
if meetsErr != nil {
se.StmtRollback(ctx, false)
} else {
se.StmtCommit(ctx)
}
}
}
err := autoCommitAfterStmt(ctx, se, meetsErr, sql)
if se.txn.pending() {
// After run statement finish, txn state is still pending means the
// statement never need a Txn(), such as:
//
// set @@tidb_general_log = 1
// set @@autocommit = 0
// select 1
//
// Reset txn state to invalid to dispose the pending start ts.
se.txn.changeToInvalid()
}
if err != nil {
return err
}
return checkStmtLimit(ctx, se)
}
参考资料
- https://github.com/pingcap/tidb
- https://www.udacity.com/course/compilers-theory-and-practice–ud168
- https://www.icourse163.org/course/HIT-1002123007
- https://courses.cs.washington.edu/courses/cse401/16wi/lectures/09-AG-SDT-wi16.pdf
- https://courses.cs.washington.edu/courses/cse401/16wi/lectures/10-CYK-Earley-Disambig-wi16.pdf
- https://nlogn.art/wp-content/uploads/2023/04/Language-Implementation-Patterns.pdf
- https://gist.github.com/tkrs/23ab96cf84284ec6848c1c0a16611e21
- https://www.bilibili.com/video/BV1ea41187KK