innodb对B树游标的定位过程以及对“小于(等于)B树最小记录”的特殊处理

2023年9月25日 429次阅读来源: B树

innodb对B树进行游标定位时，主要通过函数btr_cur_search_to_nth_level进行。该函数从根页开始向下层页迭代，直到指定的层级level，最终将B树游标定位在第一个大于/小于(等于)tuple的位置。下面先不考虑页面latch、锁、自适应哈希索引、插入缓冲的影响，仅看B树游标定位：

UNIV_INTERN

void

btr_cur_search_to_nth_level(

/*========================*/

dict_index_t* index, /*!< in: index */

ulint level, /*!< in: the tree level of search */

const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in

tuple must be set so that it cannot get

compared to the node ptr page number field! */

ulint mode, /*!< in: PAGE_CUR_L, …;

Inserts should always be made using

PAGE_CUR_LE to search the position! */

ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, …, ORed with

at most one of BTR_INSERT, BTR_DELETE_MARK,

BTR_DELETE, or BTR_ESTIMATE;

cursor->left_block is used to store a pointer

to the left neighbor page, in the cases

BTR_SEARCH_PREV and BTR_MODIFY_PREV;

NOTE that if has_search_latch

is != 0, we maybe do not have a latch set

on the cursor page, we assume

the caller uses his search latch

to protect the record! */

btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is

s- or x-latched, but see also above! */

ulint has_search_latch,/*!< in: info on the latch mode the

caller currently has on btr_search_latch:

RW_S_LATCH, or 0 */

const char* file, /*!< in: file name */

ulint line, /*!< in: line where called */

mtr_t* mtr) /*!< in: mtr */

{

…

//1.取得根页页号：

page_cursor = btr_cur_get_page_cur(cursor);

space = dict_index_get_space(index);

page_no = dict_index_get_page(index);

…

//2.对于非叶子页，将游标定位模式标准化：

switch (mode) {

case PAGE_CUR_GE:

page_mode = PAGE_CUR_L;

break;

case PAGE_CUR_G:

page_mode = PAGE_CUR_LE;

break;

default:

page_mode = mode;

break;

}

…

//3.开始B树迭代：

search_loop: <—————————————————————————-|

buf_mode = BUF_GET; |

rw_latch = RW_NO_LATCH; |

retry_page_get: |

//3.1取得本层页面，首次为根页面 |

block = buf_page_get_gen( |

space, zip_size, page_no, rw_latch, guess, buf_mode, |

file, line, mtr); |

page = buf_block_get_frame(block); |

… |

if (height == 0) { |

//叶子页需要恢复游标定位模式 |

page_mode = mode; |

} |

//3.2在本层页面进行游标定位： |

page_cur_search_with_match( |

block, index, tuple, page_mode, &up_match, &up_bytes, |

&low_match, &low_bytes, page_cursor); |

//3.3若未到达指定level，则向下一层迭代： |

if (level != height) { |

const rec_t* node_ptr; |

height--;

//取下层子页页号

node_ptr = page_cur_get_rec(page_cursor); |

offsets = rec_get_offsets( |

node_ptr, index, offsets, ULINT_UNDEFINED, &heap); |

page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); |

goto search_loop;——————————————————————-|

}

//3.4若到达指定level，则完成定位：

if (level != 0) {

buf_block_t* child_block = btr_block_get(

space, zip_size, page_no, RW_X_LATCH, index, mtr);

page = buf_block_get_frame(child_block);

btr_assert_not_corrupted(child_block, index);

} else {

cursor->low_match = low_match;

cursor->low_bytes = low_bytes;

cursor->up_match = up_match;

cursor->up_bytes = up_bytes;

}

值得注意的是，通过page_cur_search_with_match对非叶子页索引项进行二分搜索时，并不对得到的结果进行检查，而是直接将结果记录的最后一个字段作为下层页的页号。

页面二分搜索函数page_cur_search_with_match根据传入tuple的大小和比较模式，将页面游标定位在第一个大于/小于(等于)tuple的页面位置：

UNIV_INTERN

void

page_cur_search_with_match(

/*=======================*/

const buf_block_t* block, /*!< in: buffer block */

const dict_index_t* index, /*!< in: record descriptor */

const dtuple_t* tuple, /*!< in: data tuple */

ulint mode, /*!< in: PAGE_CUR_L,

PAGE_CUR_LE, PAGE_CUR_G, or

PAGE_CUR_GE */

ulint* iup_matched_fields,

/*!< in/out: already matched

fields in upper limit record */

ulint* iup_matched_bytes,

/*!< in/out: already matched

bytes in a field not yet

completely matched */

ulint* ilow_matched_fields,

/*!< in/out: already matched

fields in lower limit record */

ulint* ilow_matched_bytes,

/*!< in/out: already matched

bytes in a field not yet

completely matched */

page_cur_t* cursor) /*!< out: page cursor */

{

//1.初始化搜索

up_matched_fields = *iup_matched_fields;

up_matched_bytes = *iup_matched_bytes;

low_matched_fields = *ilow_matched_fields;

low_matched_bytes = *ilow_matched_bytes;

//2.首先进行页面目录的二分搜索，low为infimum记录的页面目录槽，而up为supremum记录的页面目录槽

low = 0;

up = page_dir_get_n_slots(page) - 1;

//3.开始页面目录的二分搜索，这是一个非常直接的二分搜索过程

while (up - low > 1) {

mid = (low + up) / 2;

slot = page_dir_get_nth_slot(page, mid);

mid_rec = page_dir_slot_get_rec(slot);

offsets = rec_get_offsets(mid_rec, index, offsets,

dtuple_get_n_fields_cmp(tuple),

&heap);

//用于记录与tuple比较的函数

cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,

&cur_matched_fields,

&cur_matched_bytes);

if (UNIV_LIKELY(cmp > 0)) {

low_slot_match:

low = mid;

low_matched_fields = cur_matched_fields;

low_matched_bytes = cur_matched_bytes;

} else if (UNIV_EXPECT(cmp, -1)) {

up_slot_match:

up = mid;

up_matched_fields = cur_matched_fields;

up_matched_bytes = cur_matched_bytes;

} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {

goto low_slot_match;

} else {

goto up_slot_match;

}

//4.此时low_rec与up_rec是两个相邻的页面目录槽指向的记录，然后在low_rec与up_rec之间进行线性搜索

slot = page_dir_get_nth_slot(page, low);

low_rec = page_dir_slot_get_rec(slot);

slot = page_dir_get_nth_slot(page, up);

up_rec = page_dir_slot_get_rec(slot);

//5.在low_rec与up_rec之间进行线性搜索

while (page_rec_get_next_const(low_rec) != up_rec) {

mid_rec = page_rec_get_next_const(low_rec);

offsets = rec_get_offsets(mid_rec, index, offsets,

dtuple_get_n_fields_cmp(tuple),

&heap);

//用于记录与tuple比较的函数

cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,

&cur_matched_fields,

&cur_matched_bytes);

if (UNIV_LIKELY(cmp > 0)) {

low_rec_match:

low_rec = mid_rec;

low_matched_fields = cur_matched_fields;

low_matched_bytes = cur_matched_bytes;

} else if (UNIV_EXPECT(cmp, -1)) {

up_rec_match:

up_rec = mid_rec;

up_matched_fields = cur_matched_fields;

up_matched_bytes = cur_matched_bytes;

} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {

goto low_rec_match;

} else {

goto up_rec_match;

}

//6.将游标正确的定位到页面记录上，同时记录比较结果

if (mode <= PAGE_CUR_GE) {

page_cur_position(up_rec, block, cursor);

} else {

page_cur_position(low_rec, block, cursor);

}

*iup_matched_fields = up_matched_fields;

*iup_matched_bytes = up_matched_bytes;

*ilow_matched_fields = low_matched_fields;

*ilow_matched_bytes = low_matched_bytes;

}

通常，我们会认为记录比较函数cmp_dtuple_rec_with_match将直接比较tuple与页面记录的大小，并将结果返回：

tuple大于页面记录时返回1，tuple等于页面记录时返回0，tuple小于页面记录时返回-1。这样就造成了一个误解：若tuple小于当前B树最小记录，并使用PAGE_CUR_L或PAGE_CUR_LE定位游标，这会导致low_rec定位到页面的infimum，游标也将定位到这里，于是page_cur_search_with_match向btr_cur_search_to_nth_level返回时，游标定位在infimum上。对于非叶子页，若未到达指定level，btr_cur_search_to_nth_level之后会从这条infimum记录中取下一层页面的页号，但是infimum记录中显然不会有下层页号，所以这里有一个BUG！

可是当真是这样吗？innodb开发成员不会注意不到这样一个基础性的细节，那么问题在哪里呢？

问题来源于我们的想当然，即比较函数cmp_dtuple_rec_with_match的处理过程，它并不是按照我们想象的进行的：

UNIV_INTERN

int

cmp_dtuple_rec_with_match_low(

/*==========================*/

const dtuple_t* dtuple, /*!< in: data tuple */

const rec_t* rec, /*!< in: physical record which differs from

dtuple in some of the common fields, or which

has an equal number or more fields than

dtuple */

const ulint* offsets,/*!< in: array returned by rec_get_offsets() */

ulint n_cmp, /*!< in: number of fields to compare */

ulint* matched_fields, /*!< in/out: number of already completely

matched fields; when function returns,

contains the value for current comparison */

ulint* matched_bytes) /*!< in/out: number of already matched

bytes within the first field not completely

matched; when function returns, contains the

value for current comparison */

{

//1.比较开始前的初始化

cur_field = *matched_fields;

cur_bytes = *matched_bytes;

//2.进行一些特殊检查，也即前面问题的根源：

if (cur_bytes == 0 && cur_field == 0) {

ulint rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets));

ulint tup_info = dtuple_get_info_bits(dtuple);

if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {

//注意此处：每一层非叶子节点中最左页的第一条用户记录上有一个标志REC_INFO_MIN_REC_FLAG，而叶子页的记录没有这个标志。若tuple也有这个标志，则认为tuple与页面记录相等（返回0）；若tuple没有这个标志，则即使tuple的实际值小于页面记录，该函数仍然会返回1，即tuple大于页面记录。这便会令btr_cur_search_to_nth_level完成对“小于(等于)B树最小记录”的定位：沿着每层B树最左页向下，最终到达叶子页；由于最左叶子页的最左记录没有这个标志，对于插入等操作就可以正确定位到最左叶子页的infimum，插入动作也可以正确地进行了

ret = !(tup_info & REC_INFO_MIN_REC_FLAG);

goto order_resolved;

} else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {

ret = -1;

goto order_resolved;

}

//3.进行记录各个字段的比较，不详细分析：

while (cur_field < n_cmp) {

ulint mtype;

ulint prtype;

dtuple_field = dtuple_get_nth_field(dtuple, cur_field);

{

const dtype_t* type = dfield_get_type(dtuple_field);

mtype = type->mtype;

prtype = type->prtype;

}

dtuple_f_len = dfield_get_len(dtuple_field);

rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field, &rec_f_len);

if (UNIV_LIKELY(cur_bytes == 0)) {

if (rec_offs_nth_extern(offsets, cur_field)) {

//不处理外部存储字段

ret = 0;

goto order_resolved;

}

if (dtuple_f_len == UNIV_SQL_NULL) {

//tuple的NULL字段小于等于页面记录字段

if (rec_f_len == UNIV_SQL_NULL) {

goto next_field;

}

ret = -1;

goto order_resolved;

} else if (rec_f_len == UNIV_SQL_NULL) {

//页面记录的NULL字段小于等于tuple的字段

ret = 1;

goto order_resolved;

}

//FLOAT等类型的比较，按照类型进行比较

if (mtype >= DATA_FLOAT

|| (mtype == DATA_BLOB

&& 0 == (prtype & DATA_BINARY_TYPE)

&& dtype_get_charset_coll(prtype)

!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {

ret = cmp_whole_field(

mtype, prtype,

static_cast<const byte*>(dfield_get_data(dtuple_field)),

(unsigned) dtuple_f_len, rec_b_ptr, (unsigned) rec_f_len);

if (ret != 0) {

cur_bytes = 0;

goto order_resolved;

} else {

goto next_field;

}

//其它类型的比较，遍历每一个字节比较

rec_b_ptr = rec_b_ptr + cur_bytes;

dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)

+ cur_bytes;

for (;;) {

if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {

if (dtuple_f_len <= cur_bytes) {

goto next_field;

}

rec_byte = dtype_get_pad_char(mtype, prtype);

if (rec_byte == ULINT_UNDEFINED) {

ret = 1;

goto order_resolved;

}

} else {

rec_byte = *rec_b_ptr;

}

if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {

dtuple_byte = dtype_get_pad_char(mtype,

prtype);

if (dtuple_byte == ULINT_UNDEFINED) {

ret = -1;

goto order_resolved;

}

} else {

dtuple_byte = *dtuple_b_ptr;

}

if (dtuple_byte == rec_byte) {

/* If the bytes are equal, they will

remain such even after the collation

transformation below */

goto next_byte;

}

if (mtype <= DATA_CHAR

|| (mtype == DATA_BLOB

&& !(prtype & DATA_BINARY_TYPE))) {

rec_byte = cmp_collate(rec_byte);

dtuple_byte = cmp_collate(dtuple_byte);

}

ret = (int) (dtuple_byte - rec_byte);

if (UNIV_LIKELY(ret)) {

if (ret < 0) {

ret = -1;

goto order_resolved;

} else {

ret = 1;

goto order_resolved;

}

next_byte:

/* Next byte */

cur_bytes++;

rec_b_ptr++;

dtuple_b_ptr++;

}

next_field:

cur_field++;

cur_bytes = 0;

}

ret = 0;

order_resolved:

*matched_fields = cur_field;

*matched_bytes = cur_bytes;

return(ret);

}

综上所述，btr_cur_search_to_nth_level进行B树层次迭代，page_cur_search_with_match进行页面二分搜索（页面目录槽二分搜索，相邻槽间线性搜索），cmp_dtuple_rec_with_match_low进行记录比较。特殊之处在于对“小于(等于)B树最小记录”这种模式的定位：cmp_dtuple_rec_with_match_low通过REC_INFO_MIN_REC_FLAG标志对非叶子页上的最小记录进行特殊处理，只要tuple自身不带该标志，就认为tuple大于这条记录，从而使搜索始终沿着每层最左页向下，最终到达最左叶子页，而不会在非叶子页上停在infimum。

    原文作者：B树
    原文地址: https://blog.csdn.net/wqtn22/article/details/84397734
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。