innodb对B树进行游标定位时,主要通过函数btr_cur_search_to_nth_level进行,该函数从根页开始向下层页迭代,直到指定的层级level,最终将B树游标定位在第一个大/小于(等于)tuple的位置,先不考虑页面latch、锁、自适应哈希索引、插入缓冲的影响,仅看B树游标定位:
UNIV_INTERN
void
btr_cur_search_to_nth_level(
/*========================*/
dict_index_t* index, /*!< in: index */
ulint level, /*!< in: the tree level of search */
const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
tuple must be set so that it cannot get
compared to the node ptr page number field! */
ulint mode, /*!< in: PAGE_CUR_L, …;
Inserts should always be made using
PAGE_CUR_LE to search the position! */
ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, …, ORed with
at most one of BTR_INSERT, BTR_DELETE_MARK,
BTR_DELETE, or BTR_ESTIMATE;
cursor->left_block is used to store a pointer
to the left neighbor page, in the cases
BTR_SEARCH_PREV and BTR_MODIFY_PREV;
NOTE that if has_search_latch
is != 0, we maybe do not have a latch set
on the cursor page, we assume
the caller uses his search latch
to protect the record! */
btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
s- or x-latched, but see also above! */
ulint has_search_latch,/*!< in: info on the latch mode the
caller currently has on btr_search_latch:
RW_S_LATCH, or 0 */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr) /*!< in: mtr */
{
…
//1.取得根页页号:
page_cursor = btr_cur_get_page_cur(cursor);
space = dict_index_get_space(index);
page_no = dict_index_get_page(index);
…
//2.对于非叶子页,将游标定位模式标准化:
switch (mode) {
case PAGE_CUR_GE:
page_mode = PAGE_CUR_L;
break;
case PAGE_CUR_G:
page_mode = PAGE_CUR_LE;
break;
default:
page_mode = mode;
break;
}
…
//3.开始B树迭代:
search_loop: <—————————————————————————-|
buf_mode = BUF_GET; |
rw_latch = RW_NO_LATCH; |
retry_page_get: |
//3.1取得本层页面,首次为根页面 |
block = buf_page_get_gen( |
space, zip_size, page_no, rw_latch, guess, buf_mode, |
file, line, mtr); |
page = buf_block_get_frame(block); |
… |
if (height == 0) { |
//叶子页需要恢复游标定位模式 |
page_mode = mode; |
} |
//3.2在本层页面进行游标定位: |
page_cur_search_with_match( |
block, index, tuple, page_mode, &up_match, &up_bytes, |
&low_match, &low_bytes, page_cursor); |
//3.3若未到达指定level,则向下一层迭代: |
if (level != height) { |
const rec_t* node_ptr; |
height–; |
//去下层子页页号 |
node_ptr = page_cur_get_rec(page_cursor); |
offsets = rec_get_offsets( |
node_ptr, index, offsets, ULINT_UNDEFINED, &heap); |
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets); |
goto search_loop;——————————————————————-|
}
//3.4若到达指定level,则完成定位:
if (level != 0) {
buf_block_t* child_block = btr_block_get(
space, zip_size, page_no, RW_X_LATCH, index, mtr);
page = buf_block_get_frame(child_block);
btr_assert_not_corrupted(child_block, index);
} else {
cursor->low_match = low_match;
cursor->low_bytes = low_bytes;
cursor->up_match = up_match;
cursor->up_bytes = up_bytes;
}
}
值得注意的是,通过page_cur_search_with_match对非叶子页索引项进行二分搜索时,并不对得到的结果进行检查,而是直接将结果记录的最后一个字段作为下层页的页号。
页面二分搜索函数page_cur_search_with_match根据传入tuple大小和比较模式,将页面游标定位在在第一个大/小于(等于)tuple的页面位置
UNIV_INTERN
void
page_cur_search_with_match(
/*=======================*/
const buf_block_t* block, /*!< in: buffer block */
const dict_index_t* index, /*!< in: record descriptor */
const dtuple_t* tuple, /*!< in: data tuple */
ulint mode, /*!< in: PAGE_CUR_L,
PAGE_CUR_LE, PAGE_CUR_G, or
PAGE_CUR_GE */
ulint* iup_matched_fields,
/*!< in/out: already matched
fields in upper limit record */
ulint* iup_matched_bytes,
/*!< in/out: already matched
bytes in a field not yet
completely matched */
ulint* ilow_matched_fields,
/*!< in/out: already matched
fields in lower limit record */
ulint* ilow_matched_bytes,
/*!< in/out: already matched
bytes in a field not yet
completely matched */
page_cur_t* cursor) /*!< o
{
//1.初始化搜索
up_matched_fields = *iup_matched_fields;
up_matched_bytes = *iup_matched_bytes;
low_matched_fields = *ilow_matched_fields;
low_matched_bytes = *ilow_matched_bytes;
//2.首先进行页面目录的二分搜索,low为infimum记录的页面目录槽,而up为supremum记录的页面目录槽
low = 0;
up = page_dir_get_n_slots(page) – 1;
//3.开始页面目录的二分搜索,这是一个非常直接的二分搜索过程
while (up – low > 1) {
mid = (low + up) / 2;
slot = page_dir_get_nth_slot(page, mid);
mid_rec = page_dir_slot_get_rec(slot);
offsets = rec_get_offsets(mid_rec, index, offsets,
dtuple_get_n_fields_cmp(tuple),
&heap);
//用于记录与tuple比较的函数
cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
&cur_matched_fields,
&cur_matched_bytes);
if (UNIV_LIKELY(cmp > 0)) {
low_slot_match:
low = mid;
low_matched_fields = cur_matched_fields;
low_matched_bytes = cur_matched_bytes;
} else if (UNIV_EXPECT(cmp, -1)) {
up_slot_match:
up = mid;
up_matched_fields = cur_matched_fields;
up_matched_bytes = cur_matched_bytes;
} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {
goto low_slot_match;
} else {
goto up_slot_match;
}
}
//4.此时low_rec与up_rec是两个相邻的页面目录槽指向的记录,然后在low_rec与up_rec之间进行线性搜索
slot = page_dir_get_nth_slot(page, low);
low_rec = page_dir_slot_get_rec(slot);
slot = page_dir_get_nth_slot(page, up);
up_rec = page_dir_slot_get_rec(slot);
//5.在low_rec与up_rec之间进行线性搜索
while (page_rec_get_next_const(low_rec) != up_rec) {
mid_rec = page_rec_get_next_const(low_rec);
offsets = rec_get_offsets(mid_rec, index, offsets,
dtuple_get_n_fields_cmp(tuple),
&heap);
//用于记录与tuple比较的函数
cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
&cur_matched_fields,
&cur_matched_bytes);
if (UNIV_LIKELY(cmp > 0)) {
low_rec_match:
low_rec = mid_rec;
low_matched_fields = cur_matched_fields;
low_matched_bytes = cur_matched_bytes;
} else if (UNIV_EXPECT(cmp, -1)) {
up_rec_match:
up_rec = mid_rec;
up_matched_fields = cur_matched_fields;
up_matched_bytes = cur_matched_bytes;
} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE) {
goto low_rec_match;
} else {
goto up_rec_match;
}
}
//6.将游标正确的定位到页面记录上,同时记录比较结果
if (mode <= PAGE_CUR_GE) {
page_cur_position(up_rec, block, cursor);
} else {
page_cur_position(low_rec, block, cursor);
}
*iup_matched_fields = up_matched_fields;
*iup_matched_bytes = up_matched_bytes;
*ilow_matched_fields = low_matched_fields;
*ilow_matched_bytes = low_matched_bytes;
}
通常,我们会认为记录比较函数cmp_dtuple_rec_with_match将直接比较tuple与页面记录的大小,并将结果返回:
tuple大于页面记录时返回1,tuple等于页面记录时返回0,tuple小于页面记录时返回-1,这样就造成了一个误解,若tuple 小于当前B树最小记录,如果使用PAGE_CUR_L|PAGE_CUR_LE定位游标,这回导致low_rec定位到页面的infimum,游标也将定位到这里,从而导致page_cur_search_with_match向btr_cur_search_to_nth_level返回时,将游标定位到infimum,对于非叶子页,若未到达指定level,btr_cur_search_to_nth_level之后会从这条infimum记录中取下一层页面的页号,但是infimum记录中显然不会有下层页号,所以这里有一个BUG!
可是当真是这样吗?innodb开发成员不会注意不到这样一个基础性的细节,那么问题在哪里呢?
问题来源于我们的想当然,即比较函数cmp_dtuple_rec_with_match的处理过程,它并不是按照我们想象的进行的:
UNIV_INTERN
int
cmp_dtuple_rec_with_match_low(
/*==========================*/
const dtuple_t* dtuple, /*!< in: data tuple */
const rec_t* rec, /*!< in: physical record which differs from
dtuple in some of the common fields, or which
has an equal number or more fields than
dtuple */
const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
ulint n_cmp, /*!< in: number of fields to compare */
ulint* matched_fields, /*!< in/out: number of already completely
matched fields; when function returns,
contains the value for current comparison */
ulint* matched_bytes) /*!< in/out: number of already matched
bytes within the first field not completely
matched; when function returns, contains the
value for current comparison */
{
//1.比较开始前的初始化
cur_field = *matched_fields;
cur_bytes = *matched_bytes;
//2.进行一些特殊检查,也即前面问题的根源:
if (cur_bytes == 0 && cur_field == 0) {
ulint rec_info = rec_get_info_bits(rec, rec_offs_comp(offsets));
ulint tup_info = dtuple_get_info_bits(dtuple);
if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
//注意此处,对于非叶子页的最左记录,其上有一个标志REC_INFO_MIN_REC_FLAG,而叶子页没有这个标志,若tuple也有这个标志,则tuple与页面记录相等,若没有这个标志,则即使tuple的实际值小于页面记录,该函数仍然会返回1,即tuple大于页面记录。这便会令btr_cur_search_to_nth_level完成对“小于(等于)B树最小记录”的定位,沿着每层B树最左页向下,最终到达叶子页,由于最左叶子页最左记录没有这个标志,对于插入等操作就可以正确定位到最左叶子页的infimum,插入动作也可以正确的进行了
ret = !(tup_info & REC_INFO_MIN_REC_FLAG);
goto order_resolved;
} else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
ret = -1;
goto order_resolved;
}
}
//3.进行记录各个字段的比较,不详细分析:
while (cur_field < n_cmp) {
ulint mtype;
ulint prtype;
dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
{
const dtype_t* type = dfield_get_type(dtuple_field);
mtype = type->mtype;
prtype = type->prtype;
}
dtuple_f_len = dfield_get_len(dtuple_field);
rec_b_ptr = rec_get_nth_field(rec, offsets, cur_field, &rec_f_len);
if (UNIV_LIKELY(cur_bytes == 0)) {
if (rec_offs_nth_extern(offsets, cur_field)) {
//不处理外部存储字段
ret = 0;
goto order_resolved;
}
if (dtuple_f_len == UNIV_SQL_NULL) {
//tuple的NULL字段小于等于页面记录字段
if (rec_f_len == UNIV_SQL_NULL) {
goto next_field;
}
ret = -1;
goto order_resolved;
} else if (rec_f_len == UNIV_SQL_NULL) {
//页面记录的NULL字段小于等于tuple的字段
ret = 1;
goto order_resolved;
}
}
//FLOAT等类型的比较,安装类型进行比较
if (mtype >= DATA_FLOAT
|| (mtype == DATA_BLOB
&& 0 == (prtype & DATA_BINARY_TYPE)
&& dtype_get_charset_coll(prtype)
!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
ret = cmp_whole_field(
mtype, prtype,
static_cast<const byte*>(dfield_get_data(dtuple_field)),
(unsigned) dtuple_f_len, rec_b_ptr, (unsigned) rec_f_len);
if (ret != 0) {
cur_bytes = 0;
goto order_resolved;
} else {
goto next_field;
}
}
//其它类型的比较,遍历每一个字节比较
rec_b_ptr = rec_b_ptr + cur_bytes;
dtuple_b_ptr = (byte*) dfield_get_data(dtuple_field)
+ cur_bytes;
for (;;) {
if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {
if (dtuple_f_len <= cur_bytes) {
goto next_field;
}
rec_byte = dtype_get_pad_char(mtype, prtype);
if (rec_byte == ULINT_UNDEFINED) {
ret = 1;
goto order_resolved;
}
} else {
rec_byte = *rec_b_ptr;
}
if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {
dtuple_byte = dtype_get_pad_char(mtype,
prtype);
if (dtuple_byte == ULINT_UNDEFINED) {
ret = -1;
goto order_resolved;
}
} else {
dtuple_byte = *dtuple_b_ptr;
}
if (dtuple_byte == rec_byte) {
/* If the bytes are equal, they will
remain such even after the collation
transformation below */
goto next_byte;
}
if (mtype <= DATA_CHAR
|| (mtype == DATA_BLOB
&& !(prtype & DATA_BINARY_TYPE))) {
rec_byte = cmp_collate(rec_byte);
dtuple_byte = cmp_collate(dtuple_byte);
}
ret = (int) (dtuple_byte – rec_byte);
if (UNIV_LIKELY(ret)) {
if (ret < 0) {
ret = -1;
goto order_resolved;
} else {
ret = 1;
goto order_resolved;
}
}
next_byte:
/* Next byte */
cur_bytes++;
rec_b_ptr++;
dtuple_b_ptr++;
}
next_field:
cur_field++;
cur_bytes = 0;
}
ret = 0;
order_resolved:
*matched_fields = cur_field;
*matched_bytes = cur_bytes;
return(ret);
}
综上所述,btr_cur_search_to_nth_level进行B树层次迭代,page_cur_search_with_match进行页面二分搜索(页面目录槽二分搜索,相邻槽间线性搜索),cmp_dtuple_rec_with_match_low进行记录比较,特殊之处在于对“小于(等于)B树最小记录”这种模式的定位,cmp_dtuple_rec_with_match_low会针对叶子页和非叶子页进行特殊处理。