/*****************************************************************************

Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
Created 6/2/1994 Heikki Tuuri
*******************************************************/
#include "page0page.h"

#ifndef UNIV_HOTBACKUP
#include "lock0lock.h"
#include "ibuf0ibuf.h"
Latching strategy of the InnoDB B-tree
--------------------------------------
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
also has a latch of its own.

A B-tree operation normally first acquires an S-latch on the tree. It
searches down the tree and releases the tree latch when it has the
leaf node latch. To save CPU time we do not acquire any latch on
non-leaf nodes of the tree during a search, those pages are only buffer-fixed.

If an operation needs to restructure the tree, it acquires an X-latch on
the tree before searching to a leaf node. If it needs, for example, to
split a leaf,
(1) InnoDB decides the split point in the leaf,
(2) allocates a new page,
(3) inserts the appropriate node pointer to the first non-leaf level,
(4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.
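
As an illustrative sketch (a hypothetical caller, built only from the
real mtr_x_lock() and btr_cur_search_to_nth_level() APIs that are used
elsewhere in this file), a restructuring operation follows the
ordering above roughly like this:

	mtr_x_lock(dict_index_get_lock(index), mtr);

	btr_cur_search_to_nth_level(index, 0, tuple, PAGE_CUR_LE,
				    BTR_MODIFY_TREE, &cursor, 0,
				    __FILE__, __LINE__, mtr);

	btr_page_split_and_insert(&cursor, tuple, n_ext, mtr);

btr_page_split_and_insert() then performs steps (1)-(5), releasing the
tree X-latch before moving the leaf records.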

Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. In the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.
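
For example (a sketch using the real helpers defined and used later in
this file; the surrounding variables are hypothetical), a node pointer
is built from the first record of a child page, and the child page
number can be read back from the pointer's last field:

	node_ptr = dict_index_build_node_ptr(index, first_rec_in_child,
					     child_page_no, heap, level);

	offsets = rec_get_offsets(node_ptr_rec, index, offsets,
				  ULINT_UNDEFINED, &heap);
	ut_ad(btr_node_ptr_get_child_page_no(node_ptr_rec, offsets)
	      == child_page_no);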

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.

In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
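
The two segment headers live at fixed offsets in the root page; as a
minimal sketch (the same arithmetic btr_page_alloc() uses below):

	leaf_seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	top_seg_header  = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;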

#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Checks a file segment header within a B-tree root page.
@return	TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
	const fseg_header_t*	seg_header,	/*!< in: segment header */
	ulint			space)		/*!< in: tablespace identifier */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
	ut_a(offset >= FIL_PAGE_DATA);
	ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);

	return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */

/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return	root page, x-latched */
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	root_page_no = dict_index_get_page(index);

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
	ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
	     == dict_table_is_comp(index->table));
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root = buf_block_get_frame(block);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	return(block);
}

/**************************************************************//**
Gets the root node of a tree and x-latches it.
@return	root page, x-latched */
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */

	return(buf_block_get_frame(btr_root_block_get(index, mtr)));
}

/*************************************************************//**
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor.
@return	previous user record, NULL if there is none */
btr_get_prev_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also to the previous page */

	if (!page_rec_is_infimum(rec)) {

		rec_t*	prev_rec = page_rec_get_prev(rec);

		if (!page_rec_is_infimum(prev_rec)) {

			return(prev_rec);
		}
	}

	page = page_align(rec);
	prev_page_no = btr_page_get_prev(page, mtr);

	if (prev_page_no != FIL_NULL) {

		buf_block_t*	prev_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		prev_block = buf_page_get_with_no_latch(space, zip_size,
							prev_page_no, mtr);
		prev_page = buf_block_get_frame(prev_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, prev_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
	}

	return(NULL);
}

/*************************************************************//**
Gets pointer to the next user record in the tree. It is assumed that the
caller has appropriate latches on the page and its neighbor.
@return	next user record, NULL if there is none */
btr_get_next_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also to the next page */

	if (!page_rec_is_supremum(rec)) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (!page_rec_is_supremum(next_rec)) {

			return(next_rec);
		}
	}

	page = page_align(rec);
	next_page_no = btr_page_get_next(page, mtr);

	if (next_page_no != FIL_NULL) {

		buf_block_t*	next_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		next_block = buf_page_get_with_no_latch(space, zip_size,
							next_page_no, mtr);
		next_page = buf_block_get_frame(next_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, next_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		return(page_rec_get_next(page_get_infimum_rec(next_page)));
	}

	return(NULL);
}
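
/* A minimal usage sketch (hypothetical caller): stepping once to the
right neighbor of a positioned cursor, assuming the mtr already holds
the latches the function comment above requires.

@code
	rec_t*	next = btr_get_next_user_rec(btr_cur_get_rec(cursor), mtr);

	if (next != NULL) {
		n_processed++;
	}
@endcode */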

/**************************************************************//**
Creates a new index page (not the root, and also not
used in page reorganization). @see btr_page_empty(). */
	buf_block_t*	block,	/*!< in/out: page to be created */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */

	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_create_zip(block, index, level, mtr);
	} else {
		page_create(block, mtr, dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, level, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	btr_page_set_index_id(page, page_zip, index->id, mtr);
}

/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@return	new allocated block, x-latched */
btr_page_alloc_for_ibuf(
/*====================*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */

	fil_addr_t	node_addr;
	buf_block_t*	new_block;

	root = btr_root_get(index, mtr);

	node_addr = flst_get_first(root + PAGE_HEADER
				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
	ut_a(node_addr.page != FIL_NULL);

	new_block = buf_page_get(dict_index_get_space(index),
				 dict_table_zip_size(index->table),
				 node_addr.page, RW_X_LATCH, mtr);
	new_page = buf_block_get_frame(new_block);
	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);

	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
		    mtr);
	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));

	return(new_block);
}

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@return	new allocated block, x-latched; NULL if out of space */
	dict_index_t*	index,		/*!< in: index */
	ulint		hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr)		/*!< in: mtr */

	fseg_header_t*	seg_header;
	buf_block_t*	new_block;

	if (dict_index_is_ibuf(index)) {

		return(btr_page_alloc_for_ibuf(index, mtr));
	}

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	/* Parameter TRUE below states that the caller has made the
	reservation for free extents, and thus we know that a page can
	be allocated: */

	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
						   file_direction, TRUE, mtr);
	if (new_page_no == FIL_NULL) {

		return(NULL);
	}

	new_block = buf_page_get(dict_index_get_space(index),
				 dict_table_zip_size(index->table),
				 new_page_no, RW_X_LATCH, mtr);
	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);

	return(new_block);
}

/**************************************************************//**
Gets the number of pages in a B-tree.
@return	number of pages */
	dict_index_t*	index,	/*!< in: index */
	ulint		flag)	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */

	fseg_header_t*	seg_header;

	mtr_s_lock(dict_index_get_lock(index), &mtr);

	root = btr_root_get(index, &mtr);

	if (flag == BTR_N_LEAF_PAGES) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		fseg_n_reserved_pages(seg_header, &n, &mtr);

	} else if (flag == BTR_TOTAL_SIZE) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

		n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);

		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
	}

	mtr_commit(&mtr);

	return(n);
}
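
/* A minimal usage sketch (hypothetical caller) of the two flag values:

@code
	ulint	n_leaf  = btr_get_size(index, BTR_N_LEAF_PAGES);
	ulint	n_total = btr_get_size(index, BTR_TOTAL_SIZE);
	ulint	n_upper = n_total - n_leaf;
@endcode */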

/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
btr_page_free_for_ibuf(
/*===================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	root = btr_root_get(index, mtr);

	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		       buf_block_get_frame(block)
		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));
}

/**************************************************************//**
Frees a file page used in an index tree. Can also be used to free
(BLOB) external storage pages, because page level 0 can be given
as an argument regardless of the page type. */
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	ulint		level,	/*!< in: page level */
	mtr_t*		mtr)	/*!< in: mtr */

	fseg_header_t*	seg_header;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The page gets invalid for optimistic searches: increment the frame
	modify clock */

	buf_block_modify_clock_inc(block);

	if (dict_index_is_ibuf(index)) {

		btr_page_free_for_ibuf(index, block, mtr);

		return;
	}

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	fseg_free_page(seg_header,
		       buf_block_get_space(block),
		       buf_block_get_page_no(block), mtr);
}

/**************************************************************//**
Frees a file page used in an index tree. NOTE: this cannot be used to
free externally stored field (BLOB) pages, because the page must
contain info on its level. */
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */

	level = btr_page_get_level(buf_block_get_frame(block), mtr);

	btr_page_free_low(index, block, level, mtr);
}

/**************************************************************//**
Sets the child node file address in a node pointer. */
btr_node_ptr_set_child_page_no(
/*===========================*/
	rec_t*		rec,	/*!< in: node pointer record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		page_no,/*!< in: child node address */
	mtr_t*		mtr)	/*!< in: mtr */

	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(!page_is_leaf(page_align(rec)));
	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

	/* The child address is in the last field */
	field = rec_get_nth_field(rec, offsets,
				  rec_offs_n_fields(offsets) - 1, &len);

	ut_ad(len == REC_NODE_PTR_SIZE);

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_zip_write_node_ptr(page_zip, rec,
					rec_offs_data_size(offsets),
					page_no, mtr);
	} else {
		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
	}
}
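
/* A minimal sketch (hypothetical variables) of verifying the child
page number after setting it:

@code
	offsets = rec_get_offsets(rec, index, offsets,
				  ULINT_UNDEFINED, &heap);
	ut_ad(btr_node_ptr_get_child_page_no(rec, offsets) == page_no);
@endcode */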

/************************************************************//**
Returns the child page of a node pointer and x-latches it.
@return	child page, x-latched */
btr_node_ptr_get_child(
/*===================*/
	const rec_t*	node_ptr,/*!< in: node pointer */
	dict_index_t*	index,	/*!< in: index */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */

	ut_ad(rec_offs_validate(node_ptr, index, offsets));
	space = page_get_space_id(page_align(node_ptr));
	page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

	return(btr_block_get(space, dict_table_zip_size(index->table),
			     page_no, RW_X_LATCH, mtr));

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return	rec_get_offsets() of the node pointer record */
btr_page_get_father_node_ptr_func(
/*==============================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
				out: cursor on node pointer record,
				its page x-latched */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */

	page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));

	ut_ad(dict_index_get_page(index) != page_no);

	level = btr_page_get_level(btr_cur_get_page(cursor), mtr);

	user_rec = btr_cur_get_rec(cursor);
	ut_a(page_rec_is_user_rec(user_rec));
	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

	btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE, cursor, 0,
				    file, line, mtr);

	node_ptr = btr_cur_get_rec(cursor);
	ut_ad(!page_rec_is_comp(node_ptr)
	      || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
	offsets = rec_get_offsets(node_ptr, index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
			  != page_no)) {
		fputs("InnoDB: Dump of the child page:\n", stderr);
		buf_page_print(page_align(user_rec), 0);
		fputs("InnoDB: Dump of the parent page:\n", stderr);
		buf_page_print(page_align(node_ptr), 0);

		fputs("InnoDB: Corruption of an index tree: table ", stderr);
		ut_print_name(stderr, NULL, TRUE, index->table_name);
		fputs(", index ", stderr);
		ut_print_name(stderr, NULL, FALSE, index->name);
		fprintf(stderr, ",\n"
			"InnoDB: father ptr page no %lu, child page no %lu\n",
			(ulong)
			btr_node_ptr_get_child_page_no(node_ptr, offsets),
			(ulong) page_no);
		print_rec = page_rec_get_next(
			page_get_infimum_rec(page_align(user_rec)));
		offsets = rec_get_offsets(print_rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		page_rec_print(print_rec, offsets);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		page_rec_print(node_ptr, offsets);

		fputs("InnoDB: You should dump + drop + reimport the table"
		      " to fix the\n"
		      "InnoDB: corruption. If the crash happens at "
		      "the database startup, see\n"
		      "InnoDB: " REFMAN "forcing-recovery.html about\n"
		      "InnoDB: forcing recovery. "
		      "Then dump + drop + reimport.\n", stderr);
	}

	return(offsets);
}

#define btr_page_get_father_node_ptr(of,heap,cur,mtr)			\
	btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return	rec_get_offsets() of the node pointer record */
btr_page_get_father_block(
/*======================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */

	rec_t*	rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
								block)));
	btr_cur_position(index, rec, block, cursor);
	return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
}

/************************************************************//**
Seeks to the upper level node pointer to a page.
It is assumed that mtr holds an x-latch on the tree. */
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */

	rec_t*	rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
								block)));
	btr_cur_position(index, rec, block, cursor);

	heap = mem_heap_create(100);
	btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
	mem_heap_free(heap);
}

/************************************************************//**
Creates the root node for a new index tree.
@return	page number of the created root, FIL_NULL if did not succeed */
	ulint		type,	/*!< in: type of the index */
	ulint		space,	/*!< in: space where created */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	index_id_t	index_id,/*!< in: index id */
	dict_index_t*	index,	/*!< in: index */
	mtr_t*		mtr)	/*!< in: mini-transaction handle */

	page_zip_des_t*	page_zip;

	/* Create the two new segments (one, in the case of an ibuf tree) for
	the index tree; the segment headers are put on the allocated root page
	(for an ibuf tree, not in the root, but on a separate ibuf header
	page) */

	if (type & DICT_IBUF) {
		/* Allocate first the ibuf header page */
		buf_block_t*	ibuf_hdr_block = fseg_create(
			space, 0,
			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

		buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);

		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
		      == IBUF_HEADER_PAGE_NO);
		/* Allocate then the next page to the segment: it will be the
		tree root page */

		page_no = fseg_alloc_free_page(buf_block_get_frame(
						       ibuf_hdr_block)
					       + IBUF_HEADER
					       + IBUF_TREE_SEG_HEADER,
					       IBUF_TREE_ROOT_PAGE_NO,
					       FSP_UP, mtr);
		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		block = buf_page_get(space, zip_size, page_no,
				     RW_X_LATCH, mtr);
	} else {
		block = fseg_create(space, 0,
				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	}

	if (block == NULL) {

		return(FIL_NULL);
	}

	page_no = buf_block_get_page_no(block);
	frame = buf_block_get_frame(block);

	buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

	if (type & DICT_IBUF) {
		/* It is an insert buffer tree: initialize the free list */

		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
	} else {
		/* It is a non-ibuf tree: create a file segment for leaf
		pages */
		if (!fseg_create(space, page_no,
				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
			/* Not enough space for new segment, free root
			segment before return. */
			btr_free_root(space, zip_size, page_no, mtr);

			return(FIL_NULL);
		}

		/* The fseg create acquires a second latch on the page,
		therefore we must declare it: */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
	}

	/* Create a new index page on the allocated segment page */
	page_zip = buf_block_get_page_zip(block);

	if (UNIV_LIKELY_NULL(page_zip)) {
		page = page_create_zip(block, index, 0, mtr);
	} else {
		page = page_create(block, mtr,
				   dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, 0, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	/* Set the index id of the page */
	btr_page_set_index_id(page, page_zip, index_id, mtr);

	/* Set the next node and previous node fields */
	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

	/* We reset the free bits for the page to allow creation of several
	trees in the same mtr, otherwise the latch on a bitmap page would
	prevent it because of the latching order */

	if (!(type & DICT_CLUSTERED)) {
		ibuf_reset_free_bits(block);
	}

	/* In the following assertion we test that two records of maximum
	allowed size fit on the root page: this fact is needed to ensure
	correctness of split algorithms */

	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return(page_no);
}
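
/* A minimal usage sketch (hypothetical caller): when such a tree is
later dropped, btr_create() pairs with the two-step free below;
btr_free_root() expects the rest of the tree to have been freed first.

@code
	ulint	root_page_no = btr_create(type, space, zip_size,
					  index_id, index, mtr);

	if (root_page_no != FIL_NULL) {
		btr_free_but_not_root(space, zip_size, root_page_no);
		btr_free_root(space, zip_size, root_page_no, mtr);
	}
@endcode */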

/************************************************************//**
Frees a B-tree except the root page, which MUST be freed after this
by calling btr_free_root. */
btr_free_but_not_root(
/*==================*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no)	/*!< in: root page number */

leaf_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
				    + root, space));
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	/* NOTE: page hash indexes are dropped when a page is freed inside
	fsp0fsp. */

	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
				  &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto leaf_loop;
	}
top_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, space));
#endif /* UNIV_BTR_DEBUG */

	finished = fseg_free_step_not_header(
		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto top_loop;
	}
}

/************************************************************//**
Frees the B-tree root page. The rest of the tree MUST already have
been freed. */
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no,	/*!< in: root page number */
	mtr_t*	mtr)		/*!< in: a mini-transaction which has already
				been started */

	fseg_header_t*	header;

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);

	btr_search_drop_page_hash_index(block);

	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(header, space));
#endif /* UNIV_BTR_DEBUG */

	while (!fseg_free_step(header, mtr));
}
#endif /* !UNIV_HOTBACKUP */

/*************************************************************//**
Reorganizes an index page. */
btr_page_reorganize_low(
/*====================*/
	ibool		recovery,/*!< in: TRUE if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	buf_block_t*	block,	/*!< in: page to be reorganized */
	dict_index_t*	index,	/*!< in: record descriptor */
	mtr_t*		mtr)	/*!< in: mtr */

	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
	page_t*		page		= buf_block_get_frame(block);
	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
	buf_block_t*	temp_block;
	ibool		success		= FALSE;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

#ifndef UNIV_HOTBACKUP
	/* Write the log record */
	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
				  ? MLOG_COMP_PAGE_REORGANIZE
				  : MLOG_PAGE_REORGANIZE, 0);
#endif /* !UNIV_HOTBACKUP */

	/* Turn logging off */
	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

#ifndef UNIV_HOTBACKUP
	temp_block = buf_block_alloc(buf_pool, 0);
#else /* !UNIV_HOTBACKUP */
	ut_ad(block == back_block1);
	temp_block = back_block2;
#endif /* !UNIV_HOTBACKUP */
	temp_page = temp_block->frame;

	/* Copy the old page to temporary space */
	buf_frame_copy(temp_page, page);

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		btr_search_drop_page_hash_index(block);
	}

	block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, dict_table_is_comp(index->table));

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
		/* Copy max trx id to recreated page */
		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
		/* In crash recovery, dict_index_is_sec_or_ibuf() always
		returns TRUE, even for clustered indexes. max_trx_id is
		unused in clustered index pages. */
		ut_ad(max_trx_id != 0 || recovery);
	}

	if (UNIV_LIKELY_NULL(page_zip)
	    && UNIV_UNLIKELY
	    (!page_zip_compress(page_zip, page, index, NULL))) {

		/* Restore the old page and exit. */

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the bytes that we skip are identical. */
		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
			     PAGE_HEADER + PAGE_N_RECS + temp_page,
			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
		ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
			     UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
			     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
		       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		goto func_exit;
	}

#ifndef UNIV_HOTBACKUP
	if (UNIV_LIKELY(!recovery)) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(block, temp_block);
	}
#endif /* !UNIV_HOTBACKUP */

	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (UNIV_UNLIKELY(data_size1 != data_size2)
	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
		buf_page_print(page, 0);
		buf_page_print(temp_page, 0);
		fprintf(stderr,
			"InnoDB: Error: page old data size %lu"
			" new data size %lu\n"
			"InnoDB: Error: page old max ins size %lu"
			" new max ins size %lu\n"
			"InnoDB: Submit a detailed bug report"
			" to http://bugs.mysql.com\n",
			(unsigned long) data_size1, (unsigned long) data_size2,
			(unsigned long) max_ins_size1,
			(unsigned long) max_ins_size2);
	} else {
		success = TRUE;
	}

func_exit:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
#ifndef UNIV_HOTBACKUP
	buf_block_free(temp_block);
#endif /* !UNIV_HOTBACKUP */

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);

	return(success);
}

#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Reorganizes an index page.
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
page of a non-clustered index, the caller must update the insert
buffer free bits in the same mini-transaction in such a way that the
modification will be redo-logged.
@return	TRUE on success, FALSE on failure */
btr_page_reorganize(
/*================*/
	buf_block_t*	block,	/*!< in: page to be reorganized */
	dict_index_t*	index,	/*!< in: record descriptor */
	mtr_t*		mtr)	/*!< in: mtr */

	return(btr_page_reorganize_low(FALSE, block, index, mtr));
}
#endif /* !UNIV_HOTBACKUP */
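
/* A minimal usage sketch (hypothetical caller). On an uncompressed
page the call only fails on the size cross-check in
btr_page_reorganize_low(); on a compressed page it also fails when
recompression fails, in which case the page was restored byte for
byte. Per the IMPORTANT note above, for a compressed leaf page of a
non-clustered index the caller must afterwards update the insert
buffer free bits in a redo-logged way.

@code
	if (!btr_page_reorganize(block, index, mtr)) {
		return(FALSE);
	}
@endcode */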

/***********************************************************//**
Parses a redo log record of reorganizing a page.
@return	end of log record or NULL */
btr_parse_page_reorganize(
/*======================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr __attribute__((unused)),
				/*!< in: buffer end */
	dict_index_t*	index,	/*!< in: record descriptor */
	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
	mtr_t*		mtr)	/*!< in: mtr or NULL */

	ut_ad(ptr && end_ptr);

	/* The record is empty, except for the record initial part */

	if (UNIV_LIKELY(block != NULL)) {
		btr_page_reorganize_low(TRUE, block, index, mtr);
	}

	return(ptr);
}

#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Empties an index page. @see btr_page_create(). */
	buf_block_t*	block,	/*!< in: page to be emptied */
	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index of the page */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */

	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(page_zip == buf_block_get_page_zip(block));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	btr_search_drop_page_hash_index(block);

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	if (UNIV_LIKELY_NULL(page_zip)) {
		page_create_zip(block, index, level, mtr);
	} else {
		page_create(block, mtr, dict_table_is_comp(index->table));
		btr_page_set_level(page, NULL, level, mtr);
	}

	block->check_index_page_at_flush = TRUE;
}

/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called.
@return	inserted record */
btr_root_raise_and_insert(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */

	dict_index_t*	index;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;
	page_zip_des_t*	root_page_zip;
	page_zip_des_t*	new_page_zip;
	buf_block_t*	root_block;
	buf_block_t*	new_block;

	root = btr_cur_get_page(cursor);
	root_block = btr_cur_get_block(cursor);
	root_page_zip = buf_block_get_page_zip(root_block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
#endif /* UNIV_ZIP_DEBUG */
	index = btr_cur_get_index(cursor);
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		ulint	space = dict_index_get_space(index);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}

	ut_a(dict_index_get_page(index) == page_get_page_no(root));
#endif /* UNIV_BTR_DEBUG */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));

	/* Allocate a new page to the tree. Root splitting is done by first
	moving the root records to the new page, emptying the root, putting
	a node pointer to the new page, and then splitting the new page. */

	level = btr_page_get_level(root, mtr);

	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	ut_a(!new_page_zip == !root_page_zip);
	ut_a(!new_page_zip
	     || page_zip_get_size(new_page_zip)
	     == page_zip_get_size(root_page_zip));

	btr_page_create(new_block, new_page_zip, index, level, mtr);

	/* Set the next node and previous node fields of new page */
	btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);

	/* Copy the records from root to the new page one by one. */

	if (0
#ifdef UNIV_ZIP_COPY
	    || new_page_zip
#endif /* UNIV_ZIP_COPY */
	    || UNIV_UNLIKELY
	    (!page_copy_rec_list_end(new_block, root_block,
				     page_get_infimum_rec(root),
				     index, mtr))) {
		ut_a(new_page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(new_page_zip, new_page,
				   root_page_zip, root, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root));

		btr_search_move_or_delete_hash_entries(new_block, root_block,
						       index);
	}

	/* If this is a pessimistic insert which is actually done to
	perform a pessimistic update then we have stored the lock
	information of the record to be inserted on the infimum of the
	root page: we cannot discard the lock structs on the root page */

	lock_update_root_raise(new_block, root_block);

	/* Create a memory heap where the node pointer is stored */
	heap = mem_heap_create(100);

	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_block_get_page_no(new_block);

	/* Build the node pointer (= node key and page address) for the
	child */

	node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
					     level);
	/* The node pointer must be marked as the predefined minimum record,
	as there is no lower alphabetical limit to records in the leftmost
	node of a level: */
	dtuple_set_info_bits(node_ptr,
			     dtuple_get_info_bits(node_ptr)
			     | REC_INFO_MIN_REC_FLAG);

	/* Rebuild the root page to get free space */
	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);

	/* Set the next node and previous node fields, although
	they should already have been set. The previous node field
	must be FIL_NULL if root_page_zip != NULL, because the
	REC_INFO_MIN_REC_FLAG (of the first user record) will be
	set if and only if btr_page_get_prev() == FIL_NULL. */
	btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert node pointer to the root */

	page_cur_set_before_first(root_block, page_cursor);

	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
					     index, 0, mtr);

	/* The root page should only contain the node pointer
	to new_page at this point.  Thus, the data should fit. */
	ut_a(node_ptr_rec);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* We play safe and reset the free bits for the new page */

#if 0
	fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
#endif

	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(new_block);
	}

	/* Reposition the cursor to the child node */
	page_cur_search(new_block, index, tuple,
			PAGE_CUR_LE, page_cursor);

	/* Split the child and insert tuple */
	return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
}

/*************************************************************//**
Decides if the page should be split at the convergence point of inserts
converging to the left.
@return	TRUE if split recommended */
btr_page_get_split_rec_to_left(
/*===========================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
	rec_t**		split_rec) /*!< out: if split recommended,
				the first record on upper half page,
				or NULL if tuple to be inserted should
				be first */

	rec_t*	insert_point;

	page = btr_cur_get_page(cursor);
	insert_point = btr_cur_get_rec(cursor);

	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
	    == page_rec_get_next(insert_point)) {

		infimum = page_get_infimum_rec(page);

		/* If the convergence is in the middle of a page, include also
		the record immediately before the new insert to the upper
		page. Otherwise, we could repeatedly move from page to page
		lots of records smaller than the convergence point. */

		if (infimum != insert_point
		    && page_rec_get_next(infimum) != insert_point) {

			*split_rec = insert_point;
		} else {
			*split_rec = page_rec_get_next(insert_point);
		}

		return(TRUE);
	}

	return(FALSE);
}

/*************************************************************//**
Decides if the page should be split at the convergence point of inserts
converging to the right.
@return	TRUE if split recommended */
btr_page_get_split_rec_to_right(
/*============================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
	rec_t**		split_rec) /*!< out: if split recommended,
				the first record on upper half page,
				or NULL if tuple to be inserted should
				be first */

	rec_t*	insert_point;

	page = btr_cur_get_page(cursor);
	insert_point = btr_cur_get_rec(cursor);

	/* We use eager heuristics: if the new insert would be right after
	the previous insert on the same page, we assume that there is a
	pattern of sequential inserts here. */

	if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
			== insert_point)) {

		rec_t*	next_rec;

		next_rec = page_rec_get_next(insert_point);

		if (page_rec_is_supremum(next_rec)) {
split_at_new:
			/* Split at the new record to insert */
			*split_rec = NULL;
		} else {
			rec_t*	next_next_rec = page_rec_get_next(next_rec);
			if (page_rec_is_supremum(next_next_rec)) {

				goto split_at_new;
			}

			/* If there are >= 2 user records up from the insert
			point, split all but 1 off. We want to keep one because
			then sequential inserts can use the adaptive hash
			index, as they can do the necessary checks of the right
			search position just by looking at the records on this
			page. */

			*split_rec = next_next_rec;
		}

		return(TRUE);
	}

	return(FALSE);
}
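
/* A condensed sketch of how the two heuristics and the middle split
combine; it mirrors the decision cascade in btr_page_split_and_insert()
below (the variables are that function's locals):

@code
	if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
		direction = FSP_UP;
	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
		direction = FSP_DOWN;
	} else {
		direction = FSP_UP;
		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
	}
@endcode */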

/*************************************************************//**
Calculates a split record such that the tuple will certainly fit on
its half-page when the split is performed. We assume in this function
only that the cursor page has at least one user record.
@return	split record, or NULL if tuple will be the first record on
the lower or upper half-page (determined by btr_page_tuple_smaller()) */
btr_page_get_split_rec(
/*===================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext)	/*!< in: number of externally stored columns */

	page_zip_des_t*	page_zip;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	page_zip = btr_cur_get_page_zip(cursor);
	if (UNIV_LIKELY_NULL(page_zip)) {
		/* Estimate the free space of an empty compressed page. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields,
			page_zip_get_size(page_zip));

		if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
			free_space = (ulint) free_space_zip;
		}
	}

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);

	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	/* We start to include records to the left half, and when the
	space reserved by them exceeds half of total_space, then if
	the included records fit on the left page, they will be put there,
	if something was left over also for the right page;
	otherwise the last included record will be the first on the right
	half page */

	do {
		/* Decide the next record to include */
		if (rec == ins_rec) {
			rec = NULL;	/* NULL denotes that tuple is
					inserted here */
		} else if (rec == NULL) {
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			/* Include tuple */
			incl_data += insert_size;
		} else {
			offsets = rec_get_offsets(rec, cursor->index,
						  offsets, ULINT_UNDEFINED,
						  &heap);
			incl_data += rec_offs_size(offsets);
		}

		n++;
	} while (incl_data + page_dir_calc_reserved_space(n)
		 < total_space / 2);

	if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
		/* The next record will be the first on
		the right half page if it is not the
		supremum record of page */

		if (rec == ins_rec) {
			rec = NULL;

			goto func_exit;
		} else if (rec == NULL) {
			next_rec = page_rec_get_next(ins_rec);
		} else {
			next_rec = page_rec_get_next(rec);
		}
		if (!page_rec_is_supremum(next_rec)) {
			rec = next_rec;
		}
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(rec);
}
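
/* A simplified toy model (not InnoDB code; it ignores the page
directory overhead and the insert position handling above) of the
accumulate-until-half rule used by btr_page_get_split_rec():

@code
static int
toy_split_point(const int* rec_size, int n_recs, int total_space)
{
	int	i;
	int	incl = 0;

	for (i = 0; i < n_recs; i++) {
		incl += rec_size[i];
		if (incl >= total_space / 2) {
			break;
		}
	}

	return(i);
}
@endcode */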

/*************************************************************//**
Returns TRUE if the insert fits on the appropriate half-page with the
chosen split_rec.
@return	TRUE if fits */
btr_page_insert_fits(
/*=================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert
				should be made */
	const rec_t*	split_rec,/*!< in: suggestion for first record
				on upper half-page, or NULL if
				tuple to be inserted should be first */
	const ulint*	offsets,/*!< in: rec_get_offsets(
				split_rec, cursor->index) */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mem_heap_t*	heap)	/*!< in: temporary memory heap */

	const rec_t*	end_rec;

	page = btr_cur_get_page(cursor);

	ut_ad(!split_rec == !offsets);
	ut_ad(!offsets
	      || !page_is_comp(page) == !rec_offs_comp(offsets));
	ut_ad(!offsets
	      || rec_offs_validate(split_rec, cursor->index, offsets));

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;

	/* We determine which records (from rec to end_rec, not including
	end_rec) will end up on the other half page from tuple when it is
	inserted. */

	if (split_rec == NULL) {
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));

	} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {

		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = split_rec;
	} else {
		rec = split_rec;
		end_rec = page_get_supremum_rec(page);
	}

	if (total_data + page_dir_calc_reserved_space(total_n_recs)
	    <= free_space) {

		/* Ok, there will be enough available space on the
		half page where the tuple is inserted */

		return(TRUE);
	}

	while (rec != end_rec) {
		/* In this loop we calculate the amount of reserved
		space after rec is removed from page. */

		offs = rec_get_offsets(rec, cursor->index, offs,
				       ULINT_UNDEFINED, &heap);

		total_data -= rec_offs_size(offs);
		total_n_recs--;

		if (total_data + page_dir_calc_reserved_space(total_n_recs)
		    <= free_space) {

			/* Ok, there will be enough available space on the
			half page where the tuple is inserted */

			return(TRUE);
		}

		rec = page_rec_get_next_const(rec);
	}

	return(FALSE);
}

/*******************************************************//**
Inserts a data tuple to a tree on a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */
btr_insert_on_non_leaf_level_func(
/*==============================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: level, must be > 0 */
	dtuple_t*	tuple,	/*!< in: the record to be inserted */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */

	big_rec_t*	dummy_big_rec;

	btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE,
				    &cursor, 0, file, line, mtr);

	err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG
					 | BTR_NO_UNDO_LOG_FLAG,
					 &cursor, tuple, &rec,
					 &dummy_big_rec, 0, NULL, mtr);
	ut_a(err == DB_SUCCESS);
}

/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree. */
btr_attach_half_pages(
/*==================*/
	dict_index_t*	index,		/*!< in: the index tree */
	buf_block_t*	block,		/*!< in/out: page to be split */
	rec_t*		split_rec,	/*!< in: first record on upper
					half page */
	buf_block_t*	new_block,	/*!< in/out: the new half page */
	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
	mtr_t*		mtr)		/*!< in: mtr */

	page_t*		page		= buf_block_get_frame(block);
	ulint		lower_page_no;
	ulint		upper_page_no;
	page_zip_des_t*	lower_page_zip;
	page_zip_des_t*	upper_page_zip;
	dtuple_t*	node_ptr_upper;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));

	/* Create a memory heap where the data tuple is stored */
	heap = mem_heap_create(1024);

	/* Based on split direction, decide upper and lower pages */
	if (direction == FSP_DOWN) {

		btr_cur_t	cursor;
		ulint*		offsets;

		lower_page = buf_block_get_frame(new_block);
		lower_page_no = buf_block_get_page_no(new_block);
		lower_page_zip = buf_block_get_page_zip(new_block);
		upper_page = buf_block_get_frame(block);
		upper_page_no = buf_block_get_page_no(block);
		upper_page_zip = buf_block_get_page_zip(block);

		/* Look up the index for the node pointer to page */
		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);

		/* Replace the address of the old child node (= page) with the
		address of the new lower half */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&cursor),
			btr_cur_get_page_zip(&cursor),
			offsets, lower_page_no, mtr);
		mem_heap_empty(heap);
	} else {
		lower_page = buf_block_get_frame(block);
		lower_page_no = buf_block_get_page_no(block);
		lower_page_zip = buf_block_get_page_zip(block);
		upper_page = buf_block_get_frame(new_block);
		upper_page_no = buf_block_get_page_no(new_block);
		upper_page_zip = buf_block_get_page_zip(new_block);
	}

	/* Get the level of the split pages */
	level = btr_page_get_level(buf_block_get_frame(block), mtr);
	ut_ad(level
	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));

	/* Build the node pointer (= node key and page address) for the upper
	half */

	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
						   upper_page_no, heap, level);

	/* Insert it next to the pointer to the lower half. Note that this
	may generate recursion leading to a split on the higher level. */

	btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* Get the previous and next pages of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block = btr_block_get(space, zip_size,
							   prev_page_no,
							   RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_block->frame, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(buf_block_get_frame(prev_block),
				  buf_block_get_page_zip(prev_block),
				  lower_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block = btr_block_get(space, zip_size,
							   next_page_no,
							   RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_block->frame, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(buf_block_get_frame(next_block),
				  buf_block_get_page_zip(next_block),
				  upper_page_no, mtr);
	}

	btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);

	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
	btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
}

/*************************************************************//**
Determine if a tuple is smaller than any record on the page.
@return	TRUE if smaller */
btr_page_tuple_smaller(
/*===================*/
	btr_cur_t*	cursor,	/*!< in: b-tree cursor */
	const dtuple_t*	tuple,	/*!< in: tuple to consider */
	ulint*		offsets,/*!< in/out: temporary storage */
	ulint		n_uniq,	/*!< in: number of unique fields
				in the index page records */
	mem_heap_t**	heap)	/*!< in/out: heap for offsets */

	const rec_t*	first_rec;

	/* Read the first user record in the page. */
	block = btr_cur_get_block(cursor);
	page_cur_set_before_first(block, &pcur);
	page_cur_move_to_next(&pcur);
	first_rec = page_cur_get_rec(&pcur);

	offsets = rec_get_offsets(
		first_rec, cursor->index, offsets,
		n_uniq, heap);

	return(cmp_dtuple_rec(tuple, first_rec, offsets) < 0);
}

/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.
@return	inserted record */
btr_page_split_and_insert(
/*======================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
				function returns, the cursor is positioned
				on the predecessor of the inserted record */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */

	page_zip_des_t*	page_zip;
	buf_block_t*	new_block;
	page_zip_des_t*	new_page_zip;
	buf_block_t*	left_block;
	buf_block_t*	right_block;
	buf_block_t*	insert_block;
	page_cur_t*	page_cursor;
	byte*		buf = 0; /* remove warning */
	ibool		insert_will_fit;
	ulint		n_iterations = 0;

	heap = mem_heap_create(1024);
	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
func_start:
	mem_heap_empty(heap);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(page_get_n_recs(page) >= 1);

	page_no = buf_block_get_page_no(block);

	/* 1. Decide the split record; split_rec == NULL means that the
	tuple to be inserted should be the first record on the upper
	half-page */
	insert_left = FALSE;

	if (n_iterations > 0) {
		direction = FSP_UP;
		hint_page_no = page_no + 1;
		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);

		if (UNIV_UNLIKELY(split_rec == NULL)) {
			insert_left = btr_page_tuple_smaller(
				cursor, tuple, offsets, n_uniq, &heap);
		}
	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
		direction = FSP_DOWN;
		hint_page_no = page_no - 1;
		ut_ad(split_rec);
	} else {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

		/* If there is only one record in the index page, we
		can't split the node in the middle by default. We need
		to determine whether the new record will be inserted
		to the left or right. */

		if (page_get_n_recs(page) > 1) {
			split_rec = page_get_middle_rec(page);
		} else if (btr_page_tuple_smaller(cursor, tuple,
						  offsets, n_uniq, &heap)) {
			split_rec = page_rec_get_next(
				page_get_infimum_rec(page));
		} else {
			split_rec = NULL;
		}
	}

	/* 2. Allocate a new page to the index */
	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
				   btr_page_get_level(page, mtr), mtr);
	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	btr_page_create(new_block, new_page_zip, cursor->index,
			btr_page_get_level(page, mtr), mtr);

	/* 3. Calculate the first record on the upper half-page, and the
	first record (move_limit) on original page which ends up on the
	upper half */

	if (split_rec) {
		first_rec = move_limit = split_rec;

		offsets = rec_get_offsets(split_rec, cursor->index, offsets,
					  n_uniq, &heap);

		insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;

		if (UNIV_UNLIKELY(!insert_left && new_page_zip
				  && n_iterations > 0)) {
			/* If a compressed page has already been split,
			avoid further splits by inserting the record
			to an empty page. */
			split_rec = NULL;
			goto insert_empty;
		}
	} else if (UNIV_UNLIKELY(insert_left)) {
		ut_a(n_iterations > 0);
		first_rec = page_rec_get_next(page_get_infimum_rec(page));
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	} else {
insert_empty:
		ut_ad(!insert_left);
		buf = mem_alloc(rec_get_converted_size(cursor->index,
						       tuple, n_ext));

		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
						      tuple, n_ext);
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	}

	/* 4. Do first the modifications in the tree structure */

	btr_attach_half_pages(cursor->index, block,
			      first_rec, new_block, direction, mtr);

	/* If the split is made on the leaf level and the insert will fit
	on the appropriate half-page, we may release the tree x-latch.
	We can then move the records after releasing the tree latch,
	thus reducing the tree latch contention. */

	if (split_rec) {
		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, split_rec,
						offsets, tuple, n_ext, heap);
	} else {
		if (!insert_left) {
			mem_free(buf);
			buf = NULL;
		}

		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, NULL,
						NULL, tuple, n_ext, heap);
	}
2042
if (insert_will_fit && page_is_leaf(page)) {
2044
mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
2048
	/* 5. Move then the records to the new page */
	if (direction == FSP_DOWN) {
		/*		fputs("Split left\n", stderr); */

		if (0
#ifdef UNIV_ZIP_COPY
		    || page_zip
#endif /* UNIV_ZIP_COPY */
		    || UNIV_UNLIKELY
		    (!page_move_rec_list_start(new_block, block, move_limit,
					       cursor->index, mtr))) {
			/* For some reason, compressing new_page failed,
			even though it should contain fewer records than
			the original page.  Copy the page byte for byte
			and then delete the records from both pages
			as appropriate.  Deleting will always succeed. */
			ut_a(new_page_zip);

			page_zip_copy_recs(new_page_zip, new_page,
					   page_zip, page, cursor->index, mtr);
			page_delete_rec_list_end(move_limit - page + new_page,
						 new_block, cursor->index,
						 ULINT_UNDEFINED,
						 ULINT_UNDEFINED, mtr);

			/* Update the lock table and possible hash index. */

			lock_move_rec_list_start(
				new_block, block, move_limit,
				new_page + PAGE_NEW_INFIMUM);

			btr_search_move_or_delete_hash_entries(
				new_block, block, cursor->index);

			/* Delete the records from the source page. */

			page_delete_rec_list_start(move_limit, block,
						   cursor->index, mtr);
		}

		left_block = new_block;
		right_block = block;

		lock_update_split_left(right_block, left_block);
/* fputs("Split right\n", stderr); */
2096
#ifdef UNIV_ZIP_COPY
2098
#endif /* UNIV_ZIP_COPY */
2100
(!page_move_rec_list_end(new_block, block, move_limit,
2101
cursor->index, mtr))) {
2102
/* For some reason, compressing new_page failed,
2103
even though it should contain fewer records than
2104
the original page. Copy the page byte for byte
2105
and then delete the records from both pages
2106
as appropriate. Deleting will always succeed. */
2109
page_zip_copy_recs(new_page_zip, new_page,
2110
page_zip, page, cursor->index, mtr);
2111
page_delete_rec_list_start(move_limit - page
2112
+ new_page, new_block,
2113
cursor->index, mtr);
2115
/* Update the lock table and possible hash index. */
2117
lock_move_rec_list_end(new_block, block, move_limit);
2119
btr_search_move_or_delete_hash_entries(
2120
new_block, block, cursor->index);
2122
/* Delete the records from the source page. */
2124
page_delete_rec_list_end(move_limit, block,
2127
ULINT_UNDEFINED, mtr);
2131
right_block = new_block;
2133
lock_update_split_right(right_block, left_block);
2136
#ifdef UNIV_ZIP_DEBUG
	if (UNIV_LIKELY_NULL(page_zip)) {
		ut_a(page_zip_validate(page_zip, page));
		ut_a(page_zip_validate(new_page_zip, new_page));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* At this point, split_rec, move_limit and first_rec may point
	to garbage on the old page. */
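	/* The expression (move_limit - page + new_page) used above
	translates a pointer within one page frame to the same byte
	offset within another frame.  A minimal illustration of the
	idiom, not part of the original source; the variable names
	below are hypothetical: */
#if 0
	{
		byte*	old_frame  = page;	   /* source page frame */
		byte*	new_frame  = new_page;	   /* destination frame */
		byte*	rec_in_old = move_limit;   /* record in old frame */
		ulint	offs	   = rec_in_old - old_frame;
		byte*	rec_in_new = new_frame + offs;	/* same offset */

		ut_ad(rec_in_new == rec_in_old - old_frame + new_frame);
	}
#endif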
	/* 6. The split and the tree modification is now completed. Decide the
	page where the tuple should be inserted */

	if (insert_left) {
		insert_block = left_block;
	} else {
		insert_block = right_block;
	}

	/* 7. Reposition the cursor for insert and try insertion */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_search(insert_block, cursor->index, tuple,
			PAGE_CUR_LE, page_cursor);

	rec = page_cur_tuple_insert(page_cursor, tuple,
				    cursor->index, n_ext, mtr);
#ifdef UNIV_ZIP_DEBUG
	{
		page_t*		insert_page
			= buf_block_get_frame(insert_block);

		page_zip_des_t*	insert_page_zip
			= buf_block_get_page_zip(insert_block);

		ut_a(!insert_page_zip
		     || page_zip_validate(insert_page_zip, insert_page));
	}
#endif /* UNIV_ZIP_DEBUG */
	if (UNIV_LIKELY(rec != NULL)) {

		goto func_exit;
	}

	/* 8. If insert did not fit, try page reorganization */

	if (UNIV_UNLIKELY
	    (!btr_page_reorganize(insert_block, cursor->index, mtr))) {

		goto insert_failed;
	}

	page_cur_search(insert_block, cursor->index, tuple,
			PAGE_CUR_LE, page_cursor);
	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    n_ext, mtr);

	if (UNIV_UNLIKELY(rec == NULL)) {
		/* The insert did not fit on the page: loop back to the
		start of the function for a new split */
insert_failed:
		/* We play safe and reset the free bits for new_page */
		if (!dict_index_is_clust(cursor->index)) {
			ibuf_reset_free_bits(new_block);
		}

		/*	fprintf(stderr, "Split second round %lu\n",
			page_get_page_no(page)); */
		n_iterations++;
		ut_ad(n_iterations < 2
		      || buf_block_get_page_zip(insert_block));
		ut_ad(!insert_will_fit);

		goto func_start;
	}

func_exit:
	/* Insert fit on the page: update the free bits for the
	left and right pages in the same mtr */

	if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
		ibuf_update_free_bits_for_two_pages_low(
			buf_block_get_zip_size(left_block),
			left_block, right_block, mtr);
	}

#if 0
	fprintf(stderr, "Split and insert done %lu %lu\n",
		buf_block_get_page_no(left_block),
		buf_block_get_page_no(right_block));
#endif

	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));

	mem_heap_free(heap);
	return(rec);
}
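/* Control flow of btr_page_split_and_insert() in miniature: one split per
iteration, looping back to func_start until the tuple fits on a half-page.
A compact restatement as a self-contained sketch; the helper name is
hypothetical and this is not part of the original source: */
#if 0
static
rec_t*
split_loop_sketch(
	btr_cur_t*	cursor,		/* in: insert position */
	const dtuple_t*	tuple,		/* in: entry to insert */
	ulint		n_ext,		/* in: externally stored columns */
	mtr_t*		mtr)		/* in: mtr holding the tree x-latch */
{
	ulint	n_iterations;
	rec_t*	rec = NULL;

	for (n_iterations = 0; rec == NULL; n_iterations++) {
		/* steps 1-5: choose the split record, allocate and
		attach the new half-page, move the records over;
		steps 6-8: pick the half-page, try the insert, retry
		once after btr_page_reorganize() */
		rec = try_split_once(cursor, tuple, n_ext, mtr,
				     n_iterations);	/* hypothetical */
	}

	return(rec);
}
#endif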
/*************************************************************//**
Removes a page from the level list of pages. */
static
void
btr_level_list_remove(
/*==================*/
	ulint		space,	/*!< in: space where removed */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	page_t*		page,	/*!< in: page to remove */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint	prev_page_no;
	ulint	next_page_no;

	ut_ad(page && mtr);
	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
	ut_ad(space == page_get_space_id(page));
	/* Get the previous and next page numbers of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block
			= btr_block_get(space, zip_size, prev_page_no,
					RW_X_LATCH, mtr);
		page_t*		prev_page
			= buf_block_get_frame(prev_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(prev_page,
				  buf_block_get_page_zip(prev_block),
				  next_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block
			= btr_block_get(space, zip_size, next_page_no,
					RW_X_LATCH, mtr);
		page_t*		next_page
			= buf_block_get_frame(next_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(next_page,
				  buf_block_get_page_zip(next_block),
				  prev_page_no, mtr);
	}
}
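/* The level list is a doubly linked list of sibling pages, kept in the
FIL_PAGE_PREV and FIL_PAGE_NEXT fields.  A minimal self-contained sketch
of the same unlink operation on an ordinary in-memory list; the node type
and names are hypothetical, for illustration only: */
#if 0
struct level_node_t {
	struct level_node_t*	prev;
	struct level_node_t*	next;
};

static
void
level_node_unlink(
	struct level_node_t*	node)	/* in: node to remove */
{
	/* Exactly as above: repoint the neighbours, if they exist,
	around the removed node; FIL_NULL plays the role of NULL. */
	if (node->prev != NULL) {
		node->prev->next = node->next;
	}
	if (node->next != NULL) {
		node->next->prev = node->prev;
	}
}
#endif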
/****************************************************************//**
Writes the redo log record for setting an index record as the predefined
minimum record. */
UNIV_INLINE
void
btr_set_min_rec_mark_log(
/*=====================*/
	rec_t*	rec,	/*!< in: record */
	byte	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
	mtr_t*	mtr)	/*!< in: mtr */
{
	mlog_write_initial_log_record(rec, type, mtr);

	/* Write rec offset as a 2-byte ulint */
	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
}
#else /* !UNIV_HOTBACKUP */
# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
/****************************************************************//**
Parses the redo log record for setting an index record as the predefined
minimum record.
@return	end of log record or NULL */
UNIV_INTERN
byte*
btr_parse_set_min_rec_mark(
/*=======================*/
	byte*	ptr,	/*!< in: buffer */
	byte*	end_ptr,/*!< in: buffer end */
	ulint	comp,	/*!< in: nonzero=compact page format */
	page_t*	page,	/*!< in: page or NULL */
	mtr_t*	mtr)	/*!< in: mtr or NULL */
{
	rec_t*	rec;

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	if (page) {
		ut_a(!page_is_comp(page) == !comp);

		rec = page + mach_read_from_2(ptr);

		btr_set_min_rec_mark(rec, mtr);
	}

	return(ptr + 2);
}
/****************************************************************//**
Sets a record as the predefined minimum record. */
UNIV_INTERN
void
btr_set_min_rec_mark(
/*=================*/
	rec_t*	rec,	/*!< in: record */
	mtr_t*	mtr)	/*!< in: mtr */
{
	ulint	info_bits;

	if (UNIV_LIKELY(page_rec_is_comp(rec))) {
		info_bits = rec_get_info_bits(rec, TRUE);

		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);

		btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
	} else {
		info_bits = rec_get_info_bits(rec, FALSE);

		rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);

		btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
	}
}
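/* The redo payload written by btr_set_min_rec_mark_log() is just the
record's 2-byte offset within its page, stored big-endian; this is what
mach_read_from_2() in the parse function decodes.  A minimal self-contained
sketch of the encoding, with hypothetical names, for illustration only: */
#if 0
static
void
min_rec_mark_log_sketch(
	byte*	dst,	/* out: 2-byte buffer in the log */
	ulint	offs)	/* in: page_offset(rec), < UNIV_PAGE_SIZE */
{
	/* Equivalent to mach_write_to_2(dst, offs): most significant
	byte first, so that mach_read_from_2(dst) returns offs. */
	dst[0] = (byte) (offs >> 8);
	dst[1] = (byte) (offs & 0xFF);
}
#endif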
#ifndef UNIV_HOTBACKUP
/*************************************************************//**
Deletes on the upper level the node pointer to a page. */
UNIV_INTERN
void
btr_node_ptr_delete(
/*================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page whose node pointer is deleted */
	mtr_t*		mtr)	/*!< in: mtr */
{
	btr_cur_t	cursor;
	ibool		compressed;
	ulint		err;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	/* Delete node pointer on father page */
	btr_page_get_father(index, block, mtr, &cursor);

	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, RB_NONE,
						mtr);
	ut_a(err == DB_SUCCESS);

	if (!compressed) {
		btr_cur_compress_if_useful(&cursor, mtr);
	}
}
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height. */
static
void
btr_lift_page_up(
/*=============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level;
				must not be empty: use
				btr_discard_only_page_on_level if the last
				record from the page should be removed */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	father_block;
	page_t*		father_page;
	ulint		page_level;
	page_zip_des_t*	father_page_zip;
	page_t*		page		= buf_block_get_frame(block);
	ulint		root_page_no;
	buf_block_t*	blocks[BTR_MAX_LEVELS];
	ulint		n_blocks;	/*!< last used index in blocks[] */
	ulint		i;

	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	page_level = btr_page_get_level(page, mtr);
	root_page_no = dict_index_get_page(index);

	{
		btr_cur_t	cursor;
		mem_heap_t*	heap	= mem_heap_create(100);
		ulint*		offsets;
		buf_block_t*	b;

		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);
		father_block = btr_cur_get_block(&cursor);
		father_page_zip = buf_block_get_page_zip(father_block);
		father_page = buf_block_get_frame(father_block);

		n_blocks = 0;

		/* Store all ancestor pages so we can reset their
		levels later on.  We have to do all the searches on
		the tree now because later on, after we've replaced
		the first level, the tree is in an inconsistent state
		and can not be searched. */
		for (b = father_block;
		     buf_block_get_page_no(b) != root_page_no; ) {
			ut_a(n_blocks < BTR_MAX_LEVELS);

			offsets = btr_page_get_father_block(offsets, heap,
							    index, b,
							    mtr, &cursor);

			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
		}

		mem_heap_free(heap);
	}

	btr_search_drop_page_hash_index(block);

	/* Make the father empty */
	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
	page_level++;

	/* Copy the records to the father page one by one. */
	if (0
#ifdef UNIV_ZIP_COPY
	    || father_page_zip
#endif /* UNIV_ZIP_COPY */
	    || UNIV_UNLIKELY
	    (!page_copy_rec_list_end(father_block, block,
				     page_get_infimum_rec(page),
				     index, mtr))) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(father_page_zip);
		ut_a(page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(father_page_zip, father_page,
				   page_zip, page, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(father_block, block,
				       page_get_infimum_rec(page));

		btr_search_move_or_delete_hash_entries(father_block, block,
						       index);
	}

	lock_update_copy_and_discard(father_block, block);

	/* Go upward to root page, decrementing levels by one. */
	for (i = 0; i < n_blocks; i++, page_level++) {
		page_t*		page	= buf_block_get_frame(blocks[i]);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);

		ut_ad(btr_page_get_level(page, mtr) == page_level + 1);

		btr_page_set_level(page, page_zip, page_level, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	}

	/* Free the file page */
	btr_page_free(index, block, mtr);

	/* We play it safe and reset the free bits for the father */
	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(father_block);
	}
	ut_ad(page_validate(father_page, index));
	ut_ad(btr_check_node_ptr(index, father_block, mtr));
}
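/* After btr_page_empty() the father has taken over the lifted page's
level, and every stored ancestor in blocks[] must then drop by exactly
one level, bottom-up.  The same renumbering over a plain array, as a
self-contained sketch with hypothetical names (not part of the original
source): */
#if 0
static
void
renumber_levels_sketch(
	ulint*	levels,		/* in/out: levels[0] is the lowest stored
				ancestor, levels[n - 1] the highest */
	ulint	n,		/* in: number of stored ancestors */
	ulint	new_level)	/* in: new level of the lowest ancestor */
{
	ulint	i;

	for (i = 0; i < n; i++, new_level++) {
		ut_ad(levels[i] == new_level + 1);	/* one too high */
		levels[i] = new_level;			/* drop by one */
	}
}
#endif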
/*************************************************************//**
Tries to merge the page first to the left immediate brother if such a
brother exists, and the node pointers to the current page and to the brother
reside on the same page. If the left brother does not satisfy these
conditions, looks at the right brother. If the page is the only one on that
level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to the
brothers, if they exist.
@return	TRUE on success */
UNIV_INTERN
ibool
btr_compress(
/*=========*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to merge or lift;
				the page must not be empty: in record delete
				use btr_discard_page if the page would become
				empty */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;
	page_t*		merge_page;
	page_zip_des_t*	merge_page_zip;
	ibool		is_left;
	buf_block_t*	block;
	page_t*		page;
	btr_cur_t	father_cursor;
	mem_heap_t*	heap;
	ulint*		offsets;
	ulint		data_size;
	ulint		n_recs;
	ulint		max_ins_size;
	ulint		max_ins_size_reorg;

	block = btr_cur_get_block(cursor);
	page = btr_cur_get_page(cursor);
	index = btr_cur_get_index(cursor);
	ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	left_page_no = btr_page_get_prev(page, mtr);
	right_page_no = btr_page_get_next(page, mtr);

#if 0
	fprintf(stderr, "Merge left page %lu right %lu \n",
		left_page_no, right_page_no);
#endif

	heap = mem_heap_create(100);
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &father_cursor);

	/* Decide the page to which we try to merge and which will inherit
	the locks */

	is_left = left_page_no != FIL_NULL;

	if (is_left) {

		merge_block = btr_block_get(space, zip_size, left_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else if (right_page_no != FIL_NULL) {

		merge_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else {
		/* The page is the only one on the level, lift the records
		to the father */
		btr_lift_page_up(index, block, mtr);
		mem_heap_free(heap);
		return(TRUE);
	}

	n_recs = page_get_n_recs(page);
	data_size = page_get_data_size(page);
#ifdef UNIV_BTR_DEBUG
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
		merge_page, n_recs);
	if (data_size > max_ins_size_reorg) {

		/* No space for merge */
err_exit:
		/* We play it safe and reset the free bits. */
		if (zip_size
		    && page_is_leaf(merge_page)
		    && !dict_index_is_clust(index)) {
			ibuf_reset_free_bits(merge_block);
		}

		mem_heap_free(heap);
		return(FALSE);
	}

	ut_ad(page_validate(merge_page, index));

	max_ins_size = page_get_max_insert_size(merge_page, n_recs);

	if (UNIV_UNLIKELY(data_size > max_ins_size)) {

		/* We have to reorganize merge_page */

		if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
						       index, mtr))) {

			goto err_exit;
		}

		max_ins_size = page_get_max_insert_size(merge_page, n_recs);

		ut_ad(page_validate(merge_page, index));
		ut_ad(max_ins_size == max_ins_size_reorg);

		if (UNIV_UNLIKELY(data_size > max_ins_size)) {

			/* Add fault tolerance, though this should
			never happen */

			goto err_exit;
		}
	}

	merge_page_zip = buf_block_get_page_zip(merge_block);
#ifdef UNIV_ZIP_DEBUG
	if (UNIV_LIKELY_NULL(merge_page_zip)) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(page_zip);
		ut_a(page_zip_validate(merge_page_zip, merge_page));
		ut_a(page_zip_validate(page_zip, page));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* Move records to the merge page */
	if (is_left) {
		rec_t*	orig_pred = page_copy_rec_list_start(
			merge_block, block, page_get_supremum_rec(page),
			index, mtr);

		if (UNIV_UNLIKELY(!orig_pred)) {
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, mtr);

		btr_node_ptr_delete(index, block, mtr);
		lock_update_merge_left(merge_block, orig_pred, block);
	} else {
		rec_t*		orig_succ;
#ifdef UNIV_BTR_DEBUG
		byte		fil_page_prev[4];
#endif /* UNIV_BTR_DEBUG */

		if (UNIV_LIKELY_NULL(merge_page_zip)) {
			/* The function page_zip_compress(), which will be
			invoked by page_copy_rec_list_end() below,
			requires that FIL_PAGE_PREV be FIL_NULL.
			Clear the field, but prepare to restore it. */
#ifdef UNIV_BTR_DEBUG
			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
#endif /* UNIV_BTR_DEBUG */
#if FIL_NULL != 0xffffffff
# error "FIL_NULL != 0xffffffff"
#endif
			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
		}

		orig_succ = page_copy_rec_list_end(merge_block, block,
						   page_get_infimum_rec(page),
						   cursor->index, mtr);

		if (UNIV_UNLIKELY(!orig_succ)) {
			ut_a(merge_page_zip);
#ifdef UNIV_BTR_DEBUG
			/* FIL_PAGE_PREV was restored from merge_page_zip. */
			ut_a(!memcmp(fil_page_prev,
				     merge_page + FIL_PAGE_PREV, 4));
#endif /* UNIV_BTR_DEBUG */
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

#ifdef UNIV_BTR_DEBUG
		if (UNIV_LIKELY_NULL(merge_page_zip)) {
			/* Restore FIL_PAGE_PREV in order to avoid an assertion
			failure in btr_level_list_remove(), which will set
			the field again to FIL_NULL.  Even though this makes
			merge_page and merge_page_zip inconsistent for a
			split second, it is harmless, because the pages
			are X-latched. */
			memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
		}
#endif /* UNIV_BTR_DEBUG */

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, mtr);

		/* Replace the address of the old child node (= page) with the
		address of the merge page to the right */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&father_cursor),
			btr_cur_get_page_zip(&father_cursor),
			offsets, right_page_no, mtr);
		btr_node_ptr_delete(index, merge_block, mtr);

		lock_update_merge_right(merge_block, orig_succ, block);
	}

	mem_heap_free(heap);

	if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap.  This has to be done in a
		separate mini-transaction that is committed before the
		main mini-transaction.  We cannot update the insert
		buffer bitmap in this mini-transaction, because
		btr_compress() can be invoked recursively without
		committing the mini-transaction in between.  Since
		insert buffer bitmap pages have a lower rank than
		B-tree pages, we must not access other pages in the
		same mini-transaction after accessing an insert buffer
		bitmap page. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Because the free bits may be incremented
			and we cannot update the insert buffer bitmap
			in the same mini-transaction, the only safe
			thing we can do here is the pessimistic
			approach: reset the free bits. */
			ibuf_reset_free_bits(merge_block);
		} else {
			/* On uncompressed pages, the free bits will
			never increase here.  Thus, it is safe to
			write the bits accurately in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(merge_block,
						      UNIV_PAGE_SIZE,
						      ULINT_UNDEFINED);
		}
	}

	ut_ad(page_validate(merge_page, index));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page));
#endif /* UNIV_ZIP_DEBUG */

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
	return(TRUE);
}
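/* btr_compress() prefers the left sibling and falls back to the right
one; only if both are FIL_NULL is the page lifted up.  A self-contained
sketch of just this decision, with a hypothetical enum and names, for
illustration only: */
#if 0
enum merge_choice { MERGE_LEFT, MERGE_RIGHT, LIFT_UP };

static
enum merge_choice
choose_merge_sketch(
	ulint	left_page_no,	/* in: FIL_PAGE_PREV of the page */
	ulint	right_page_no)	/* in: FIL_PAGE_NEXT of the page */
{
	if (left_page_no != FIL_NULL) {
		return(MERGE_LEFT);
	} else if (right_page_no != FIL_NULL) {
		return(MERGE_RIGHT);
	}

	/* Only page on its level: btr_lift_page_up() */
	return(LIFT_UP);
}
#endif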
/*************************************************************//**
Discards a page that is the only page on its level.  This will empty
the whole B-tree, leaving just an empty root page.  This function
should never be reached, because btr_compress(), which is invoked in
delete operations, calls btr_lift_page_up() to flatten the B-tree. */
static
void
btr_discard_only_page_on_level(
/*===========================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		page_level = 0;
	trx_id_t	max_trx_id;

	/* Save the PAGE_MAX_TRX_ID from the leaf page. */
	max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));

	while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
		btr_cur_t	cursor;
		buf_block_t*	father;
		const page_t*	page	= buf_block_get_frame(block);

		ut_a(page_get_n_recs(page) == 1);
		ut_a(page_level == btr_page_get_level(page, mtr));
		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);

		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
		btr_search_drop_page_hash_index(block);

		btr_page_get_father(index, block, mtr, &cursor);
		father = btr_cur_get_block(&cursor);

		lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);

		/* Free the file page */
		btr_page_free(index, block, mtr);

		block = father;
		page_level++;
	}

	/* block is the root page, which must be empty, except
	for the node pointer to the (now discarded) block(s). */

#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root	= buf_block_get_frame(block);
		const ulint	space	= dict_index_get_space(index);
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);

	if (!dict_index_is_clust(index)) {
		/* We play it safe and reset the free bits for the root */
		ibuf_reset_free_bits(block);

		if (page_is_leaf(buf_block_get_frame(block))) {
			ut_a(max_trx_id);
			page_set_max_trx_id(block,
					    buf_block_get_page_zip(block),
					    max_trx_id, mtr);
		}
	}
}
/*************************************************************//**
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
UNIV_INTERN
void
btr_discard_page(
/*=============*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
				the root page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;
	page_t*		merge_page;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		node_ptr;

	block = btr_cur_get_block(cursor);
	index = btr_cur_get_index(cursor);

	ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	/* Decide the page which will inherit the locks */

	left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
	right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);

	if (left_page_no != FIL_NULL) {
		merge_block = btr_block_get(space, zip_size, left_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else if (right_page_no != FIL_NULL) {
		merge_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else {
		btr_discard_only_page_on_level(index, block, mtr);

		return;
	}

	page = buf_block_get_frame(block);
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
	btr_search_drop_page_hash_index(block);

	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {

		/* We have to mark the leftmost node pointer on the right
		side page as the predefined minimum record */
		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));

		ut_ad(page_rec_is_user_rec(node_ptr));

		/* This will make page_zip_validate() fail on merge_page
		until btr_level_list_remove() completes.  This is harmless,
		because everything will take place within a single
		mini-transaction and because writing to the redo log
		is an atomic operation (performed by mtr_commit()). */
		btr_set_min_rec_mark(node_ptr, mtr);
	}

	btr_node_ptr_delete(index, block, mtr);

	/* Remove the page from the level list */
	btr_level_list_remove(space, zip_size, page, mtr);
#ifdef UNIV_ZIP_DEBUG
	{
		page_zip_des_t*	merge_page_zip
			= buf_block_get_page_zip(merge_block);
		ut_a(!merge_page_zip
		     || page_zip_validate(merge_page_zip, merge_page));
	}
#endif /* UNIV_ZIP_DEBUG */

	if (left_page_no != FIL_NULL) {
		lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
				    block);
	} else {
		lock_update_discard(merge_block,
				    lock_get_min_heap_no(merge_block),
				    block);
	}

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
}
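/* When a page is discarded, its explicit record locks are inherited by a
neighbour: the supremum of the left sibling if one exists, otherwise the
first user record of the right sibling.  A condensed sketch of the heir
selection made above, with a hypothetical variable, for illustration
only (not part of the original source): */
#if 0
	ulint	heir_heap_no = (left_page_no != FIL_NULL)
		? PAGE_HEAP_NO_SUPREMUM
		: lock_get_min_heap_no(merge_block);

	lock_update_discard(merge_block, heir_heap_no, block);
#endif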
#ifdef UNIV_BTR_PRINT
/*************************************************************//**
Prints size info of a B-tree. */
UNIV_INTERN
void
btr_print_size(
/*===========*/
	dict_index_t*	index)	/*!< in: index tree */
{
	page_t*		root;
	fseg_header_t*	seg;
	mtr_t		mtr;

	if (dict_index_is_ibuf(index)) {
		fputs("Sorry, cannot print info of an ibuf tree:"
		      " use ibuf functions\n", stderr);

		return;
	}

	mtr_start(&mtr);

	root = btr_root_get(index, &mtr);

	seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
	fseg_print(seg, &mtr);

	if (!(index->type & DICT_UNIVERSAL)) {

		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
		fseg_print(seg, &mtr);
	}

	mtr_commit(&mtr);
}
/************************************************************//**
Prints recursively index tree pages. */
static
void
btr_print_recursive(
/*================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	ulint		width,	/*!< in: print this many entries from start
				and end */
	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
	ulint**		offsets,/*!< in/out: buffer for rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */
{
	const page_t*	page	= buf_block_get_frame(block);
	page_cur_t	cursor;
	ulint		n_recs;
	ulint		i	= 0;
	mtr_t		mtr2;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
		(ulong) btr_page_get_level(page, mtr),
		(ulong) buf_block_get_page_no(block));

	page_print(block, index, width, width);

	n_recs = page_get_n_recs(page);

	page_cur_set_before_first(block, &cursor);
	page_cur_move_to_next(&cursor);

	while (!page_cur_is_after_last(&cursor)) {

		if (page_is_leaf(page)) {

			/* If this is the leaf level, do nothing */

		} else if ((i <= width) || (i >= n_recs - width)) {

			const rec_t*	node_ptr;

			mtr_start(&mtr2);

			node_ptr = page_cur_get_rec(&cursor);

			*offsets = rec_get_offsets(node_ptr, index, *offsets,
						   ULINT_UNDEFINED, heap);
			btr_print_recursive(index,
					    btr_node_ptr_get_child(node_ptr,
								   index,
								   *offsets,
								   &mtr2),
					    width, heap, offsets, &mtr2);
			mtr_commit(&mtr2);
		}

		page_cur_move_to_next(&cursor);
		i++;
	}
}
/**************************************************************//**
Prints directories and other info of all nodes in the tree. */
UNIV_INTERN
void
btr_print_index(
/*============*/
	dict_index_t*	index,	/*!< in: index */
	ulint		width)	/*!< in: print this many entries from start
				and end */
{
	mtr_t		mtr;
	buf_block_t*	root;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	fputs("--------------------------\n"
	      "INDEX TREE PRINT\n", stderr);

	mtr_start(&mtr);

	root = btr_root_block_get(index, &mtr);

	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	mtr_commit(&mtr);

	btr_validate_index(index, NULL);
}
#endif /* UNIV_BTR_PRINT */
#ifdef UNIV_DEBUG
/************************************************************//**
Checks that the node pointer to a page is appropriate.
@return	TRUE */
UNIV_INTERN
ibool
btr_check_node_ptr(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	mem_heap_t*	heap;
	dtuple_t*	tuple;
	ulint*		offsets;
	btr_cur_t	cursor;
	page_t*		page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	if (dict_index_get_page(index) == buf_block_get_page_no(block)) {

		return(TRUE);
	}

	heap = mem_heap_create(256);
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &cursor);

	if (page_is_leaf(page)) {

		goto func_exit;
	}

	tuple = dict_index_build_node_ptr(
		index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
		btr_page_get_level(page, mtr));

	ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));

func_exit:
	mem_heap_free(heap);

	return(TRUE);
}
#endif /* UNIV_DEBUG */
/************************************************************//**
Display identification information for a record. */
static
void
btr_index_rec_validate_report(
/*==========================*/
	const page_t*		page,	/*!< in: index page */
	const rec_t*		rec,	/*!< in: index record */
	const dict_index_t*	index)	/*!< in: index */
{
	fputs("InnoDB: Record in ", stderr);
	dict_index_name_print(stderr, NULL, index);
	fprintf(stderr, ", page %lu, at offset %lu\n",
		page_get_page_no(page), (ulint) page_offset(rec));
}
/************************************************************//**
Checks the size and number of fields in a record based on the definition of
the index.
@return	TRUE if ok */
UNIV_INTERN
ibool
btr_index_rec_validate(
/*===================*/
	const rec_t*		rec,		/*!< in: index record */
	const dict_index_t*	index,		/*!< in: index */
	ibool			dump_on_error)	/*!< in: TRUE if the function
						should print hex dump of record
						and page on error */
{
	ulint		len;
	ulint		n;
	ulint		i;
	const page_t*	page;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	page = page_align(rec);

	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
		/* The insert buffer index tree can contain records from any
		other index: we cannot check the number of fields or
		their length */

		return(TRUE);
	}

	if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
			  != dict_table_is_comp(index->table))) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
			(ulong) !!page_is_comp(page),
			(ulong) dict_table_is_comp(index->table));

		return(FALSE);
	}

	n = dict_index_get_n_fields(index);

	if (!page_is_comp(page)
	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
			(ulong) rec_get_n_fields_old(rec), (ulong) n);

		if (dump_on_error) {
			buf_page_print(page, 0);

			fputs("InnoDB: corrupt record ", stderr);
			rec_print_old(stderr, rec);
			putc('\n', stderr);
		}
		return(FALSE);
	}

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	for (i = 0; i < n; i++) {
		ulint	fixed_size = dict_col_get_fixed_size(
			dict_index_get_nth_col(index, i), page_is_comp(page));

		rec_get_nth_field_offs(offsets, i, &len);

		/* Note that if fixed_size != 0, it equals the
		length of a fixed-size column in the clustered index.
		A prefix index of the column is of fixed, but different
		length.  When fixed_size == 0, prefix_len is the maximum
		length of the prefix index column. */

		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
		     && len != UNIV_SQL_NULL && fixed_size
		     && len != fixed_size)
		    || (dict_index_get_nth_field(index, i)->prefix_len > 0
			&& len != UNIV_SQL_NULL
			&& len
			> dict_index_get_nth_field(index, i)->prefix_len)) {

			btr_index_rec_validate_report(page, rec, index);
			fprintf(stderr,
				"InnoDB: field %lu len is %lu,"
				" should be %lu\n",
				(ulong) i, (ulong) len, (ulong) fixed_size);

			if (dump_on_error) {
				buf_page_print(page, 0);

				fputs("InnoDB: corrupt record ", stderr);
				rec_print_new(stderr, rec, offsets);
				putc('\n', stderr);
			}
			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
			return(FALSE);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(TRUE);
}
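/* The field length check above distinguishes full-column fields from
prefix-index fields.  A condensed self-contained sketch of the rule,
with hypothetical names, for illustration only (not part of the
original source): */
#if 0
static
ibool
field_len_ok_sketch(
	ulint	prefix_len,	/* in: prefix length, 0 for full column */
	ulint	fixed_size,	/* in: fixed column size, 0 if variable */
	ulint	len)		/* in: stored length or UNIV_SQL_NULL */
{
	if (len == UNIV_SQL_NULL) {
		return(TRUE);	/* SQL NULL stores no data bytes */
	}
	if (prefix_len == 0) {
		/* Full column: a fixed-size column must store
		exactly its declared length. */
		return(!fixed_size || len == fixed_size);
	}
	/* Prefix index: at most prefix_len bytes may be stored. */
	return(len <= prefix_len);
}
#endif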
/************************************************************//**
Checks the size and number of fields in records based on the definition of
the index.
@return	TRUE if ok */
static
ibool
btr_index_page_validate(
/*====================*/
	buf_block_t*	block,	/*!< in: index page */
	dict_index_t*	index)	/*!< in: index */
{
	page_cur_t	cur;
	ibool		ret	= TRUE;

	page_cur_set_before_first(block, &cur);
	page_cur_move_to_next(&cur);

	for (;;) {
		if (page_cur_is_after_last(&cur)) {

			break;
		}

		if (!btr_index_rec_validate(cur.rec, index, TRUE)) {

			return(FALSE);
		}

		page_cur_move_to_next(&cur);
	}

	return(ret);
}
/************************************************************//**
Report an error on one page of an index tree. */
static
void
btr_validate_report1(
/*=================*/
	dict_index_t*		index,	/*!< in: index */
	ulint			level,	/*!< in: B-tree level */
	const buf_block_t*	block)	/*!< in: index page */
{
	fprintf(stderr, "InnoDB: Error in page %lu of ",
		buf_block_get_page_no(block));
	dict_index_name_print(stderr, NULL, index);
	if (level) {
		fprintf(stderr, ", index tree level %lu", level);
	}
	putc('\n', stderr);
}

/************************************************************//**
Report an error on two pages of an index tree. */
static
void
btr_validate_report2(
/*=================*/
	const dict_index_t*	index,	/*!< in: index */
	ulint			level,	/*!< in: B-tree level */
	const buf_block_t*	block1,	/*!< in: first index page */
	const buf_block_t*	block2)	/*!< in: second index page */
{
	fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
		buf_block_get_page_no(block1),
		buf_block_get_page_no(block2));
	dict_index_name_print(stderr, NULL, index);
	if (level) {
		fprintf(stderr, ", index tree level %lu", level);
	}
	putc('\n', stderr);
}
/************************************************************//**
Validates index tree level.
@return	TRUE if ok */
static
ibool
btr_validate_level(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	trx_t*		trx,	/*!< in: transaction or NULL */
	ulint		level)	/*!< in: level number */
{
	ulint		space;
	ulint		zip_size;
	buf_block_t*	block;
	page_t*		page;
	buf_block_t*	right_block = 0; /* remove warning */
	page_t*		right_page = 0; /* remove warning */
	page_t*		father_page;
	btr_cur_t	node_cur;
	btr_cur_t	right_node_cur;
	rec_t*		rec;
	ulint		right_page_no;
	ulint		left_page_no;
	page_cur_t	cursor;
	dtuple_t*	node_ptr_tuple;
	ibool		ret	= TRUE;
	mtr_t		mtr;
	mem_heap_t*	heap	= mem_heap_create(256);
	ulint*		offsets	= NULL;
	ulint*		offsets2= NULL;
#ifdef UNIV_ZIP_DEBUG
	page_zip_des_t*	page_zip;
#endif /* UNIV_ZIP_DEBUG */

	mtr_start(&mtr);

	mtr_x_lock(dict_index_get_lock(index), &mtr);

	block = btr_root_block_get(index, &mtr);
	page = buf_block_get_frame(block);

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	while (level != btr_page_get_level(page, &mtr)) {
		const rec_t*	node_ptr;

		ut_a(space == buf_block_get_space(block));
		ut_a(space == page_get_space_id(page));
#ifdef UNIV_ZIP_DEBUG
		page_zip = buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
		ut_a(!page_is_leaf(page));

		page_cur_set_before_first(block, &cursor);
		page_cur_move_to_next(&cursor);

		node_ptr = page_cur_get_rec(&cursor);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
		page = buf_block_get_frame(block);
	}

	/* Now we are on the desired level. Loop through the pages on that
	level. */
loop:
	if (trx_is_interrupted(trx)) {
		mtr_commit(&mtr);
		mem_heap_free(heap);
		return(ret);
	}
	mem_heap_empty(heap);
	offsets = offsets2 = NULL;
	mtr_x_lock(dict_index_get_lock(index), &mtr);

#ifdef UNIV_ZIP_DEBUG
	page_zip = buf_block_get_page_zip(block);
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	/* Check ordering etc. of records */

	if (!page_validate(page, index)) {
		btr_validate_report1(index, level, block);

		ret = FALSE;
	} else if (level == 0) {
		/* We are on level 0. Check that the records have the right
		number of fields, and field lengths are right. */

		if (!btr_index_page_validate(block, index)) {

			ret = FALSE;
		}
	}

	ut_a(btr_page_get_level(page, &mtr) == level);

	right_page_no = btr_page_get_next(page, &mtr);
	left_page_no = btr_page_get_prev(page, &mtr);

	ut_a(page_get_n_recs(page) > 0 || (level == 0
					   && page_get_page_no(page)
					   == dict_index_get_page(index)));

	if (right_page_no != FIL_NULL) {
		const rec_t*	right_rec;
		right_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, &mtr);
		right_page = buf_block_get_frame(right_block);
		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
				  != page_get_page_no(page))) {
			btr_validate_report2(index, level, block, right_block);
			fputs("InnoDB: broken FIL_PAGE_NEXT"
			      " or FIL_PAGE_PREV links\n", stderr);
			buf_page_print(page, 0);
			buf_page_print(right_page, 0);

			ret = FALSE;
		}

		if (UNIV_UNLIKELY(page_is_comp(right_page)
				  != page_is_comp(page))) {
			btr_validate_report2(index, level, block, right_block);
			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
			buf_page_print(page, 0);
			buf_page_print(right_page, 0);

			ret = FALSE;

			goto node_ptr_fails;
		}

		rec = page_rec_get_prev(page_get_supremum_rec(page));
		right_rec = page_rec_get_next(page_get_infimum_rec(
						      right_page));
		offsets = rec_get_offsets(rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		offsets2 = rec_get_offsets(right_rec, index,
					   offsets2, ULINT_UNDEFINED, &heap);
		if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
					      offsets, offsets2,
					      index) >= 0)) {

			btr_validate_report2(index, level, block, right_block);

			fputs("InnoDB: records in wrong order"
			      " on adjacent pages\n", stderr);

			buf_page_print(page, 0);
			buf_page_print(right_page, 0);

			fputs("InnoDB: record ", stderr);
			rec = page_rec_get_prev(page_get_supremum_rec(page));
			rec_print(stderr, rec, index);
			putc('\n', stderr);
			fputs("InnoDB: record ", stderr);
			rec = page_rec_get_next(
				page_get_infimum_rec(right_page));
			rec_print(stderr, rec, index);
			putc('\n', stderr);

			ret = FALSE;
		}
	}

	if (level > 0 && left_page_no == FIL_NULL) {
		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
			     page_rec_get_next(page_get_infimum_rec(page)),
			     page_is_comp(page)));
	}

	if (buf_block_get_page_no(block) != dict_index_get_page(index)) {

		/* Check father node pointers */

		rec_t*	node_ptr;

		offsets = btr_page_get_father_block(offsets, heap, index,
						    block, &mtr, &node_cur);
		father_page = btr_cur_get_page(&node_cur);
		node_ptr = btr_cur_get_rec(&node_cur);

		btr_cur_position(
			index, page_rec_get_prev(page_get_supremum_rec(page)),
			block, &node_cur);
		offsets = btr_page_get_father_node_ptr(offsets, heap,
						       &node_cur, &mtr);

		if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
		    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
								    offsets)
				     != buf_block_get_page_no(block))) {

			btr_validate_report1(index, level, block);

			fputs("InnoDB: node pointer to the page is wrong\n",
			      stderr);

			buf_page_print(father_page, 0);
			buf_page_print(page, 0);

			fputs("InnoDB: node ptr ", stderr);
			rec_print(stderr, node_ptr, index);

			rec = btr_cur_get_rec(&node_cur);
			fprintf(stderr, "\n"
				"InnoDB: node ptr child page n:o %lu\n",
				(ulong) btr_node_ptr_get_child_page_no(
					rec, offsets));

			fputs("InnoDB: record on page ", stderr);
			rec_print_new(stderr, rec, offsets);
			putc('\n', stderr);
			ret = FALSE;

			goto node_ptr_fails;
		}

		if (!page_is_leaf(page)) {
			node_ptr_tuple = dict_index_build_node_ptr(
				index,
				page_rec_get_next(page_get_infimum_rec(page)),
				0, heap, btr_page_get_level(page, &mtr));

			if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
					   offsets)) {
				const rec_t* first_rec = page_rec_get_next(
					page_get_infimum_rec(page));

				btr_validate_report1(index, level, block);

				buf_page_print(father_page, 0);
				buf_page_print(page, 0);

				fputs("InnoDB: Error: node ptrs differ"
				      " on levels > 0\n"
				      "InnoDB: node ptr ", stderr);
				rec_print_new(stderr, node_ptr, offsets);
				fputs("InnoDB: first rec ", stderr);
				rec_print(stderr, first_rec, index);
				putc('\n', stderr);
				ret = FALSE;

				goto node_ptr_fails;
			}
		}

		if (left_page_no == FIL_NULL) {
			ut_a(node_ptr == page_rec_get_next(
				     page_get_infimum_rec(father_page)));
			ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
		}

		if (right_page_no == FIL_NULL) {
			ut_a(node_ptr == page_rec_get_prev(
				     page_get_supremum_rec(father_page)));
			ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
		} else {
			const rec_t*	right_node_ptr
				= page_rec_get_next(node_ptr);

			offsets = btr_page_get_father_block(
				offsets, heap, index, right_block,
				&mtr, &right_node_cur);
			if (right_node_ptr
			    != page_get_supremum_rec(father_page)) {

				if (btr_cur_get_rec(&right_node_cur)
				    != right_node_ptr) {
					ret = FALSE;
					fputs("InnoDB: node pointer to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(father_page, 0);
					buf_page_print(page, 0);
					buf_page_print(right_page, 0);
				}
			} else {
				page_t*	right_father_page
					= btr_cur_get_page(&right_node_cur);

				if (btr_cur_get_rec(&right_node_cur)
				    != page_rec_get_next(
					    page_get_infimum_rec(
						    right_father_page))) {
					ret = FALSE;
					fputs("InnoDB: node pointer 2 to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(father_page, 0);
					buf_page_print(right_father_page, 0);
					buf_page_print(page, 0);
					buf_page_print(right_page, 0);
				}

				if (page_get_page_no(right_father_page)
				    != btr_page_get_next(father_page, &mtr)) {

					ret = FALSE;
					fputs("InnoDB: node pointer 3 to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(father_page, 0);
					buf_page_print(right_father_page, 0);
					buf_page_print(page, 0);
					buf_page_print(right_page, 0);
				}
			}
		}
	}

node_ptr_fails:
	/* Commit the mini-transaction to release the latch on 'page'.
	Re-acquire the latch on right_page, which will become 'page'
	on the next loop.  The page has already been checked. */
	mtr_commit(&mtr);

	if (right_page_no != FIL_NULL) {
		mtr_start(&mtr);

		block = btr_block_get(space, zip_size, right_page_no,
				      RW_X_LATCH, &mtr);
		page = buf_block_get_frame(block);

		goto loop;
	}

	mem_heap_free(heap);
	return(ret);
}
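/* The page walk above releases the mini-transaction at node_ptr_fails
and re-latches the right sibling, so that only one page latch is held
at a time.  The same left-to-right traversal as a self-contained loop,
a sketch with hypothetical parameters (not part of the original
source): */
#if 0
static
void
walk_level_sketch(
	ulint	space,		/* in: space id */
	ulint	zip_size,	/* in: compressed page size, or 0 */
	ulint	page_no)	/* in: leftmost page number on the level */
{
	while (page_no != FIL_NULL) {
		mtr_t		mtr;
		buf_block_t*	block;

		mtr_start(&mtr);
		block = btr_block_get(space, zip_size, page_no,
				      RW_X_LATCH, &mtr);
		/* ... validate buf_block_get_frame(block) here ... */
		page_no = btr_page_get_next(buf_block_get_frame(block),
					    &mtr);
		mtr_commit(&mtr);
	}
}
#endif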
/**************************************************************//**
Checks the consistency of an index tree.
@return	TRUE if ok */
UNIV_INTERN
ibool
btr_validate_index(
/*===============*/
	dict_index_t*	index,	/*!< in: index */
	trx_t*		trx)	/*!< in: transaction or NULL */
{
	mtr_t	mtr;
	page_t*	root;
	ulint	i;
	ulint	n;

	mtr_start(&mtr);
	mtr_x_lock(dict_index_get_lock(index), &mtr);

	root = btr_root_get(index, &mtr);
	n = btr_page_get_level(root, &mtr);

	for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
		if (!btr_validate_level(index, trx, n - i)) {

			mtr_commit(&mtr);

			return(FALSE);
		}
	}

	mtr_commit(&mtr);

	return(TRUE);
}
#endif /* !UNIV_HOTBACKUP */