1
/******************************************************
4
(c) 1994-1996 Innobase Oy
6
Created 6/2/1994 Heikki Tuuri
7
*******************************************************/
16
#include "page0page.h"
21
#include "lock0lock.h"
22
#include "ibuf0ibuf.h"
26
Latching strategy of the InnoDB B-tree
27
--------------------------------------
28
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
29
also has a latch of its own.
31
A B-tree operation normally first acquires an S-latch on the tree. It
32
searches down the tree and releases the tree latch when it has the
33
leaf node latch. To save CPU time we do not acquire any latch on
34
non-leaf nodes of the tree during a search, those pages are only bufferfixed.
36
If an operation needs to restructure the tree, it acquires an X-latch on
37
the tree before searching to a leaf node. If it needs, for example, to
39
(1) InnoDB decides the split point in the leaf,
40
(2) allocates a new page,
41
(3) inserts the appropriate node pointer to the first non-leaf level,
42
(4) releases the tree X-latch,
43
(5) and then moves records from the leaf to the new allocated page.
47
Leaf pages of a B-tree contain the index records stored in the
48
tree. On levels n > 0 we store 'node pointers' to pages on level
49
n - 1. For each page there is exactly one node pointer stored:
50
thus the our tree is an ordinary B-tree, not a B-link tree.
52
A node pointer contains a prefix P of an index record. The prefix
53
is long enough so that it determines an index record uniquely.
54
The file page number of the child page is added as the last
55
field. To the child page we can store node pointers or index records
56
which are >= P in the alphabetical order, but < P1 if there is
57
a next node pointer on the level, and P1 is its prefix.
59
If a node pointer with a prefix P points to a non-leaf child,
60
then the leftmost record in the child must have the same
61
prefix P. If it points to a leaf node, the child is not required
62
to contain any record with a prefix equal to P. The leaf case
63
is decided this way to allow arbitrary deletions in a leaf node
64
without touching upper levels of the tree.
66
We have predefined a special minimum record which we
67
define as the smallest record in any alphabetical order.
68
A minimum record is denoted by setting a bit in the record
69
header. A minimum record acts as the prefix of a node pointer
70
which points to a leftmost node on any level of the tree.
74
In the root node of a B-tree there are two file segment headers.
75
The leaf pages of a tree are allocated from one file segment, to
76
make them consecutive on disk if possible. From the other file segment
77
we allocate pages for the non-leaf levels of the tree.
80
/****************************************************************
81
Returns the upper level node pointer to a page. It is assumed that
82
mtr holds an x-latch on the tree. */
85
btr_page_get_father_node_ptr(
86
/*=========================*/
87
/* out: pointer to node pointer record */
88
dict_index_t* index, /* in: index tree */
89
page_t* page, /* in: page: must contain at least one
91
mtr_t* mtr); /* in: mtr */
92
/*****************************************************************
93
Empties an index page. */
98
page_t* page, /* in: page to be emptied */
99
mtr_t* mtr); /* in: mtr */
100
/*****************************************************************
101
Returns TRUE if the insert fits on the appropriate half-page
102
with the chosen split_rec. */
105
btr_page_insert_fits(
106
/*=================*/
107
/* out: TRUE if fits */
108
btr_cur_t* cursor, /* in: cursor at which insert
110
rec_t* split_rec, /* in: suggestion for first record
111
on upper half-page, or NULL if
112
tuple should be first */
113
const ulint* offsets, /* in: rec_get_offsets(
114
split_rec, cursor->index) */
115
dtuple_t* tuple, /* in: tuple to insert */
116
mem_heap_t* heap); /* in: temporary memory heap */
118
/******************************************************************
119
Gets the root node of a tree and x-latches it. */
124
/* out: root page, x-latched */
125
dict_index_t* index, /* in: index tree */
126
mtr_t* mtr) /* in: mtr */
132
space = dict_index_get_space(index);
133
root_page_no = dict_index_get_page(index);
135
root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr);
136
ut_a((ibool)!!page_is_comp(root) == dict_table_is_comp(index->table));
141
/*****************************************************************
142
Gets pointer to the previous user record in the tree. It is assumed that
143
the caller has appropriate latches on the page and its neighbor. */
146
btr_get_prev_user_rec(
147
/*==================*/
148
/* out: previous user record, NULL if there is none */
149
rec_t* rec, /* in: record on leaf level */
150
mtr_t* mtr) /* in: mtr holding a latch on the page, and if
151
needed, also to the previous page */
158
if (!page_rec_is_infimum(rec)) {
160
rec_t* prev_rec = page_rec_get_prev(rec);
162
if (!page_rec_is_infimum(prev_rec)) {
168
page = buf_frame_align(rec);
169
prev_page_no = btr_page_get_prev(page, mtr);
170
space = buf_frame_get_space_id(page);
172
if (prev_page_no != FIL_NULL) {
174
prev_page = buf_page_get_with_no_latch(space, prev_page_no,
176
/* The caller must already have a latch to the brother */
177
ut_ad((mtr_memo_contains(mtr, buf_block_align(prev_page),
178
MTR_MEMO_PAGE_S_FIX))
179
|| (mtr_memo_contains(mtr, buf_block_align(prev_page),
180
MTR_MEMO_PAGE_X_FIX)));
181
ut_a(page_is_comp(prev_page) == page_is_comp(page));
182
#ifdef UNIV_BTR_DEBUG
183
ut_a(btr_page_get_next(prev_page, mtr)
184
== buf_frame_get_page_no(page));
185
#endif /* UNIV_BTR_DEBUG */
187
return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
193
/*****************************************************************
194
Gets pointer to the next user record in the tree. It is assumed that the
195
caller has appropriate latches on the page and its neighbor. */
198
btr_get_next_user_rec(
199
/*==================*/
200
/* out: next user record, NULL if there is none */
201
rec_t* rec, /* in: record on leaf level */
202
mtr_t* mtr) /* in: mtr holding a latch on the page, and if
203
needed, also to the next page */
210
if (!page_rec_is_supremum(rec)) {
212
rec_t* next_rec = page_rec_get_next(rec);
214
if (!page_rec_is_supremum(next_rec)) {
220
page = buf_frame_align(rec);
221
next_page_no = btr_page_get_next(page, mtr);
222
space = buf_frame_get_space_id(page);
224
if (next_page_no != FIL_NULL) {
226
next_page = buf_page_get_with_no_latch(space, next_page_no,
228
/* The caller must already have a latch to the brother */
229
ut_ad((mtr_memo_contains(mtr, buf_block_align(next_page),
230
MTR_MEMO_PAGE_S_FIX))
231
|| (mtr_memo_contains(mtr, buf_block_align(next_page),
232
MTR_MEMO_PAGE_X_FIX)));
233
#ifdef UNIV_BTR_DEBUG
234
ut_a(btr_page_get_prev(next_page, mtr)
235
== buf_frame_get_page_no(page));
236
#endif /* UNIV_BTR_DEBUG */
238
ut_a(page_is_comp(next_page) == page_is_comp(page));
239
return(page_rec_get_next(page_get_infimum_rec(next_page)));
245
/******************************************************************
246
Creates a new index page (not the root, and also not
247
used in page reorganization). */
252
page_t* page, /* in: page to be created */
253
dict_index_t* index, /* in: index */
254
mtr_t* mtr) /* in: mtr */
256
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
257
MTR_MEMO_PAGE_X_FIX));
258
page_create(page, mtr, dict_table_is_comp(index->table));
259
buf_block_align(page)->check_index_page_at_flush = TRUE;
261
btr_page_set_index_id(page, index->id, mtr);
264
/******************************************************************
265
Allocates a new file page to be used in an ibuf tree. Takes the page from
266
the free list of the tree, which must contain pages! */
269
btr_page_alloc_for_ibuf(
270
/*====================*/
271
/* out: new allocated page, x-latched */
272
dict_index_t* index, /* in: index tree */
273
mtr_t* mtr) /* in: mtr */
275
fil_addr_t node_addr;
279
root = btr_root_get(index, mtr);
281
node_addr = flst_get_first(root + PAGE_HEADER
282
+ PAGE_BTR_IBUF_FREE_LIST, mtr);
283
ut_a(node_addr.page != FIL_NULL);
285
new_page = buf_page_get(dict_index_get_space(index), node_addr.page,
287
#ifdef UNIV_SYNC_DEBUG
288
buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
289
#endif /* UNIV_SYNC_DEBUG */
291
flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
292
new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
294
ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
300
/******************************************************************
301
Allocates a new file page to be used in an index tree. NOTE: we assume
302
that the caller has made the reservation for free extents! */
307
/* out: new allocated page, x-latched;
308
NULL if out of space */
309
dict_index_t* index, /* in: index */
310
ulint hint_page_no, /* in: hint of a good page */
311
byte file_direction, /* in: direction where a possible
312
page split is made */
313
ulint level, /* in: level where the page is placed
315
mtr_t* mtr) /* in: mtr */
317
fseg_header_t* seg_header;
322
if (index->type & DICT_IBUF) {
324
return(btr_page_alloc_for_ibuf(index, mtr));
327
root = btr_root_get(index, mtr);
330
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
332
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
335
/* Parameter TRUE below states that the caller has made the
336
reservation for free extents, and thus we know that a page can
339
new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
340
file_direction, TRUE, mtr);
341
if (new_page_no == FIL_NULL) {
346
new_page = buf_page_get(dict_index_get_space(index), new_page_no,
348
#ifdef UNIV_SYNC_DEBUG
349
buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
350
#endif /* UNIV_SYNC_DEBUG */
355
/******************************************************************
356
Gets the number of pages in a B-tree. */
361
/* out: number of pages */
362
dict_index_t* index, /* in: index */
363
ulint flag) /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
365
fseg_header_t* seg_header;
373
mtr_s_lock(dict_index_get_lock(index), &mtr);
375
root = btr_root_get(index, &mtr);
377
if (flag == BTR_N_LEAF_PAGES) {
378
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
380
fseg_n_reserved_pages(seg_header, &n, &mtr);
382
} else if (flag == BTR_TOTAL_SIZE) {
383
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
385
n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);
387
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
389
n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
399
/******************************************************************
400
Frees a page used in an ibuf tree. Puts the page to the free list of the
404
btr_page_free_for_ibuf(
405
/*===================*/
406
dict_index_t* index, /* in: index tree */
407
page_t* page, /* in: page to be freed, x-latched */
408
mtr_t* mtr) /* in: mtr */
412
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
413
MTR_MEMO_PAGE_X_FIX));
414
root = btr_root_get(index, mtr);
416
flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
417
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
419
ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
423
/******************************************************************
424
Frees a file page used in an index tree. Can be used also to (BLOB)
425
external storage pages, because the page level 0 can be given as an
431
dict_index_t* index, /* in: index tree */
432
page_t* page, /* in: page to be freed, x-latched */
433
ulint level, /* in: page level */
434
mtr_t* mtr) /* in: mtr */
436
fseg_header_t* seg_header;
441
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
442
MTR_MEMO_PAGE_X_FIX));
443
/* The page gets invalid for optimistic searches: increment the frame
446
buf_frame_modify_clock_inc(page);
448
if (index->type & DICT_IBUF) {
450
btr_page_free_for_ibuf(index, page, mtr);
455
root = btr_root_get(index, mtr);
458
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
460
seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
463
space = buf_frame_get_space_id(page);
464
page_no = buf_frame_get_page_no(page);
466
fseg_free_page(seg_header, space, page_no, mtr);
469
/******************************************************************
470
Frees a file page used in an index tree. NOTE: cannot free field external
471
storage pages because the page must contain info on its level. */
476
dict_index_t* index, /* in: index tree */
477
page_t* page, /* in: page to be freed, x-latched */
478
mtr_t* mtr) /* in: mtr */
482
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
483
MTR_MEMO_PAGE_X_FIX));
484
level = btr_page_get_level(page, mtr);
486
btr_page_free_low(index, page, level, mtr);
489
/******************************************************************
490
Sets the child node file address in a node pointer. */
493
btr_node_ptr_set_child_page_no(
494
/*===========================*/
495
rec_t* rec, /* in: node pointer record */
496
const ulint* offsets,/* in: array returned by rec_get_offsets() */
497
ulint page_no,/* in: child node address */
498
mtr_t* mtr) /* in: mtr */
503
ut_ad(rec_offs_validate(rec, NULL, offsets));
504
ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr));
505
ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
507
/* The child address is in the last field */
508
field = rec_get_nth_field(rec, offsets,
509
rec_offs_n_fields(offsets) - 1, &len);
513
mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
516
/****************************************************************
517
Returns the child page of a node pointer and x-latches it. */
520
btr_node_ptr_get_child(
521
/*===================*/
522
/* out: child page, x-latched */
523
rec_t* node_ptr,/* in: node pointer */
524
const ulint* offsets,/* in: array returned by rec_get_offsets() */
525
mtr_t* mtr) /* in: mtr */
531
ut_ad(rec_offs_validate(node_ptr, NULL, offsets));
532
space = buf_frame_get_space_id(node_ptr);
533
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
535
page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
540
/****************************************************************
541
Returns the upper level node pointer to a page. It is assumed that mtr holds
542
an x-latch on the tree. */
545
btr_page_get_father_for_rec(
546
/*========================*/
547
/* out: pointer to node pointer record,
548
its page x-latched */
549
dict_index_t* index, /* in: index tree */
550
page_t* page, /* in: page: must contain at least one
552
rec_t* user_rec,/* in: user_record on page */
553
mtr_t* mtr) /* in: mtr */
559
ulint offsets_[REC_OFFS_NORMAL_SIZE];
560
ulint* offsets = offsets_;
561
*offsets_ = (sizeof offsets_) / sizeof *offsets_;
563
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
565
ut_a(page_rec_is_user_rec(user_rec));
567
ut_ad(dict_index_get_page(index) != buf_frame_get_page_no(page));
569
heap = mem_heap_create(100);
571
tuple = dict_index_build_node_ptr(index, user_rec, 0, heap,
572
btr_page_get_level(page, mtr));
574
btr_cur_search_to_nth_level(index,
575
btr_page_get_level(page, mtr) + 1,
577
BTR_CONT_MODIFY_TREE, &cursor, 0, mtr);
579
node_ptr = btr_cur_get_rec(&cursor);
580
offsets = rec_get_offsets(node_ptr, index, offsets,
581
ULINT_UNDEFINED, &heap);
583
if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
584
!= buf_frame_get_page_no(page))) {
586
fputs("InnoDB: Dump of the child page:\n", stderr);
587
buf_page_print(buf_frame_align(page));
588
fputs("InnoDB: Dump of the parent page:\n", stderr);
589
buf_page_print(buf_frame_align(node_ptr));
591
fputs("InnoDB: Corruption of an index tree: table ", stderr);
592
ut_print_name(stderr, NULL, TRUE, index->table_name);
593
fputs(", index ", stderr);
594
ut_print_name(stderr, NULL, FALSE, index->name);
595
fprintf(stderr, ",\n"
596
"InnoDB: father ptr page no %lu, child page no %lu\n",
598
btr_node_ptr_get_child_page_no(node_ptr, offsets),
599
(ulong) buf_frame_get_page_no(page));
600
print_rec = page_rec_get_next(page_get_infimum_rec(page));
601
offsets = rec_get_offsets(print_rec, index,
602
offsets, ULINT_UNDEFINED, &heap);
603
page_rec_print(print_rec, offsets);
604
offsets = rec_get_offsets(node_ptr, index, offsets,
605
ULINT_UNDEFINED, &heap);
606
page_rec_print(node_ptr, offsets);
608
fputs("InnoDB: You should dump + drop + reimport the table"
610
"InnoDB: corruption. If the crash happens at "
611
"the database startup, see\n"
612
"InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
613
"forcing-recovery.html about\n"
614
"InnoDB: forcing recovery. "
615
"Then dump + drop + reimport.\n", stderr);
618
ut_a(btr_node_ptr_get_child_page_no(node_ptr, offsets)
619
== buf_frame_get_page_no(page));
625
/****************************************************************
626
Returns the upper level node pointer to a page. It is assumed that
627
mtr holds an x-latch on the tree. */
630
btr_page_get_father_node_ptr(
631
/*=========================*/
632
/* out: pointer to node pointer record */
633
dict_index_t* index, /* in: index tree */
634
page_t* page, /* in: page: must contain at least one
636
mtr_t* mtr) /* in: mtr */
638
return(btr_page_get_father_for_rec(
640
page_rec_get_next(page_get_infimum_rec(page)), mtr));
643
/****************************************************************
644
Creates the root node for a new index tree. */
649
/* out: page number of the created root, FIL_NULL if
651
ulint type, /* in: type of the index */
652
ulint space, /* in: space where created */
653
dulint index_id,/* in: index id */
654
ulint comp, /* in: nonzero=compact page format */
655
mtr_t* mtr) /* in: mini-transaction handle */
658
buf_frame_t* ibuf_hdr_frame;
662
/* Create the two new segments (one, in the case of an ibuf tree) for
663
the index tree; the segment headers are put on the allocated root page
664
(for an ibuf tree, not in the root, but on a separate ibuf header
667
if (type & DICT_IBUF) {
668
/* Allocate first the ibuf header page */
669
ibuf_hdr_frame = fseg_create(
670
space, 0, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
672
#ifdef UNIV_SYNC_DEBUG
673
buf_page_dbg_add_level(ibuf_hdr_frame, SYNC_TREE_NODE_NEW);
674
#endif /* UNIV_SYNC_DEBUG */
675
ut_ad(buf_frame_get_page_no(ibuf_hdr_frame)
676
== IBUF_HEADER_PAGE_NO);
677
/* Allocate then the next page to the segment: it will be the
680
page_no = fseg_alloc_free_page(ibuf_hdr_frame + IBUF_HEADER
681
+ IBUF_TREE_SEG_HEADER,
682
IBUF_TREE_ROOT_PAGE_NO,
684
ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
686
frame = buf_page_get(space, page_no, RW_X_LATCH, mtr);
688
frame = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP,
697
page_no = buf_frame_get_page_no(frame);
699
#ifdef UNIV_SYNC_DEBUG
700
buf_page_dbg_add_level(frame, SYNC_TREE_NODE_NEW);
701
#endif /* UNIV_SYNC_DEBUG */
703
if (type & DICT_IBUF) {
704
/* It is an insert buffer tree: initialize the free list */
706
ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
708
flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
710
/* It is a non-ibuf tree: create a file segment for leaf
712
fseg_create(space, page_no, PAGE_HEADER + PAGE_BTR_SEG_LEAF,
714
/* The fseg create acquires a second latch on the page,
715
therefore we must declare it: */
716
#ifdef UNIV_SYNC_DEBUG
717
buf_page_dbg_add_level(frame, SYNC_TREE_NODE_NEW);
718
#endif /* UNIV_SYNC_DEBUG */
721
/* Create a new index page on the the allocated segment page */
722
page = page_create(frame, mtr, comp);
723
buf_block_align(page)->check_index_page_at_flush = TRUE;
725
/* Set the index id of the page */
726
btr_page_set_index_id(page, index_id, mtr);
728
/* Set the level of the new index page */
729
btr_page_set_level(page, 0, mtr);
731
/* Set the next node and previous node fields */
732
btr_page_set_next(page, FIL_NULL, mtr);
733
btr_page_set_prev(page, FIL_NULL, mtr);
735
/* We reset the free bits for the page to allow creation of several
736
trees in the same mtr, otherwise the latch on a bitmap page would
737
prevent it because of the latching order */
739
ibuf_reset_free_bits_with_type(type, page);
741
/* In the following assertion we test that two records of maximum
742
allowed size fit on the root page: this fact is needed to ensure
743
correctness of split algorithms */
745
ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
750
/****************************************************************
751
Frees a B-tree except the root page, which MUST be freed after this
752
by calling btr_free_root. */
755
btr_free_but_not_root(
756
/*==================*/
757
ulint space, /* in: space where created */
758
ulint root_page_no) /* in: root page number */
767
root = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);
769
/* NOTE: page hash indexes are dropped when a page is freed inside
772
finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
783
root = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);
785
finished = fseg_free_step_not_header(
786
root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
795
/****************************************************************
796
Frees the B-tree root page. Other tree MUST already have been freed. */
801
ulint space, /* in: space where created */
802
ulint root_page_no, /* in: root page number */
803
mtr_t* mtr) /* in: a mini-transaction which has already
809
root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr);
811
btr_search_drop_page_hash_index(root);
813
finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
820
/*****************************************************************
821
Reorganizes an index page. */
824
btr_page_reorganize_low(
825
/*====================*/
826
ibool recovery,/* in: TRUE if called in recovery:
827
locks should not be updated, i.e.,
828
there cannot exist locks on the
829
page, and a hash index should not be
830
dropped: it cannot exist */
831
page_t* page, /* in: page to be reorganized */
832
dict_index_t* index, /* in: record descriptor */
833
mtr_t* mtr) /* in: mtr */
842
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
843
MTR_MEMO_PAGE_X_FIX));
844
ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
845
data_size1 = page_get_data_size(page);
846
max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
848
/* Write the log record */
849
mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
850
? MLOG_COMP_PAGE_REORGANIZE
851
: MLOG_PAGE_REORGANIZE, 0);
853
/* Turn logging off */
854
log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
856
new_page = buf_frame_alloc();
858
/* Copy the old page to temporary space */
859
buf_frame_copy(new_page, page);
862
btr_search_drop_page_hash_index(page);
865
/* Recreate the page: note that global data on page (possible
866
segment headers, next page-field, etc.) is preserved intact */
868
page_create(page, mtr, page_is_comp(page));
869
buf_block_align(page)->check_index_page_at_flush = TRUE;
871
/* Copy the records from the temporary space to the recreated page;
872
do not copy the lock bits yet */
874
page_copy_rec_list_end_no_locks(page, new_page,
875
page_get_infimum_rec(new_page),
877
/* Copy max trx id to recreated page */
878
page_set_max_trx_id(page, page_get_max_trx_id(new_page));
881
/* Update the record lock bitmaps */
882
lock_move_reorganize_page(page, new_page);
885
data_size2 = page_get_data_size(page);
886
max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
888
if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
889
buf_page_print(page);
890
buf_page_print(new_page);
892
"InnoDB: Error: page old data size %lu"
893
" new data size %lu\n"
894
"InnoDB: Error: page old max ins size %lu"
895
" new max ins size %lu\n"
896
"InnoDB: Submit a detailed bug report"
897
" to http://bugs.mysql.com\n",
898
(unsigned long) data_size1, (unsigned long) data_size2,
899
(unsigned long) max_ins_size1,
900
(unsigned long) max_ins_size2);
903
buf_frame_free(new_page);
905
/* Restore logging mode */
906
mtr_set_log_mode(mtr, log_mode);
909
/*****************************************************************
910
Reorganizes an index page. */
915
page_t* page, /* in: page to be reorganized */
916
dict_index_t* index, /* in: record descriptor */
917
mtr_t* mtr) /* in: mtr */
919
btr_page_reorganize_low(FALSE, page, index, mtr);
922
/***************************************************************
923
Parses a redo log record of reorganizing a page. */
926
btr_parse_page_reorganize(
927
/*======================*/
928
/* out: end of log record or NULL */
929
byte* ptr, /* in: buffer */
930
byte* end_ptr __attribute__((unused)),
932
dict_index_t* index, /* in: record descriptor */
933
page_t* page, /* in: page or NULL */
934
mtr_t* mtr) /* in: mtr or NULL */
936
ut_ad(ptr && end_ptr);
938
/* The record is empty, except for the record initial part */
941
btr_page_reorganize_low(TRUE, page, index, mtr);
947
/*****************************************************************
948
Empties an index page. */
953
page_t* page, /* in: page to be emptied */
954
mtr_t* mtr) /* in: mtr */
956
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
957
MTR_MEMO_PAGE_X_FIX));
958
btr_search_drop_page_hash_index(page);
960
/* Recreate the page: note that global data on page (possible
961
segment headers, next page-field, etc.) is preserved intact */
963
page_create(page, mtr, page_is_comp(page));
964
buf_block_align(page)->check_index_page_at_flush = TRUE;
967
/*****************************************************************
968
Makes tree one level higher by splitting the root, and inserts
969
the tuple. It is assumed that mtr contains an x-latch on the tree.
970
NOTE that the operation of this function must always succeed,
971
we cannot reverse it: therefore enough free disk space must be
972
guaranteed to be available before this function is called. */
975
btr_root_raise_and_insert(
976
/*======================*/
977
/* out: inserted record */
978
btr_cur_t* cursor, /* in: cursor at which to insert: must be
979
on the root page; when the function returns,
980
the cursor is positioned on the predecessor
981
of the inserted record */
982
dtuple_t* tuple, /* in: tuple to insert */
983
mtr_t* mtr) /* in: mtr */
994
page_cur_t* page_cursor;
996
root = btr_cur_get_page(cursor);
997
index = btr_cur_get_index(cursor);
999
ut_ad(dict_index_get_page(index) == buf_frame_get_page_no(root));
1000
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1002
ut_ad(mtr_memo_contains(mtr, buf_block_align(root),
1003
MTR_MEMO_PAGE_X_FIX));
1004
btr_search_drop_page_hash_index(root);
1006
/* Allocate a new page to the tree. Root splitting is done by first
1007
moving the root records to the new page, emptying the root, putting
1008
a node pointer to the new page, and then splitting the new page. */
1010
new_page = btr_page_alloc(index, 0, FSP_NO_DIR,
1011
btr_page_get_level(root, mtr), mtr);
1013
btr_page_create(new_page, index, mtr);
1015
level = btr_page_get_level(root, mtr);
1017
/* Set the levels of the new index page and root page */
1018
btr_page_set_level(new_page, level, mtr);
1019
btr_page_set_level(root, level + 1, mtr);
1021
/* Set the next node and previous node fields of new page */
1022
btr_page_set_next(new_page, FIL_NULL, mtr);
1023
btr_page_set_prev(new_page, FIL_NULL, mtr);
1025
/* Move the records from root to the new page */
1027
page_move_rec_list_end(new_page, root, page_get_infimum_rec(root),
1029
/* If this is a pessimistic insert which is actually done to
1030
perform a pessimistic update then we have stored the lock
1031
information of the record to be inserted on the infimum of the
1032
root page: we cannot discard the lock structs on the root page */
1034
lock_update_root_raise(new_page, root);
1036
/* Create a memory heap where the node pointer is stored */
1037
heap = mem_heap_create(100);
1039
rec = page_rec_get_next(page_get_infimum_rec(new_page));
1040
new_page_no = buf_frame_get_page_no(new_page);
1042
/* Build the node pointer (= node key and page address) for the
1045
node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
1047
/* Reorganize the root to get free space */
1048
btr_page_reorganize(root, index, mtr);
1050
page_cursor = btr_cur_get_page_cur(cursor);
1052
/* Insert node pointer to the root */
1054
page_cur_set_before_first(root, page_cursor);
1056
node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
1059
ut_ad(node_ptr_rec);
1061
/* The node pointer must be marked as the predefined minimum record,
1062
as there is no lower alphabetical limit to records in the leftmost
1065
btr_set_min_rec_mark(node_ptr_rec, page_is_comp(root), mtr);
1067
/* Free the memory heap */
1068
mem_heap_free(heap);
1070
/* We play safe and reset the free bits for the new page */
1073
fprintf(stderr, "Root raise new page no %lu\n",
1074
buf_frame_get_page_no(new_page));
1077
ibuf_reset_free_bits(index, new_page);
1078
/* Reposition the cursor to the child node */
1079
page_cur_search(new_page, index, tuple,
1080
PAGE_CUR_LE, page_cursor);
1082
/* Split the child and insert tuple */
1083
return(btr_page_split_and_insert(cursor, tuple, mtr));
1086
/*****************************************************************
1087
Decides if the page should be split at the convergence point of inserts
1088
converging to the left. */
1091
btr_page_get_split_rec_to_left(
1092
/*===========================*/
1093
/* out: TRUE if split recommended */
1094
btr_cur_t* cursor, /* in: cursor at which to insert */
1095
rec_t** split_rec) /* out: if split recommended,
1096
the first record on upper half page,
1097
or NULL if tuple to be inserted should
1101
rec_t* insert_point;
1104
page = btr_cur_get_page(cursor);
1105
insert_point = btr_cur_get_rec(cursor);
1107
if (page_header_get_ptr(page, PAGE_LAST_INSERT)
1108
== page_rec_get_next(insert_point)) {
1110
infimum = page_get_infimum_rec(page);
1112
/* If the convergence is in the middle of a page, include also
1113
the record immediately before the new insert to the upper
1114
page. Otherwise, we could repeatedly move from page to page
1115
lots of records smaller than the convergence point. */
1117
if (infimum != insert_point
1118
&& page_rec_get_next(infimum) != insert_point) {
1120
*split_rec = insert_point;
1122
*split_rec = page_rec_get_next(insert_point);
1131
/*****************************************************************
1132
Decides if the page should be split at the convergence point of inserts
1133
converging to the right. */
1136
btr_page_get_split_rec_to_right(
1137
/*============================*/
1138
/* out: TRUE if split recommended */
1139
btr_cur_t* cursor, /* in: cursor at which to insert */
1140
rec_t** split_rec) /* out: if split recommended,
1141
the first record on upper half page,
1142
or NULL if tuple to be inserted should
1146
rec_t* insert_point;
1148
page = btr_cur_get_page(cursor);
1149
insert_point = btr_cur_get_rec(cursor);
1151
/* We use eager heuristics: if the new insert would be right after
1152
the previous insert on the same page, we assume that there is a
1153
pattern of sequential inserts here. */
1155
if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
1160
next_rec = page_rec_get_next(insert_point);
1162
if (page_rec_is_supremum(next_rec)) {
1164
/* Split at the new record to insert */
1167
rec_t* next_next_rec = page_rec_get_next(next_rec);
1168
if (page_rec_is_supremum(next_next_rec)) {
1173
/* If there are >= 2 user records up from the insert
1174
point, split all but 1 off. We want to keep one because
1175
then sequential inserts can use the adaptive hash
1176
index, as they can do the necessary checks of the right
1177
search position just by looking at the records on this
1180
*split_rec = next_next_rec;
1189
/*****************************************************************
1190
Calculates a split record such that the tuple will certainly fit on
1191
its half-page when the split is performed. We assume in this function
1192
only that the cursor page has at least one user record. */
1195
btr_page_get_sure_split_rec(
1196
/*========================*/
1197
/* out: split record, or NULL if
1198
tuple will be the first record on
1200
btr_cur_t* cursor, /* in: cursor at which insert
1202
dtuple_t* tuple) /* in: tuple to insert */
1218
page = btr_cur_get_page(cursor);
1220
insert_size = rec_get_converted_size(cursor->index, tuple);
1221
free_space = page_get_free_space_of_empty(page_is_comp(page));
1223
/* free_space is now the free space of a created new page */
1225
total_data = page_get_data_size(page) + insert_size;
1226
total_n_recs = page_get_n_recs(page) + 1;
1227
ut_ad(total_n_recs >= 2);
1228
total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
1232
ins_rec = btr_cur_get_rec(cursor);
1233
rec = page_get_infimum_rec(page);
1238
/* We start to include records to the left half, and when the
1239
space reserved by them exceeds half of total_space, then if
1240
the included records fit on the left page, they will be put there
1241
if something was left over also for the right page,
1242
otherwise the last included record will be the first on the right
1246
/* Decide the next record to include */
1247
if (rec == ins_rec) {
1248
rec = NULL; /* NULL denotes that tuple is
1250
} else if (rec == NULL) {
1251
rec = page_rec_get_next(ins_rec);
1253
rec = page_rec_get_next(rec);
1258
incl_data += insert_size;
1260
offsets = rec_get_offsets(rec, cursor->index,
1261
offsets, ULINT_UNDEFINED,
1263
incl_data += rec_offs_size(offsets);
1268
if (incl_data + page_dir_calc_reserved_space(n)
1269
>= total_space / 2) {
1271
if (incl_data + page_dir_calc_reserved_space(n)
1273
/* The next record will be the first on
1274
the right half page if it is not the
1275
supremum record of page */
1277
if (rec == ins_rec) {
1281
} else if (rec == NULL) {
1282
next_rec = page_rec_get_next(ins_rec);
1284
next_rec = page_rec_get_next(rec);
1287
if (!page_rec_is_supremum(next_rec)) {
1293
if (UNIV_LIKELY_NULL(heap)) {
1294
mem_heap_free(heap);
1301
/*****************************************************************
1302
Returns TRUE if the insert fits on the appropriate half-page with the
1303
chosen split_rec. */
1306
btr_page_insert_fits(
1307
/*=================*/
1308
/* out: TRUE if fits */
1309
btr_cur_t* cursor, /* in: cursor at which insert
1311
rec_t* split_rec, /* in: suggestion for first record
1312
on upper half-page, or NULL if
1313
tuple to be inserted should be first */
1314
const ulint* offsets, /* in: rec_get_offsets(
1315
split_rec, cursor->index) */
1316
dtuple_t* tuple, /* in: tuple to insert */
1317
mem_heap_t* heap) /* in: temporary memory heap */
1328
page = btr_cur_get_page(cursor);
1330
ut_ad(!split_rec == !offsets);
1332
|| !page_is_comp(page) == !rec_offs_comp(offsets));
1334
|| rec_offs_validate(split_rec, cursor->index, offsets));
1336
insert_size = rec_get_converted_size(cursor->index, tuple);
1337
free_space = page_get_free_space_of_empty(page_is_comp(page));
1339
/* free_space is now the free space of a created new page */
1341
total_data = page_get_data_size(page) + insert_size;
1342
total_n_recs = page_get_n_recs(page) + 1;
1344
/* We determine which records (from rec to end_rec, not including
1345
end_rec) will end up on the other half page from tuple when it is
1348
if (split_rec == NULL) {
1349
rec = page_rec_get_next(page_get_infimum_rec(page));
1350
end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
1352
} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
1354
rec = page_rec_get_next(page_get_infimum_rec(page));
1355
end_rec = split_rec;
1358
end_rec = page_get_supremum_rec(page);
1361
if (total_data + page_dir_calc_reserved_space(total_n_recs)
1364
/* Ok, there will be enough available space on the
1365
half page where the tuple is inserted */
1372
while (rec != end_rec) {
1373
/* In this loop we calculate the amount of reserved
1374
space after rec is removed from page. */
1376
offs = rec_get_offsets(rec, cursor->index, offs,
1377
ULINT_UNDEFINED, &heap);
1379
total_data -= rec_offs_size(offs);
1382
if (total_data + page_dir_calc_reserved_space(total_n_recs)
1385
/* Ok, there will be enough available space on the
1386
half page where the tuple is inserted */
1391
rec = page_rec_get_next(rec);
1397
/***********************************************************
1398
Inserts a data tuple to a tree on a non-leaf level. It is assumed
1399
that mtr holds an x-latch on the tree. */
1402
btr_insert_on_non_leaf_level(
1403
/*=========================*/
1404
dict_index_t* index, /* in: index */
1405
ulint level, /* in: level, must be > 0 */
1406
dtuple_t* tuple, /* in: the record to be inserted */
1407
mtr_t* mtr) /* in: mtr */
1409
big_rec_t* dummy_big_rec;
1416
btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
1417
BTR_CONT_MODIFY_TREE,
1420
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
1422
| BTR_NO_UNDO_LOG_FLAG,
1423
&cursor, tuple, &rec,
1424
&dummy_big_rec, NULL, mtr);
1425
ut_a(err == DB_SUCCESS);
1428
/******************************************************************
1429
Attaches the halves of an index page on the appropriate level in an
1433
btr_attach_half_pages(
1434
/*==================*/
1435
dict_index_t* index, /* in: the index tree */
1436
page_t* page, /* in: page to be split */
1437
rec_t* split_rec, /* in: first record on upper
1439
page_t* new_page, /* in: the new half page */
1440
ulint direction, /* in: FSP_UP or FSP_DOWN */
1441
mtr_t* mtr) /* in: mtr */
1452
ulint lower_page_no;
1453
ulint upper_page_no;
1454
dtuple_t* node_ptr_upper;
1457
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1458
MTR_MEMO_PAGE_X_FIX));
1459
ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page),
1460
MTR_MEMO_PAGE_X_FIX));
1461
ut_a(page_is_comp(page) == page_is_comp(new_page));
1463
/* Create a memory heap where the data tuple is stored */
1464
heap = mem_heap_create(1024);
1466
/* Based on split direction, decide upper and lower pages */
1467
if (direction == FSP_DOWN) {
1469
lower_page_no = buf_frame_get_page_no(new_page);
1470
upper_page_no = buf_frame_get_page_no(page);
1471
lower_page = new_page;
1474
/* Look up the index for the node pointer to page */
1475
node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
1477
/* Replace the address of the old child node (= page) with the
1478
address of the new lower half */
1480
btr_node_ptr_set_child_page_no(node_ptr,
1483
NULL, ULINT_UNDEFINED,
1485
lower_page_no, mtr);
1486
mem_heap_empty(heap);
1488
lower_page_no = buf_frame_get_page_no(page);
1489
upper_page_no = buf_frame_get_page_no(new_page);
1491
upper_page = new_page;
1494
/* Get the level of the split pages */
1495
level = btr_page_get_level(page, mtr);
1497
/* Build the node pointer (= node key and page address) for the upper
1500
node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
1501
upper_page_no, heap, level);
1503
/* Insert it next to the pointer to the lower half. Note that this
1504
may generate recursion leading to a split on the higher level. */
1506
btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
1508
/* Free the memory heap */
1509
mem_heap_free(heap);
1511
/* Get the previous and next pages of page */
1513
prev_page_no = btr_page_get_prev(page, mtr);
1514
next_page_no = btr_page_get_next(page, mtr);
1515
space = buf_frame_get_space_id(page);
1517
/* Update page links of the level */
1519
if (prev_page_no != FIL_NULL) {
1521
prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr);
1522
ut_a(page_is_comp(prev_page) == page_is_comp(page));
1523
#ifdef UNIV_BTR_DEBUG
1524
ut_a(btr_page_get_next(prev_page, mtr)
1525
== buf_frame_get_page_no(page));
1526
#endif /* UNIV_BTR_DEBUG */
1528
btr_page_set_next(prev_page, lower_page_no, mtr);
1531
if (next_page_no != FIL_NULL) {
1533
next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr);
1534
ut_a(page_is_comp(next_page) == page_is_comp(page));
1536
btr_page_set_prev(next_page, upper_page_no, mtr);
1539
btr_page_set_prev(lower_page, prev_page_no, mtr);
1540
btr_page_set_next(lower_page, upper_page_no, mtr);
1541
btr_page_set_level(lower_page, level, mtr);
1543
btr_page_set_prev(upper_page, lower_page_no, mtr);
1544
btr_page_set_next(upper_page, next_page_no, mtr);
1545
btr_page_set_level(upper_page, level, mtr);
1548
/*****************************************************************
1549
Splits an index page to halves and inserts the tuple. It is assumed
1550
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch
1551
is released within this function! NOTE that the operation of this
1552
function must always succeed, we cannot reverse it: therefore
1553
enough free disk space must be guaranteed to be available before
1554
this function is called. */
1557
btr_page_split_and_insert(
1558
/*======================*/
1559
/* out: inserted record; NOTE: the tree
1560
x-latch is released! NOTE: 2 free disk
1561
pages must be available! */
1562
btr_cur_t* cursor, /* in: cursor at which to insert; when the
1563
function returns, the cursor is positioned
1564
on the predecessor of the inserted record */
1565
dtuple_t* tuple, /* in: tuple to insert */
1566
mtr_t* mtr) /* in: mtr */
1576
page_t* insert_page;
1577
page_cur_t* page_cursor;
1579
byte* buf = 0; /* remove warning */
1581
ibool insert_will_fit;
1582
ulint n_iterations = 0;
1588
heap = mem_heap_create(1024);
1589
n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
1591
mem_heap_empty(heap);
1594
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
1596
#ifdef UNIV_SYNC_DEBUG
1597
ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
1598
#endif /* UNIV_SYNC_DEBUG */
1600
page = btr_cur_get_page(cursor);
1602
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1603
MTR_MEMO_PAGE_X_FIX));
1604
ut_ad(page_get_n_recs(page) >= 2);
1606
page_no = buf_frame_get_page_no(page);
1608
/* 1. Decide the split record; split_rec == NULL means that the
1609
tuple to be inserted should be the first record on the upper
1612
if (n_iterations > 0) {
1614
hint_page_no = page_no + 1;
1615
split_rec = btr_page_get_sure_split_rec(cursor, tuple);
1617
} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
1619
hint_page_no = page_no + 1;
1621
} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
1622
direction = FSP_DOWN;
1623
hint_page_no = page_no - 1;
1626
hint_page_no = page_no + 1;
1627
split_rec = page_get_middle_rec(page);
1630
/* 2. Allocate a new page to the index */
1631
new_page = btr_page_alloc(cursor->index, hint_page_no, direction,
1632
btr_page_get_level(page, mtr), mtr);
1633
btr_page_create(new_page, cursor->index, mtr);
1635
/* 3. Calculate the first record on the upper half-page, and the
1636
first record (move_limit) on original page which ends up on the
1639
if (split_rec != NULL) {
1640
first_rec = split_rec;
1641
move_limit = split_rec;
1643
buf = mem_alloc(rec_get_converted_size(cursor->index, tuple));
1645
first_rec = rec_convert_dtuple_to_rec(buf,
1646
cursor->index, tuple);
1647
move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
1650
/* 4. Do first the modifications in the tree structure */
1652
btr_attach_half_pages(cursor->index, page, first_rec,
1653
new_page, direction, mtr);
1655
if (split_rec == NULL) {
1659
/* If the split is made on the leaf level and the insert will fit
1660
on the appropriate half-page, we may release the tree x-latch.
1661
We can then move the records after releasing the tree latch,
1662
thus reducing the tree latch contention. */
1665
offsets = rec_get_offsets(split_rec, cursor->index, offsets,
1668
insert_will_fit = btr_page_insert_fits(cursor,
1672
insert_will_fit = btr_page_insert_fits(cursor,
1677
if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) {
1679
mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
1683
/* 5. Move then the records to the new page */
1684
if (direction == FSP_DOWN) {
1685
/* fputs("Split left\n", stderr); */
1687
page_move_rec_list_start(new_page, page, move_limit,
1688
cursor->index, mtr);
1689
left_page = new_page;
1692
lock_update_split_left(right_page, left_page);
1694
/* fputs("Split right\n", stderr); */
1696
page_move_rec_list_end(new_page, page, move_limit,
1697
cursor->index, mtr);
1699
right_page = new_page;
1701
lock_update_split_right(right_page, left_page);
1704
/* 6. The split and the tree modification is now completed. Decide the
1705
page where the tuple should be inserted */
1707
if (split_rec == NULL) {
1708
insert_page = right_page;
1711
offsets = rec_get_offsets(first_rec, cursor->index,
1712
offsets, n_uniq, &heap);
1714
if (cmp_dtuple_rec(tuple, first_rec, offsets) >= 0) {
1716
insert_page = right_page;
1718
insert_page = left_page;
1722
/* 7. Reposition the cursor for insert and try insertion */
1723
page_cursor = btr_cur_get_page_cur(cursor);
1725
page_cur_search(insert_page, cursor->index, tuple,
1726
PAGE_CUR_LE, page_cursor);
1728
rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr);
1731
/* Insert fit on the page: update the free bits for the
1732
left and right pages in the same mtr */
1734
ibuf_update_free_bits_for_two_pages_low(cursor->index,
1737
/* fprintf(stderr, "Split and insert done %lu %lu\n",
1738
buf_frame_get_page_no(left_page),
1739
buf_frame_get_page_no(right_page)); */
1740
mem_heap_free(heap);
1744
/* 8. If insert did not fit, try page reorganization */
1746
btr_page_reorganize(insert_page, cursor->index, mtr);
1748
page_cur_search(insert_page, cursor->index, tuple,
1749
PAGE_CUR_LE, page_cursor);
1750
rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr);
1753
/* The insert did not fit on the page: loop back to the
1754
start of the function for a new split */
1756
/* We play safe and reset the free bits for new_page */
1757
ibuf_reset_free_bits(cursor->index, new_page);
1759
/* fprintf(stderr, "Split second round %lu\n",
1760
buf_frame_get_page_no(page)); */
1762
ut_ad(n_iterations < 2);
1763
ut_ad(!insert_will_fit);
1768
/* Insert fit on the page: update the free bits for the
1769
left and right pages in the same mtr */
1771
ibuf_update_free_bits_for_two_pages_low(cursor->index, left_page,
1774
fprintf(stderr, "Split and insert done %lu %lu\n",
1775
buf_frame_get_page_no(left_page),
1776
buf_frame_get_page_no(right_page));
1779
ut_ad(page_validate(left_page, cursor->index));
1780
ut_ad(page_validate(right_page, cursor->index));
1782
mem_heap_free(heap);
1786
/*****************************************************************
1787
Removes a page from the level list of pages. */
1790
btr_level_list_remove(
1791
/*==================*/
1792
page_t* page, /* in: page to remove */
1793
mtr_t* mtr) /* in: mtr */
1802
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1803
MTR_MEMO_PAGE_X_FIX));
1804
/* Get the previous and next page numbers of page */
1806
prev_page_no = btr_page_get_prev(page, mtr);
1807
next_page_no = btr_page_get_next(page, mtr);
1808
space = buf_frame_get_space_id(page);
1810
/* Update page links of the level */
1812
if (prev_page_no != FIL_NULL) {
1814
prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr);
1815
ut_a(page_is_comp(prev_page) == page_is_comp(page));
1816
#ifdef UNIV_BTR_DEBUG
1817
ut_a(btr_page_get_next(prev_page, mtr)
1818
== buf_frame_get_page_no(page));
1819
#endif /* UNIV_BTR_DEBUG */
1821
btr_page_set_next(prev_page, next_page_no, mtr);
1824
if (next_page_no != FIL_NULL) {
1826
next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr);
1827
ut_a(page_is_comp(next_page) == page_is_comp(page));
1828
#ifdef UNIV_BTR_DEBUG
1829
ut_a(btr_page_get_prev(next_page, mtr)
1830
== buf_frame_get_page_no(page));
1831
#endif /* UNIV_BTR_DEBUG */
1833
btr_page_set_prev(next_page, prev_page_no, mtr);
1837
/********************************************************************
1838
Writes the redo log record for setting an index record as the predefined
1842
btr_set_min_rec_mark_log(
1843
/*=====================*/
1844
rec_t* rec, /* in: record */
1845
ulint comp, /* nonzero=compact record format */
1846
mtr_t* mtr) /* in: mtr */
1848
mlog_write_initial_log_record(
1849
rec, comp ? MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr);
1851
/* Write rec offset as a 2-byte ulint */
1852
mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
1855
/********************************************************************
1856
Parses the redo log record for setting an index record as the predefined
1860
btr_parse_set_min_rec_mark(
1861
/*=======================*/
1862
/* out: end of log record or NULL */
1863
byte* ptr, /* in: buffer */
1864
byte* end_ptr,/* in: buffer end */
1865
ulint comp, /* in: nonzero=compact page format */
1866
page_t* page, /* in: page or NULL */
1867
mtr_t* mtr) /* in: mtr or NULL */
1871
if (end_ptr < ptr + 2) {
1877
ut_a(!page_is_comp(page) == !comp);
1879
rec = page + mach_read_from_2(ptr);
1881
btr_set_min_rec_mark(rec, comp, mtr);
1887
/********************************************************************
1888
Sets a record as the predefined minimum record. */
1891
btr_set_min_rec_mark(
1892
/*=================*/
1893
rec_t* rec, /* in: record */
1894
ulint comp, /* in: nonzero=compact page format */
1895
mtr_t* mtr) /* in: mtr */
1899
info_bits = rec_get_info_bits(rec, comp);
1901
rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG);
1903
btr_set_min_rec_mark_log(rec, comp, mtr);
1906
/*****************************************************************
1907
Deletes on the upper level the node pointer to a page. */
1910
btr_node_ptr_delete(
1911
/*================*/
1912
dict_index_t* index, /* in: index tree */
1913
page_t* page, /* in: page whose node pointer is deleted */
1914
mtr_t* mtr) /* in: mtr */
1921
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1922
MTR_MEMO_PAGE_X_FIX));
1923
/* Delete node pointer on father page */
1925
node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
1927
btr_cur_position(index, node_ptr, &cursor);
1928
compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, FALSE,
1930
ut_a(err == DB_SUCCESS);
1933
btr_cur_compress_if_useful(&cursor, mtr);
1937
/*****************************************************************
1938
If page is the only on its level, this function moves its records to the
1939
father page, thus reducing the tree height. */
1944
dict_index_t* index, /* in: index tree */
1945
page_t* page, /* in: page which is the only on its level;
1946
must not be empty: use
1947
btr_discard_only_page_on_level if the last
1948
record from the page should be removed */
1949
mtr_t* mtr) /* in: mtr */
1951
page_t* father_page;
1953
page_t* pages[BTR_MAX_LEVELS];
1959
ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
1960
ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
1961
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1962
MTR_MEMO_PAGE_X_FIX));
1963
father_page = buf_frame_align(
1964
btr_page_get_father_node_ptr(index, page, mtr));
1966
page_level = btr_page_get_level(page, mtr);
1967
root_page_no = dict_index_get_page(index);
1970
pages[0] = father_page;
1972
/* Store all ancestor pages so we can reset their levels later on.
1973
We have to do all the searches on the tree now because later on,
1974
after we've replaced the first level, the tree is in an inconsistent
1975
state and can not be searched. */
1976
iter_page = father_page;
1978
if (buf_block_get_page_no(buf_block_align(iter_page))
1984
ut_a(ancestors < BTR_MAX_LEVELS);
1986
iter_page = buf_frame_align(
1987
btr_page_get_father_node_ptr(index, iter_page, mtr));
1989
pages[ancestors++] = iter_page;
1992
btr_search_drop_page_hash_index(page);
1994
/* Make the father empty */
1995
btr_page_empty(father_page, mtr);
1997
/* Move records to the father */
1998
page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page),
2000
lock_update_copy_and_discard(father_page, page);
2002
/* Go upward to root page, decreasing levels by one. */
2003
for (i = 0; i < ancestors; i++) {
2004
iter_page = pages[i];
2006
ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1));
2008
btr_page_set_level(iter_page, page_level, mtr);
2012
/* Free the file page */
2013
btr_page_free(index, page, mtr);
2015
/* We play safe and reset the free bits for the father */
2016
ibuf_reset_free_bits(index, father_page);
2017
ut_ad(page_validate(father_page, index));
2018
ut_ad(btr_check_node_ptr(index, father_page, mtr));
2021
/*****************************************************************
2022
Tries to merge the page first to the left immediate brother if such a
2023
brother exists, and the node pointers to the current page and to the brother
2024
reside on the same page. If the left brother does not satisfy these
2025
conditions, looks at the right brother. If the page is the only one on that
2026
level lifts the records of the page to the father page, thus reducing the
2027
tree height. It is assumed that mtr holds an x-latch on the tree and on the
2028
page. If cursor is on the leaf level, mtr must also hold x-latches to the
2029
brothers, if they exist. NOTE: it is assumed that the caller has reserved
2030
enough free extents so that the compression will always succeed if done! */
2035
btr_cur_t* cursor, /* in: cursor on the page to merge or lift;
2036
the page must not be empty: in record delete
2037
use btr_discard_page if the page would become
2039
mtr_t* mtr) /* in: mtr */
2041
dict_index_t* index;
2044
ulint right_page_no;
2046
page_t* father_page;
2055
ulint max_ins_size_reorg;
2059
page = btr_cur_get_page(cursor);
2060
index = btr_cur_get_index(cursor);
2061
comp = page_is_comp(page);
2062
ut_a((ibool)!!comp == dict_table_is_comp(index->table));
2064
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2066
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2067
MTR_MEMO_PAGE_X_FIX));
2068
level = btr_page_get_level(page, mtr);
2069
space = dict_index_get_space(index);
2071
left_page_no = btr_page_get_prev(page, mtr);
2072
right_page_no = btr_page_get_next(page, mtr);
2075
fprintf(stderr, "Merge left page %lu right %lu \n",
2076
left_page_no, right_page_no);
2079
node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
2080
ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
2081
father_page = buf_frame_align(node_ptr);
2082
ut_a(comp == page_is_comp(father_page));
2084
/* Decide the page to which we try to merge and which will inherit
2087
is_left = left_page_no != FIL_NULL;
2091
merge_page = btr_page_get(space, left_page_no, RW_X_LATCH,
2093
#ifdef UNIV_BTR_DEBUG
2094
ut_a(btr_page_get_next(merge_page, mtr)
2095
== buf_frame_get_page_no(page));
2096
#endif /* UNIV_BTR_DEBUG */
2097
} else if (right_page_no != FIL_NULL) {
2099
merge_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2101
#ifdef UNIV_BTR_DEBUG
2102
ut_a(btr_page_get_prev(merge_page, mtr)
2103
== buf_frame_get_page_no(page));
2104
#endif /* UNIV_BTR_DEBUG */
2106
/* The page is the only one on the level, lift the records
2108
btr_lift_page_up(index, page, mtr);
2113
n_recs = page_get_n_recs(page);
2114
data_size = page_get_data_size(page);
2115
ut_a(page_is_comp(merge_page) == comp);
2117
max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
2118
merge_page, n_recs);
2119
if (data_size > max_ins_size_reorg) {
2121
/* No space for merge */
2126
ut_ad(page_validate(merge_page, index));
2128
max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2130
if (data_size > max_ins_size) {
2132
/* We have to reorganize merge_page */
2134
btr_page_reorganize(merge_page, index, mtr);
2136
max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2138
ut_ad(page_validate(merge_page, index));
2139
ut_ad(page_get_max_insert_size(merge_page, n_recs)
2140
== max_ins_size_reorg);
2143
if (data_size > max_ins_size) {
2145
/* Add fault tolerance, though this should never happen */
2150
btr_search_drop_page_hash_index(page);
2152
/* Remove the page from the level list */
2153
btr_level_list_remove(page, mtr);
2156
btr_node_ptr_delete(index, page, mtr);
2158
mem_heap_t* heap = NULL;
2159
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2160
*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2161
/* Replace the address of the old child node (= page) with the
2162
address of the merge page to the right */
2164
btr_node_ptr_set_child_page_no(node_ptr,
2170
right_page_no, mtr);
2171
if (UNIV_LIKELY_NULL(heap)) {
2172
mem_heap_free(heap);
2174
btr_node_ptr_delete(index, merge_page, mtr);
2177
/* Move records to the merge page */
2179
orig_pred = page_rec_get_prev(
2180
page_get_supremum_rec(merge_page));
2181
page_copy_rec_list_start(merge_page, page,
2182
page_get_supremum_rec(page),
2185
lock_update_merge_left(merge_page, orig_pred, page);
2187
orig_succ = page_rec_get_next(
2188
page_get_infimum_rec(merge_page));
2189
page_copy_rec_list_end(merge_page, page,
2190
page_get_infimum_rec(page),
2193
lock_update_merge_right(orig_succ, page);
2196
/* We have added new records to merge_page: update its free bits */
2197
ibuf_update_free_bits_if_full(index, merge_page,
2198
UNIV_PAGE_SIZE, ULINT_UNDEFINED);
2200
ut_ad(page_validate(merge_page, index));
2202
/* Free the file page */
2203
btr_page_free(index, page, mtr);
2205
ut_ad(btr_check_node_ptr(index, merge_page, mtr));
2208
/*****************************************************************
2209
Discards a page that is the only page on its level. */
2212
btr_discard_only_page_on_level(
2213
/*===========================*/
2214
dict_index_t* index, /* in: index tree */
2215
page_t* page, /* in: page which is the only on its level */
2216
mtr_t* mtr) /* in: mtr */
2219
page_t* father_page;
2222
ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
2223
ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
2224
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2225
MTR_MEMO_PAGE_X_FIX));
2226
btr_search_drop_page_hash_index(page);
2228
node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
2229
father_page = buf_frame_align(node_ptr);
2231
page_level = btr_page_get_level(page, mtr);
2233
lock_update_discard(page_get_supremum_rec(father_page), page);
2235
btr_page_set_level(father_page, page_level, mtr);
2237
/* Free the file page */
2238
btr_page_free(index, page, mtr);
2240
if (buf_frame_get_page_no(father_page) == dict_index_get_page(index)) {
2241
/* The father is the root page */
2243
btr_page_empty(father_page, mtr);
2245
/* We play safe and reset the free bits for the father */
2246
ibuf_reset_free_bits(index, father_page);
2248
ut_ad(page_get_n_recs(father_page) == 1);
2250
btr_discard_only_page_on_level(index, father_page, mtr);
2254
/*****************************************************************
2255
Discards a page from a B-tree. This is used to remove the last record from
2256
a B-tree page: the whole page must be removed at the same time. This cannot
2257
be used for the root page, which is allowed to be empty. */
2262
btr_cur_t* cursor, /* in: cursor on the page to discard: not on
2264
mtr_t* mtr) /* in: mtr */
2266
dict_index_t* index;
2269
ulint right_page_no;
2274
page = btr_cur_get_page(cursor);
2275
index = btr_cur_get_index(cursor);
2277
ut_ad(dict_index_get_page(index) != buf_frame_get_page_no(page));
2278
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2280
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2281
MTR_MEMO_PAGE_X_FIX));
2282
space = dict_index_get_space(index);
2284
/* Decide the page which will inherit the locks */
2286
left_page_no = btr_page_get_prev(page, mtr);
2287
right_page_no = btr_page_get_next(page, mtr);
2289
if (left_page_no != FIL_NULL) {
2290
merge_page = btr_page_get(space, left_page_no, RW_X_LATCH,
2292
#ifdef UNIV_BTR_DEBUG
2293
ut_a(btr_page_get_next(merge_page, mtr)
2294
== buf_frame_get_page_no(page));
2295
#endif /* UNIV_BTR_DEBUG */
2296
} else if (right_page_no != FIL_NULL) {
2297
merge_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2299
#ifdef UNIV_BTR_DEBUG
2300
ut_a(btr_page_get_prev(merge_page, mtr)
2301
== buf_frame_get_page_no(page));
2302
#endif /* UNIV_BTR_DEBUG */
2304
btr_discard_only_page_on_level(index, page, mtr);
2309
ut_a(page_is_comp(merge_page) == page_is_comp(page));
2310
btr_search_drop_page_hash_index(page);
2312
if (left_page_no == FIL_NULL && btr_page_get_level(page, mtr) > 0) {
2314
/* We have to mark the leftmost node pointer on the right
2315
side page as the predefined minimum record */
2317
node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
2319
ut_ad(page_rec_is_user_rec(node_ptr));
2321
btr_set_min_rec_mark(node_ptr, page_is_comp(merge_page), mtr);
2324
btr_node_ptr_delete(index, page, mtr);
2326
/* Remove the page from the level list */
2327
btr_level_list_remove(page, mtr);
2329
if (left_page_no != FIL_NULL) {
2330
lock_update_discard(page_get_supremum_rec(merge_page), page);
2332
lock_update_discard(page_rec_get_next(
2333
page_get_infimum_rec(merge_page)),
2337
/* Free the file page */
2338
btr_page_free(index, page, mtr);
2340
ut_ad(btr_check_node_ptr(index, merge_page, mtr));
2343
#ifdef UNIV_BTR_PRINT
2344
/*****************************************************************
2345
Prints size info of a B-tree. */
2350
dict_index_t* index) /* in: index tree */
2356
if (index->type & DICT_IBUF) {
2357
fputs("Sorry, cannot print info of an ibuf tree:"
2358
" use ibuf functions\n", stderr);
2365
root = btr_root_get(index, &mtr);
2367
seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
2369
fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
2370
fseg_print(seg, &mtr);
2372
if (!(index->type & DICT_UNIVERSAL)) {
2374
seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
2376
fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
2377
fseg_print(seg, &mtr);
2383
/****************************************************************
2384
Prints recursively index tree pages. */
2387
btr_print_recursive(
2388
/*================*/
2389
dict_index_t* index, /* in: index tree */
2390
page_t* page, /* in: index page */
2391
ulint width, /* in: print this many entries from start
2393
mem_heap_t** heap, /* in/out: heap for rec_get_offsets() */
2394
ulint** offsets,/* in/out: buffer for rec_get_offsets() */
2395
mtr_t* mtr) /* in: mtr */
2404
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2405
MTR_MEMO_PAGE_X_FIX));
2406
fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
2407
(ulong) btr_page_get_level(page, mtr),
2408
(ulong) buf_frame_get_page_no(page));
2410
page_print(page, index, width, width);
2412
n_recs = page_get_n_recs(page);
2414
page_cur_set_before_first(page, &cursor);
2415
page_cur_move_to_next(&cursor);
2417
while (!page_cur_is_after_last(&cursor)) {
2419
if (0 == btr_page_get_level(page, mtr)) {
2421
/* If this is the leaf level, do nothing */
2423
} else if ((i <= width) || (i >= n_recs - width)) {
2427
node_ptr = page_cur_get_rec(&cursor);
2429
*offsets = rec_get_offsets(node_ptr, index, *offsets,
2430
ULINT_UNDEFINED, heap);
2431
child = btr_node_ptr_get_child(node_ptr,
2433
btr_print_recursive(index, child, width,
2434
heap, offsets, &mtr2);
2438
page_cur_move_to_next(&cursor);
2443
/******************************************************************
2444
Prints directories and other info of all nodes in the tree. */
2449
dict_index_t* index, /* in: index */
2450
ulint width) /* in: print this many entries from start
2455
mem_heap_t* heap = NULL;
2456
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2457
ulint* offsets = offsets_;
2458
*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2460
fputs("--------------------------\n"
2461
"INDEX TREE PRINT\n", stderr);
2465
root = btr_root_get(index, &mtr);
2467
btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
2468
if (UNIV_LIKELY_NULL(heap)) {
2469
mem_heap_free(heap);
2474
btr_validate_index(index, NULL);
2476
#endif /* UNIV_BTR_PRINT */
2479
/****************************************************************
2480
Checks that the node pointer to a page is appropriate. */
2486
dict_index_t* index, /* in: index tree */
2487
page_t* page, /* in: index page */
2488
mtr_t* mtr) /* in: mtr */
2492
dtuple_t* node_ptr_tuple;
2494
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2495
MTR_MEMO_PAGE_X_FIX));
2496
if (dict_index_get_page(index) == buf_frame_get_page_no(page)) {
2501
node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
2503
if (btr_page_get_level(page, mtr) == 0) {
2508
heap = mem_heap_create(256);
2510
node_ptr_tuple = dict_index_build_node_ptr(
2511
index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
2512
btr_page_get_level(page, mtr));
2514
ut_a(!cmp_dtuple_rec(node_ptr_tuple, node_ptr,
2515
rec_get_offsets(node_ptr, index,
2516
NULL, ULINT_UNDEFINED, &heap)));
2518
mem_heap_free(heap);
2522
#endif /* UNIV_DEBUG */
2524
/****************************************************************
2525
Display identification information for a record. */
2528
btr_index_rec_validate_report(
2529
/*==========================*/
2530
page_t* page, /* in: index page */
2531
rec_t* rec, /* in: index record */
2532
dict_index_t* index) /* in: index */
2534
fputs("InnoDB: Record in ", stderr);
2535
dict_index_name_print(stderr, NULL, index);
2536
fprintf(stderr, ", page %lu, at offset %lu\n",
2537
buf_frame_get_page_no(page), (ulint)(rec - page));
2540
/****************************************************************
2541
Checks the size and number of fields in a record based on the definition of
2545
btr_index_rec_validate(
2546
/*===================*/
2547
/* out: TRUE if ok */
2548
rec_t* rec, /* in: index record */
2549
dict_index_t* index, /* in: index */
2550
ibool dump_on_error) /* in: TRUE if the function
2551
should print hex dump of record
2552
and page on error */
2558
mem_heap_t* heap = NULL;
2559
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2560
ulint* offsets = offsets_;
2561
*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2563
page = buf_frame_align(rec);
2565
if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
2566
/* The insert buffer index tree can contain records from any
2567
other index: we cannot check the number of fields or
2573
if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
2574
!= dict_table_is_comp(index->table))) {
2575
btr_index_rec_validate_report(page, rec, index);
2576
fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
2577
(ulong) !!page_is_comp(page),
2578
(ulong) dict_table_is_comp(index->table));
2583
n = dict_index_get_n_fields(index);
2585
if (!page_is_comp(page)
2586
&& UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
2587
btr_index_rec_validate_report(page, rec, index);
2588
fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
2589
(ulong) rec_get_n_fields_old(rec), (ulong) n);
2591
if (dump_on_error) {
2592
buf_page_print(page);
2594
fputs("InnoDB: corrupt record ", stderr);
2595
rec_print_old(stderr, rec);
2601
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
2603
for (i = 0; i < n; i++) {
2604
ulint fixed_size = dict_col_get_fixed_size(
2605
dict_index_get_nth_col(index, i));
2607
rec_get_nth_field(rec, offsets, i, &len);
2609
/* Note that if fixed_size != 0, it equals the
2610
length of a fixed-size column in the clustered index.
2611
A prefix index of the column is of fixed, but different
2612
length. When fixed_size == 0, prefix_len is the maximum
2613
length of the prefix index column. */
2615
if ((dict_index_get_nth_field(index, i)->prefix_len == 0
2616
&& len != UNIV_SQL_NULL && fixed_size
2617
&& len != fixed_size)
2618
|| (dict_index_get_nth_field(index, i)->prefix_len > 0
2619
&& len != UNIV_SQL_NULL
2621
> dict_index_get_nth_field(index, i)->prefix_len)) {
2623
btr_index_rec_validate_report(page, rec, index);
2625
"InnoDB: field %lu len is %lu,"
2627
(ulong) i, (ulong) len, (ulong) fixed_size);
2629
if (dump_on_error) {
2630
buf_page_print(page);
2632
fputs("InnoDB: corrupt record ", stderr);
2633
rec_print_new(stderr, rec, offsets);
2636
if (UNIV_LIKELY_NULL(heap)) {
2637
mem_heap_free(heap);
2643
if (UNIV_LIKELY_NULL(heap)) {
2644
mem_heap_free(heap);
2649
/****************************************************************
2650
Checks the size and number of fields in records based on the definition of
2654
btr_index_page_validate(
2655
/*====================*/
2656
/* out: TRUE if ok */
2657
page_t* page, /* in: index page */
2658
dict_index_t* index) /* in: index */
2663
page_cur_set_before_first(page, &cur);
2664
page_cur_move_to_next(&cur);
2667
if (page_cur_is_after_last(&cur)) {
2672
if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
2677
page_cur_move_to_next(&cur);
2683
/****************************************************************
2684
Report an error on one page of an index tree. */
2687
btr_validate_report1(
2688
/*=================*/
2689
/* out: TRUE if ok */
2690
dict_index_t* index, /* in: index */
2691
ulint level, /* in: B-tree level */
2692
page_t* page) /* in: index page */
2694
fprintf(stderr, "InnoDB: Error in page %lu of ",
2695
buf_frame_get_page_no(page));
2696
dict_index_name_print(stderr, NULL, index);
2698
fprintf(stderr, ", index tree level %lu", level);
2703
/****************************************************************
2704
Report an error on two pages of an index tree. */
2707
btr_validate_report2(
2708
/*=================*/
2709
/* out: TRUE if ok */
2710
dict_index_t* index, /* in: index */
2711
ulint level, /* in: B-tree level */
2712
page_t* page1, /* in: first index page */
2713
page_t* page2) /* in: second index page */
2715
fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
2716
buf_frame_get_page_no(page1),
2717
buf_frame_get_page_no(page2));
2718
dict_index_name_print(stderr, NULL, index);
2720
fprintf(stderr, ", index tree level %lu", level);
2725
/****************************************************************
2726
Validates index tree level. */
2731
/* out: TRUE if ok */
2732
dict_index_t* index, /* in: index tree */
2733
trx_t* trx, /* in: transaction or NULL */
2734
ulint level) /* in: level number */
2738
page_t* right_page = 0; /* remove warning */
2739
page_t* father_page;
2740
page_t* right_father_page;
2742
rec_t* right_node_ptr;
2744
ulint right_page_no;
2747
dtuple_t* node_ptr_tuple;
2750
mem_heap_t* heap = mem_heap_create(256);
2751
ulint* offsets = NULL;
2752
ulint* offsets2= NULL;
2756
mtr_x_lock(dict_index_get_lock(index), &mtr);
2758
page = btr_root_get(index, &mtr);
2760
space = buf_frame_get_space_id(page);
2762
while (level != btr_page_get_level(page, &mtr)) {
2764
ut_a(btr_page_get_level(page, &mtr) > 0);
2766
page_cur_set_before_first(page, &cursor);
2767
page_cur_move_to_next(&cursor);
2769
node_ptr = page_cur_get_rec(&cursor);
2770
offsets = rec_get_offsets(node_ptr, index, offsets,
2771
ULINT_UNDEFINED, &heap);
2772
page = btr_node_ptr_get_child(node_ptr, offsets, &mtr);
2775
/* Now we are on the desired level. Loop through the pages on that
2778
if (trx_is_interrupted(trx)) {
2780
mem_heap_free(heap);
2783
mem_heap_empty(heap);
2784
offsets = offsets2 = NULL;
2785
mtr_x_lock(dict_index_get_lock(index), &mtr);
2787
/* Check ordering etc. of records */
2789
if (!page_validate(page, index)) {
2790
btr_validate_report1(index, level, page);
2793
} else if (level == 0) {
2794
/* We are on level 0. Check that the records have the right
2795
number of fields, and field lengths are right. */
2797
if (!btr_index_page_validate(page, index)) {
2803
ut_a(btr_page_get_level(page, &mtr) == level);
2805
right_page_no = btr_page_get_next(page, &mtr);
2806
left_page_no = btr_page_get_prev(page, &mtr);
2808
ut_a((page_get_n_recs(page) > 0)
2810
&& (buf_frame_get_page_no(page)
2811
== dict_index_get_page(index))));
2813
if (right_page_no != FIL_NULL) {
2815
right_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2817
if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
2818
!= buf_frame_get_page_no(page))) {
2819
btr_validate_report2(index, level, page, right_page);
2820
fputs("InnoDB: broken FIL_PAGE_NEXT"
2821
" or FIL_PAGE_PREV links\n", stderr);
2822
buf_page_print(page);
2823
buf_page_print(right_page);
2828
if (UNIV_UNLIKELY(page_is_comp(right_page)
2829
!= page_is_comp(page))) {
2830
btr_validate_report2(index, level, page, right_page);
2831
fputs("InnoDB: 'compact' flag mismatch\n", stderr);
2832
buf_page_print(page);
2833
buf_page_print(right_page);
2837
goto node_ptr_fails;
2840
rec = page_rec_get_prev(page_get_supremum_rec(page));
2841
right_rec = page_rec_get_next(page_get_infimum_rec(
2843
offsets = rec_get_offsets(rec, index,
2844
offsets, ULINT_UNDEFINED, &heap);
2845
offsets2 = rec_get_offsets(right_rec, index,
2846
offsets2, ULINT_UNDEFINED, &heap);
2847
if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
2851
btr_validate_report2(index, level, page, right_page);
2853
fputs("InnoDB: records in wrong order"
2854
" on adjacent pages\n", stderr);
2856
buf_page_print(page);
2857
buf_page_print(right_page);
2859
fputs("InnoDB: record ", stderr);
2860
rec = page_rec_get_prev(page_get_supremum_rec(page));
2861
rec_print(stderr, rec, index);
2863
fputs("InnoDB: record ", stderr);
2864
rec = page_rec_get_next(
2865
page_get_infimum_rec(right_page));
2866
rec_print(stderr, rec, index);
2873
if (level > 0 && left_page_no == FIL_NULL) {
2874
ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
2875
page_rec_get_next(page_get_infimum_rec(page)),
2876
page_is_comp(page)));
2879
if (buf_frame_get_page_no(page) != dict_index_get_page(index)) {
2881
/* Check father node pointers */
2883
node_ptr = btr_page_get_father_node_ptr(index, page, &mtr);
2884
father_page = buf_frame_align(node_ptr);
2885
offsets = rec_get_offsets(node_ptr, index,
2886
offsets, ULINT_UNDEFINED, &heap);
2888
if (btr_node_ptr_get_child_page_no(node_ptr, offsets)
2889
!= buf_frame_get_page_no(page)
2890
|| node_ptr != btr_page_get_father_for_rec(
2892
page_rec_get_prev(page_get_supremum_rec(page)),
2894
btr_validate_report1(index, level, page);
2896
fputs("InnoDB: node pointer to the page is wrong\n",
2899
buf_page_print(father_page);
2900
buf_page_print(page);
2902
fputs("InnoDB: node ptr ", stderr);
2903
rec_print_new(stderr, node_ptr, offsets);
2905
fprintf(stderr, "\n"
2906
"InnoDB: node ptr child page n:o %lu\n",
2907
(unsigned long) btr_node_ptr_get_child_page_no
2908
(node_ptr, offsets));
2910
fputs("InnoDB: record on page ", stderr);
2911
rec = btr_page_get_father_for_rec(
2913
page_rec_get_prev(page_get_supremum_rec(page)),
2915
rec_print(stderr, rec, index);
2919
goto node_ptr_fails;
2922
if (btr_page_get_level(page, &mtr) > 0) {
2923
offsets = rec_get_offsets(node_ptr, index,
2924
offsets, ULINT_UNDEFINED,
2927
node_ptr_tuple = dict_index_build_node_ptr(
2929
page_rec_get_next(page_get_infimum_rec(page)),
2930
0, heap, btr_page_get_level(page, &mtr));
2932
if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
2934
rec_t* first_rec = page_rec_get_next(
2935
page_get_infimum_rec(page));
2937
btr_validate_report1(index, level, page);
2939
buf_page_print(father_page);
2940
buf_page_print(page);
2942
fputs("InnoDB: Error: node ptrs differ"
2944
"InnoDB: node ptr ", stderr);
2945
rec_print_new(stderr, node_ptr, offsets);
2946
fputs("InnoDB: first rec ", stderr);
2947
rec_print(stderr, first_rec, index);
2951
goto node_ptr_fails;
2955
if (left_page_no == FIL_NULL) {
2956
ut_a(node_ptr == page_rec_get_next(
2957
page_get_infimum_rec(father_page)));
2958
ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
2961
if (right_page_no == FIL_NULL) {
2962
ut_a(node_ptr == page_rec_get_prev(
2963
page_get_supremum_rec(father_page)));
2964
ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
2966
right_node_ptr = btr_page_get_father_node_ptr(
2967
index, right_page, &mtr);
2968
if (page_rec_get_next(node_ptr)
2969
!= page_get_supremum_rec(father_page)) {
2972
!= page_rec_get_next(node_ptr)) {
2974
fputs("InnoDB: node pointer to"
2975
" the right page is wrong\n",
2978
btr_validate_report1(index, level,
2981
buf_page_print(father_page);
2982
buf_page_print(page);
2983
buf_page_print(right_page);
2986
right_father_page = buf_frame_align(
2989
if (right_node_ptr != page_rec_get_next(
2990
page_get_infimum_rec(
2991
right_father_page))) {
2993
fputs("InnoDB: node pointer 2 to"
2994
" the right page is wrong\n",
2997
btr_validate_report1(index, level,
3000
buf_page_print(father_page);
3001
buf_page_print(right_father_page);
3002
buf_page_print(page);
3003
buf_page_print(right_page);
3006
if (buf_frame_get_page_no(right_father_page)
3007
!= btr_page_get_next(father_page, &mtr)) {
3010
fputs("InnoDB: node pointer 3 to"
3011
" the right page is wrong\n",
3014
btr_validate_report1(index, level,
3017
buf_page_print(father_page);
3018
buf_page_print(right_father_page);
3019
buf_page_print(page);
3020
buf_page_print(right_page);
3027
/* Commit the mini-transaction to release the latch on 'page'.
3028
Re-acquire the latch on right_page, which will become 'page'
3029
on the next loop. The page has already been checked. */
3032
if (right_page_no != FIL_NULL) {
3035
page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr);
3040
mem_heap_free(heap);
3044
/******************************************************************
3045
Checks the consistency of an index tree. */
3050
/* out: TRUE if ok */
3051
dict_index_t* index, /* in: index */
3052
trx_t* trx) /* in: transaction or NULL */
3060
mtr_x_lock(dict_index_get_lock(index), &mtr);
3062
root = btr_root_get(index, &mtr);
3063
n = btr_page_get_level(root, &mtr);
3065
for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
3066
if (!btr_validate_level(index, trx, n - i)) {