~drizzle-trunk/drizzle/development


Viewing changes to storage/innobase/btr/btr0btr.c

  • Committer: Monty
  • Date: 2008-11-07 05:51:15 UTC
  • mto: This revision was merged to the branch mainline in revision 579.
  • Revision ID: mordred@palanthas.inaugust.com-20081107055115-0275gvq62buzls77
Fixed a decimal sign thing.

/******************************************************
The B-tree

(c) 1994-1996 Innobase Oy

Created 6/2/1994 Heikki Tuuri
*******************************************************/

#include "btr0btr.h"

#ifdef UNIV_NONINL
#include "btr0btr.ic"
#endif

#include "fsp0fsp.h"
#include "page0page.h"
#include "page0zip.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#include "trx0trx.h"
#include "page0zip.h"

/*
Latching strategy of the InnoDB B-tree
--------------------------------------
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
also has a latch of its own.

A B-tree operation normally first acquires an S-latch on the tree. It
searches down the tree and releases the tree latch when it has the
leaf node latch. To save CPU time we do not acquire any latch on
non-leaf nodes of the tree during a search, those pages are only bufferfixed.

If an operation needs to restructure the tree, it acquires an X-latch on
the tree before searching to a leaf node. If it needs, for example, to
split a leaf,
(1) InnoDB decides the split point in the leaf,
(2) allocates a new page,
(3) inserts the appropriate node pointer to the first non-leaf level,
(4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.

Node pointers
-------------
Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. On the child page we can store node pointers or index records
which are >= P in alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.

File page allocation
--------------------
In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
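
/*
Example of the node pointer rules above: suppose a level-1 page holds
two node pointers with prefixes P = "karen" and P1 = "peter", pointing
to leaf pages A and B. Leaf A may then hold any records r with
"karen" <= r < "peter", and leaf B holds records r >= "peter" (up to
the prefix of the next node pointer on the level, if any). Because A
is a leaf, the record "karen" itself may later be deleted from A
without the node pointer being updated; only for a non-leaf child must
the leftmost record still carry the prefix "karen".
*/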
 
/******************************************************************
Gets the root node of a tree and x-latches it. */
static
buf_block_t*
btr_root_block_get(
/*===============*/
                                /* out: root page, x-latched */
        dict_index_t*   index,  /* in: index tree */
        mtr_t*          mtr)    /* in: mtr */
{
        ulint           space;
        ulint           zip_size;
        ulint           root_page_no;
        buf_block_t*    block;

        space = dict_index_get_space(index);
        zip_size = dict_table_zip_size(index->table);
        root_page_no = dict_index_get_page(index);

        block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
        ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
             == dict_table_is_comp(index->table));

        return(block);
}

/******************************************************************
Gets the root node of a tree and x-latches it. */
UNIV_INTERN
page_t*
btr_root_get(
/*=========*/
                                /* out: root page, x-latched */
        dict_index_t*   index,  /* in: index tree */
        mtr_t*          mtr)    /* in: mtr */
{
        return(buf_block_get_frame(btr_root_block_get(index, mtr)));
}

/*****************************************************************
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor. */
UNIV_INTERN
rec_t*
btr_get_prev_user_rec(
/*==================*/
                        /* out: previous user record, NULL if there is none */
        rec_t*  rec,    /* in: record on leaf level */
        mtr_t*  mtr)    /* in: mtr holding a latch on the page, and if
                        needed, also to the previous page */
{
        page_t* page;
        page_t* prev_page;
        ulint   prev_page_no;

        if (!page_rec_is_infimum(rec)) {

                rec_t*  prev_rec = page_rec_get_prev(rec);

                if (!page_rec_is_infimum(prev_rec)) {

                        return(prev_rec);
                }
        }

        page = page_align(rec);
        prev_page_no = btr_page_get_prev(page, mtr);

        if (prev_page_no != FIL_NULL) {

                ulint           space;
                ulint           zip_size;
                buf_block_t*    prev_block;

                space = page_get_space_id(page);
                zip_size = fil_space_get_zip_size(space);

                prev_block = buf_page_get_with_no_latch(space, zip_size,
                                                        prev_page_no, mtr);
                prev_page = buf_block_get_frame(prev_block);
                /* The caller must already have a latch to the brother */
                ut_ad(mtr_memo_contains(mtr, prev_block,
                                        MTR_MEMO_PAGE_S_FIX)
                      || mtr_memo_contains(mtr, prev_block,
                                           MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
                ut_a(page_is_comp(prev_page) == page_is_comp(page));
                ut_a(btr_page_get_next(prev_page, mtr)
                     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

                return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
        }

        return(NULL);
}

/*****************************************************************
Gets pointer to the next user record in the tree. It is assumed that the
caller has appropriate latches on the page and its neighbor. */
UNIV_INTERN
rec_t*
btr_get_next_user_rec(
/*==================*/
                        /* out: next user record, NULL if there is none */
        rec_t*  rec,    /* in: record on leaf level */
        mtr_t*  mtr)    /* in: mtr holding a latch on the page, and if
                        needed, also to the next page */
{
        page_t* page;
        page_t* next_page;
        ulint   next_page_no;

        if (!page_rec_is_supremum(rec)) {

                rec_t*  next_rec = page_rec_get_next(rec);

                if (!page_rec_is_supremum(next_rec)) {

                        return(next_rec);
                }
        }

        page = page_align(rec);
        next_page_no = btr_page_get_next(page, mtr);

        if (next_page_no != FIL_NULL) {
                ulint           space;
                ulint           zip_size;
                buf_block_t*    next_block;

                space = page_get_space_id(page);
                zip_size = fil_space_get_zip_size(space);

                next_block = buf_page_get_with_no_latch(space, zip_size,
                                                        next_page_no, mtr);
                next_page = buf_block_get_frame(next_block);
                /* The caller must already have a latch to the brother */
                ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
                      || mtr_memo_contains(mtr, next_block,
                                           MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
                ut_a(page_is_comp(next_page) == page_is_comp(page));
                ut_a(btr_page_get_prev(next_page, mtr)
                     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

                return(page_rec_get_next(page_get_infimum_rec(next_page)));
        }

        return(NULL);
}

/******************************************************************
Creates a new index page (not the root, and also not
used in page reorganization). */
static
void
btr_page_create(
/*============*/
        buf_block_t*    block,  /* in/out: page to be created */
        page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */
        dict_index_t*   index,  /* in: index */
        ulint           level,  /* in: the B-tree level of the page */
        mtr_t*          mtr)    /* in: mtr */
{
        page_t*         page = buf_block_get_frame(block);

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

        if (UNIV_LIKELY_NULL(page_zip)) {
                page_create_zip(block, index, level, mtr);
        } else {
                page_create(block, mtr, dict_table_is_comp(index->table));
                /* Set the level of the new index page */
                btr_page_set_level(page, NULL, level, mtr);
        }

        block->check_index_page_at_flush = TRUE;

        btr_page_set_index_id(page, page_zip, index->id, mtr);
}

/******************************************************************
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages! */
static
buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
                                /* out: new allocated block, x-latched */
        dict_index_t*   index,  /* in: index tree */
        mtr_t*          mtr)    /* in: mtr */
{
        fil_addr_t      node_addr;
        page_t*         root;
        page_t*         new_page;
        buf_block_t*    new_block;

        root = btr_root_get(index, mtr);

        node_addr = flst_get_first(root + PAGE_HEADER
                                   + PAGE_BTR_IBUF_FREE_LIST, mtr);
        ut_a(node_addr.page != FIL_NULL);

        new_block = buf_page_get(dict_index_get_space(index),
                                 dict_table_zip_size(index->table),
                                 node_addr.page, RW_X_LATCH, mtr);
        new_page = buf_block_get_frame(new_block);
#ifdef UNIV_SYNC_DEBUG
        buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */

        flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
                    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
                    mtr);
        ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
                            mtr));

        return(new_block);
}

/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! */
UNIV_INTERN
buf_block_t*
btr_page_alloc(
/*===========*/
                                        /* out: new allocated block, x-latched;
                                        NULL if out of space */
        dict_index_t*   index,          /* in: index */
        ulint           hint_page_no,   /* in: hint of a good page */
        byte            file_direction, /* in: direction where a possible
                                        page split is made */
        ulint           level,          /* in: level where the page is placed
                                        in the tree */
        mtr_t*          mtr)            /* in: mtr */
{
        fseg_header_t*  seg_header;
        page_t*         root;
        buf_block_t*    new_block;
        ulint           new_page_no;

        if (dict_index_is_ibuf(index)) {

                return(btr_page_alloc_for_ibuf(index, mtr));
        }

        root = btr_root_get(index, mtr);

        if (level == 0) {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
        } else {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
        }

        /* Parameter TRUE below states that the caller has made the
        reservation for free extents, and thus we know that a page can
        be allocated: */

        new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
                                                   file_direction, TRUE, mtr);
        if (new_page_no == FIL_NULL) {

                return(NULL);
        }

        new_block = buf_page_get(dict_index_get_space(index),
                                 dict_table_zip_size(index->table),
                                 new_page_no, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
        buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */

        return(new_block);
}
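
/*
Caller sketch (illustration only; the reservation helpers named here
are assumptions about the surrounding fsp API, not something this file
defines): because TRUE is passed to fseg_alloc_free_page_general()
above, a typical caller first reserves free extents, then allocates,
and releases the unused part of the reservation after the mtr commits:

        ulint   n_reserved;

        .. fsp_reserve_free_extents(&n_reserved, space, n_pages,
           FSP_NORMAL, mtr) ..

        new_block = btr_page_alloc(index, hint_page_no, FSP_UP,
                                   level, mtr);

        .. after mtr_commit():
           fil_space_release_free_extents(space, n_reserved) ..
*/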
 
/******************************************************************
Gets the number of pages in a B-tree. */
UNIV_INTERN
ulint
btr_get_size(
/*=========*/
                                /* out: number of pages */
        dict_index_t*   index,  /* in: index */
        ulint           flag)   /* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
{
        fseg_header_t*  seg_header;
        page_t*         root;
        ulint           n;
        ulint           dummy;
        mtr_t           mtr;

        mtr_start(&mtr);

        mtr_s_lock(dict_index_get_lock(index), &mtr);

        root = btr_root_get(index, &mtr);

        if (flag == BTR_N_LEAF_PAGES) {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

                fseg_n_reserved_pages(seg_header, &n, &mtr);

        } else if (flag == BTR_TOTAL_SIZE) {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

                n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);

                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

                n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
        } else {
                ut_error;
        }

        mtr_commit(&mtr);

        return(n);
}
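
/*
Usage note: btr_get_size() starts and commits its own mini-transaction
and only s-locks the index, so a caller needs no mtr of its own, e.g.

        n_leaf  = btr_get_size(index, BTR_N_LEAF_PAGES);
        n_total = btr_get_size(index, BTR_TOTAL_SIZE);

Any other flag value trips the ut_error branch above.
*/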
 
/******************************************************************
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
void
btr_page_free_for_ibuf(
/*===================*/
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: block to be freed, x-latched */
        mtr_t*          mtr)    /* in: mtr */
{
        page_t*         root;

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        root = btr_root_get(index, mtr);

        flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
                       buf_block_get_frame(block)
                       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

        ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
                            mtr));
}

/******************************************************************
Frees a file page used in an index tree. Can also be used to free (BLOB)
external storage pages, because the page level 0 can be given as an
argument. */
UNIV_INTERN
void
btr_page_free_low(
/*==============*/
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: block to be freed, x-latched */
        ulint           level,  /* in: page level */
        mtr_t*          mtr)    /* in: mtr */
{
        fseg_header_t*  seg_header;
        page_t*         root;

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        /* The page gets invalid for optimistic searches: increment the frame
        modify clock */

        buf_block_modify_clock_inc(block);

        if (dict_index_is_ibuf(index)) {

                btr_page_free_for_ibuf(index, block, mtr);

                return;
        }

        root = btr_root_get(index, mtr);

        if (level == 0) {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
        } else {
                seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
        }

        fseg_free_page(seg_header,
                       buf_block_get_space(block),
                       buf_block_get_page_no(block), mtr);
}

/******************************************************************
Frees a file page used in an index tree. NOTE: cannot free field external
storage pages because the page must contain info on its level. */
UNIV_INTERN
void
btr_page_free(
/*==========*/
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: block to be freed, x-latched */
        mtr_t*          mtr)    /* in: mtr */
{
        ulint           level;

        level = btr_page_get_level(buf_block_get_frame(block), mtr);

        btr_page_free_low(index, block, level, mtr);
}

/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
btr_node_ptr_set_child_page_no(
/*===========================*/
        rec_t*          rec,    /* in: node pointer record */
        page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
                                part will be updated, or NULL */
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
        ulint           page_no,/* in: child node address */
        mtr_t*          mtr)    /* in: mtr */
{
        byte*   field;
        ulint   len;

        ut_ad(rec_offs_validate(rec, NULL, offsets));
        ut_ad(!page_is_leaf(page_align(rec)));
        ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

        /* The child address is in the last field */
        field = rec_get_nth_field(rec, offsets,
                                  rec_offs_n_fields(offsets) - 1, &len);

        ut_ad(len == REC_NODE_PTR_SIZE);

        if (UNIV_LIKELY_NULL(page_zip)) {
                page_zip_write_node_ptr(page_zip, rec,
                                        rec_offs_data_size(offsets),
                                        page_no, mtr);
        } else {
                mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
        }
}
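
/*
Layout note (illustration): a node pointer record consists of the key
prefix fields followed by a single REC_NODE_PTR_SIZE field holding the
child page number, e.g. for a pointer built from prefix P to child
page 17 the record is

        [ fields of P ... | child page no = 17 ]

btr_node_ptr_set_child_page_no() only rewrites that last field,
logging it as MLOG_4BYTES, or through page_zip_write_node_ptr() on a
compressed page.
*/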
 
/****************************************************************
Returns the child page of a node pointer and x-latches it. */
static
buf_block_t*
btr_node_ptr_get_child(
/*===================*/
                                /* out: child page, x-latched */
        const rec_t*    node_ptr,/* in: node pointer */
        dict_index_t*   index,  /* in: index */
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
        mtr_t*          mtr)    /* in: mtr */
{
        ulint   page_no;
        ulint   space;

        ut_ad(rec_offs_validate(node_ptr, index, offsets));
        space = page_get_space_id(page_align(node_ptr));
        page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

        return(btr_block_get(space, dict_table_zip_size(index->table),
                             page_no, RW_X_LATCH, mtr));
}

/****************************************************************
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree. */
static
ulint*
btr_page_get_father_node_ptr(
/*=========================*/
                                /* out: rec_get_offsets() of the
                                node pointer record */
        ulint*          offsets,/* in: work area for the return value */
        mem_heap_t*     heap,   /* in: memory heap to use */
        btr_cur_t*      cursor, /* in: cursor pointing to user record,
                                out: cursor on node pointer record,
                                its page x-latched */
        mtr_t*          mtr)    /* in: mtr */
{
        dtuple_t*       tuple;
        rec_t*          user_rec;
        rec_t*          node_ptr;
        ulint           level;
        ulint           page_no;
        dict_index_t*   index;

        page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
        index = btr_cur_get_index(cursor);

        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
                                MTR_MEMO_X_LOCK));

        ut_ad(dict_index_get_page(index) != page_no);

        level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
        user_rec = btr_cur_get_rec(cursor);
        ut_a(page_rec_is_user_rec(user_rec));
        tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

        btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
                                    BTR_CONT_MODIFY_TREE, cursor, 0, mtr);

        node_ptr = btr_cur_get_rec(cursor);
        ut_ad(!page_rec_is_comp(node_ptr)
              || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
        offsets = rec_get_offsets(node_ptr, index, offsets,
                                  ULINT_UNDEFINED, &heap);

        if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
                          != page_no)) {
                rec_t*  print_rec;
                fputs("InnoDB: Dump of the child page:\n", stderr);
                buf_page_print(page_align(user_rec), 0);
                fputs("InnoDB: Dump of the parent page:\n", stderr);
                buf_page_print(page_align(node_ptr), 0);

                fputs("InnoDB: Corruption of an index tree: table ", stderr);
                ut_print_name(stderr, NULL, TRUE, index->table_name);
                fputs(", index ", stderr);
                ut_print_name(stderr, NULL, FALSE, index->name);
                fprintf(stderr, ",\n"
                        "InnoDB: father ptr page no %lu, child page no %lu\n",
                        (ulong)
                        btr_node_ptr_get_child_page_no(node_ptr, offsets),
                        (ulong) page_no);
                print_rec = page_rec_get_next(
                        page_get_infimum_rec(page_align(user_rec)));
                offsets = rec_get_offsets(print_rec, index,
                                          offsets, ULINT_UNDEFINED, &heap);
                page_rec_print(print_rec, offsets);
                offsets = rec_get_offsets(node_ptr, index, offsets,
                                          ULINT_UNDEFINED, &heap);
                page_rec_print(node_ptr, offsets);

                fputs("InnoDB: You should dump + drop + reimport the table"
                      " to fix the\n"
                      "InnoDB: corruption. If the crash happens at "
                      "the database startup, see\n"
                      "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
                      "forcing-recovery.html about\n"
                      "InnoDB: forcing recovery. "
                      "Then dump + drop + reimport.\n", stderr);

                ut_error;
        }

        return(offsets);
}

/****************************************************************
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree. */
static
ulint*
btr_page_get_father_block(
/*======================*/
                                /* out: rec_get_offsets() of the
                                node pointer record */
        ulint*          offsets,/* in: work area for the return value */
        mem_heap_t*     heap,   /* in: memory heap to use */
        dict_index_t*   index,  /* in: b-tree index */
        buf_block_t*    block,  /* in: child page in the index */
        mtr_t*          mtr,    /* in: mtr */
        btr_cur_t*      cursor) /* out: cursor on node pointer record,
                                its page x-latched */
{
        rec_t*  rec
                = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
                                                                 block)));
        btr_cur_position(index, rec, block, cursor);
        return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
}

/****************************************************************
Seeks to the upper level node pointer to a page.
It is assumed that mtr holds an x-latch on the tree. */
static
void
btr_page_get_father(
/*================*/
        dict_index_t*   index,  /* in: b-tree index */
        buf_block_t*    block,  /* in: child page in the index */
        mtr_t*          mtr,    /* in: mtr */
        btr_cur_t*      cursor) /* out: cursor on node pointer record,
                                its page x-latched */
{
        mem_heap_t*     heap;
        rec_t*          rec
                = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
                                                                 block)));
        btr_cur_position(index, rec, block, cursor);

        heap = mem_heap_create(100);
        btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
        mem_heap_free(heap);
}

/****************************************************************
Creates the root node for a new index tree. */
UNIV_INTERN
ulint
btr_create(
/*=======*/
                                /* out: page number of the created root,
                                FIL_NULL if did not succeed */
        ulint           type,   /* in: type of the index */
        ulint           space,  /* in: space where created */
        ulint           zip_size,/* in: compressed page size in bytes
                                or 0 for uncompressed pages */
        dulint          index_id,/* in: index id */
        dict_index_t*   index,  /* in: index */
        mtr_t*          mtr)    /* in: mini-transaction handle */
{
        ulint           page_no;
        buf_block_t*    block;
        buf_frame_t*    frame;
        page_t*         page;
        page_zip_des_t* page_zip;

        /* Create the two new segments (one, in the case of an ibuf tree) for
        the index tree; the segment headers are put on the allocated root page
        (for an ibuf tree, not in the root, but on a separate ibuf header
        page) */

        if (type & DICT_IBUF) {
                /* Allocate first the ibuf header page */
                buf_block_t*    ibuf_hdr_block = fseg_create(
                        space, 0,
                        IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

#ifdef UNIV_SYNC_DEBUG
                buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */
                ut_ad(buf_block_get_page_no(ibuf_hdr_block)
                      == IBUF_HEADER_PAGE_NO);
                /* Allocate then the next page to the segment: it will be the
                tree root page */

                page_no = fseg_alloc_free_page(buf_block_get_frame(
                                                       ibuf_hdr_block)
                                               + IBUF_HEADER
                                               + IBUF_TREE_SEG_HEADER,
                                               IBUF_TREE_ROOT_PAGE_NO,
                                               FSP_UP, mtr);
                ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

                block = buf_page_get(space, zip_size, page_no,
                                     RW_X_LATCH, mtr);
        } else {
                block = fseg_create(space, 0,
                                    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
        }

        if (block == NULL) {

                return(FIL_NULL);
        }

        page_no = buf_block_get_page_no(block);
        frame = buf_block_get_frame(block);

#ifdef UNIV_SYNC_DEBUG
        buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */

        if (type & DICT_IBUF) {
                /* It is an insert buffer tree: initialize the free list */

                ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

                flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
        } else {
                /* It is a non-ibuf tree: create a file segment for leaf
                pages */
                fseg_create(space, page_no,
                            PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr);
                /* The fseg create acquires a second latch on the page,
                therefore we must declare it: */
#ifdef UNIV_SYNC_DEBUG
                buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */
        }

        /* Create a new index page on the allocated segment page */
        page_zip = buf_block_get_page_zip(block);

        if (UNIV_LIKELY_NULL(page_zip)) {
                page = page_create_zip(block, index, 0, mtr);
        } else {
                page = page_create(block, mtr,
                                   dict_table_is_comp(index->table));
                /* Set the level of the new index page */
                btr_page_set_level(page, NULL, 0, mtr);
        }

        block->check_index_page_at_flush = TRUE;

        /* Set the index id of the page */
        btr_page_set_index_id(page, page_zip, index_id, mtr);

        /* Set the next node and previous node fields */
        btr_page_set_next(page, page_zip, FIL_NULL, mtr);
        btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

        /* We reset the free bits for the page to allow creation of several
        trees in the same mtr, otherwise the latch on a bitmap page would
        prevent it because of the latching order */

        if (!(type & DICT_CLUSTERED)) {
                ibuf_reset_free_bits(block);
        }

        /* In the following assertion we test that two records of maximum
        allowed size fit on the root page: this fact is needed to ensure
        correctness of split algorithms */

        ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

        return(page_no);
}

/****************************************************************
Frees a B-tree except the root page, which MUST be freed after this
by calling btr_free_root. */
UNIV_INTERN
void
btr_free_but_not_root(
/*==================*/
        ulint   space,          /* in: space where created */
        ulint   zip_size,       /* in: compressed page size in bytes
                                or 0 for uncompressed pages */
        ulint   root_page_no)   /* in: root page number */
{
        ibool   finished;
        page_t* root;
        mtr_t   mtr;

leaf_loop:
        mtr_start(&mtr);

        root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);

        /* NOTE: page hash indexes are dropped when a page is freed inside
        fsp0fsp. */

        finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
                                  &mtr);
        mtr_commit(&mtr);

        if (!finished) {

                goto leaf_loop;
        }
top_loop:
        mtr_start(&mtr);

        root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, &mtr);

        finished = fseg_free_step_not_header(
                root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
        mtr_commit(&mtr);

        if (!finished) {

                goto top_loop;
        }
}

/****************************************************************
Frees the B-tree root page. The rest of the tree MUST already have been
freed. */
UNIV_INTERN
void
btr_free_root(
/*==========*/
        ulint   space,          /* in: space where created */
        ulint   zip_size,       /* in: compressed page size in bytes
                                or 0 for uncompressed pages */
        ulint   root_page_no,   /* in: root page number */
        mtr_t*  mtr)            /* in: a mini-transaction which has already
                                been started */
{
        buf_block_t*    block;
        fseg_header_t*  header;

        block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);

        btr_search_drop_page_hash_index(block);

        header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;

        while (!fseg_free_step(header, mtr));
}
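
/*
Teardown order, as implied by the two comments above: a whole tree is
dropped in two steps, non-root pages first, then the root page, e.g.

        btr_free_but_not_root(space, zip_size, root_page_no);

        mtr_start(&mtr);
        btr_free_root(space, zip_size, root_page_no, &mtr);
        mtr_commit(&mtr);

btr_free_root() expects an already started mini-transaction and frees
the remaining segment by repeated fseg_free_step() calls on the
PAGE_BTR_SEG_TOP header.
*/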
 
/*****************************************************************
Reorganizes an index page. */
static
ibool
btr_page_reorganize_low(
/*====================*/
        ibool           recovery,/* in: TRUE if called in recovery:
                                locks should not be updated, i.e.,
                                there cannot exist locks on the
                                page, and a hash index should not be
                                dropped: it cannot exist */
        buf_block_t*    block,  /* in: page to be reorganized */
        dict_index_t*   index,  /* in: record descriptor */
        mtr_t*          mtr)    /* in: mtr */
{
        page_t*         page            = buf_block_get_frame(block);
        page_zip_des_t* page_zip        = buf_block_get_page_zip(block);
        buf_block_t*    temp_block;
        page_t*         temp_page;
        ulint           log_mode;
        ulint           data_size1;
        ulint           data_size2;
        ulint           max_ins_size1;
        ulint           max_ins_size2;
        ibool           success         = FALSE;

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
#ifdef UNIV_ZIP_DEBUG
        ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
        data_size1 = page_get_data_size(page);
        max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

        /* Write the log record */
        mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
                                  ? MLOG_COMP_PAGE_REORGANIZE
                                  : MLOG_PAGE_REORGANIZE, 0);

        /* Turn logging off */
        log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

        temp_block = buf_block_alloc(0);
        temp_page = temp_block->frame;

        /* Copy the old page to temporary space */
        buf_frame_copy(temp_page, page);

        if (UNIV_LIKELY(!recovery)) {
                btr_search_drop_page_hash_index(block);
        }

        /* Recreate the page: note that global data on page (possible
        segment headers, next page-field, etc.) is preserved intact */

        page_create(block, mtr, dict_table_is_comp(index->table));
        block->check_index_page_at_flush = TRUE;

        /* Copy the records from the temporary space to the recreated page;
        do not copy the lock bits yet */

        page_copy_rec_list_end_no_locks(block, temp_block,
                                        page_get_infimum_rec(temp_page),
                                        index, mtr);
        /* Copy max trx id to recreated page */
        page_set_max_trx_id(block, NULL, page_get_max_trx_id(temp_page));

        if (UNIV_LIKELY_NULL(page_zip)
            && UNIV_UNLIKELY
            (!page_zip_compress(page_zip, page, index, NULL))) {

                /* Restore the old page and exit. */
                buf_frame_copy(page, temp_page);

                goto func_exit;
        }

        if (UNIV_LIKELY(!recovery)) {
                /* Update the record lock bitmaps */
                lock_move_reorganize_page(block, temp_block);
        }

        data_size2 = page_get_data_size(page);
        max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

        if (UNIV_UNLIKELY(data_size1 != data_size2)
            || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
                buf_page_print(page, 0);
                buf_page_print(temp_page, 0);
                fprintf(stderr,
                        "InnoDB: Error: page old data size %lu"
                        " new data size %lu\n"
                        "InnoDB: Error: page old max ins size %lu"
                        " new max ins size %lu\n"
                        "InnoDB: Submit a detailed bug report"
                        " to http://bugs.mysql.com\n",
                        (unsigned long) data_size1, (unsigned long) data_size2,
                        (unsigned long) max_ins_size1,
                        (unsigned long) max_ins_size2);
        } else {
                success = TRUE;
        }

func_exit:
#ifdef UNIV_ZIP_DEBUG
        ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
        buf_block_free(temp_block);

        /* Restore logging mode */
        mtr_set_log_mode(mtr, log_mode);

        return(success);
}

/*****************************************************************
Reorganizes an index page.
IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
page of a non-clustered index, the caller must update the insert
buffer free bits in the same mini-transaction in such a way that the
modification will be redo-logged. */
UNIV_INTERN
ibool
btr_page_reorganize(
/*================*/
                                /* out: TRUE on success, FALSE on failure */
        buf_block_t*    block,  /* in: page to be reorganized */
        dict_index_t*   index,  /* in: record descriptor */
        mtr_t*          mtr)    /* in: mtr */
{
        return(btr_page_reorganize_low(FALSE, block, index, mtr));
}

/***************************************************************
Parses a redo log record of reorganizing a page. */
UNIV_INTERN
byte*
btr_parse_page_reorganize(
/*======================*/
                                /* out: end of log record or NULL */
        byte*           ptr,    /* in: buffer */
        byte*           end_ptr __attribute__((unused)),
                                /* in: buffer end */
        dict_index_t*   index,  /* in: record descriptor */
        buf_block_t*    block,  /* in: page to be reorganized, or NULL */
        mtr_t*          mtr)    /* in: mtr or NULL */
{
        ut_ad(ptr && end_ptr);

        /* The record is empty, except for the record initial part */

        if (UNIV_LIKELY(block != NULL)) {
                btr_page_reorganize_low(TRUE, block, index, mtr);
        }

        return(ptr);
}

/*****************************************************************
Empties an index page. */
static
void
btr_page_empty(
/*===========*/
        buf_block_t*    block,  /* in: page to be emptied */
        page_zip_des_t* page_zip,/* out: compressed page, or NULL */
        mtr_t*          mtr,    /* in: mtr */
        dict_index_t*   index)  /* in: index of the page */
{
        page_t* page = buf_block_get_frame(block);

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_ZIP_DEBUG
        ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

        btr_search_drop_page_hash_index(block);

        /* Recreate the page: note that global data on page (possible
        segment headers, next page-field, etc.) is preserved intact */

        if (UNIV_LIKELY_NULL(page_zip)) {
                page_create_zip(block, index,
                                btr_page_get_level(page, mtr), mtr);
        } else {
                page_create(block, mtr, dict_table_is_comp(index->table));
        }

        block->check_index_page_at_flush = TRUE;
}

/*****************************************************************
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called. */
UNIV_INTERN
rec_t*
btr_root_raise_and_insert(
/*======================*/
                                /* out: inserted record */
        btr_cur_t*      cursor, /* in: cursor at which to insert: must be
                                on the root page; when the function returns,
                                the cursor is positioned on the predecessor
                                of the inserted record */
        const dtuple_t* tuple,  /* in: tuple to insert */
        ulint           n_ext,  /* in: number of externally stored columns */
        mtr_t*          mtr)    /* in: mtr */
{
        dict_index_t*   index;
        page_t*         root;
        page_t*         new_page;
        ulint           new_page_no;
        rec_t*          rec;
        mem_heap_t*     heap;
        dtuple_t*       node_ptr;
        ulint           level;
        rec_t*          node_ptr_rec;
        page_cur_t*     page_cursor;
        page_zip_des_t* root_page_zip;
        page_zip_des_t* new_page_zip;
        buf_block_t*    root_block;
        buf_block_t*    new_block;

        root = btr_cur_get_page(cursor);
        root_block = btr_cur_get_block(cursor);
        root_page_zip = buf_block_get_page_zip(root_block);
#ifdef UNIV_ZIP_DEBUG
        ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
#endif /* UNIV_ZIP_DEBUG */
        index = btr_cur_get_index(cursor);

        ut_ad(dict_index_get_page(index) == page_get_page_no(root));
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
                                MTR_MEMO_X_LOCK));
        ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
        btr_search_drop_page_hash_index(root_block);

        /* Allocate a new page to the tree. Root splitting is done by first
        moving the root records to the new page, emptying the root, putting
        a node pointer to the new page, and then splitting the new page. */

        level = btr_page_get_level(root, mtr);

        new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
        new_page = buf_block_get_frame(new_block);
        new_page_zip = buf_block_get_page_zip(new_block);
        ut_a(!new_page_zip == !root_page_zip);
        ut_a(!new_page_zip
             || page_zip_get_size(new_page_zip)
             == page_zip_get_size(root_page_zip));

        btr_page_create(new_block, new_page_zip, index, level, mtr);

        /* Set the next node and previous node fields of new page */
        btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
        btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);

        /* Copy the records from root to the new page one by one. */

        if (UNIV_UNLIKELY
            (!page_copy_rec_list_end(new_block, root_block,
                                     page_get_infimum_rec(root),
                                     index, mtr))) {
                ut_a(new_page_zip);

                /* Copy the page byte for byte. */
                page_zip_copy(new_page_zip, new_page,
                              root_page_zip, root, index, mtr);
        }

        /* If this is a pessimistic insert which is actually done to
        perform a pessimistic update then we have stored the lock
        information of the record to be inserted on the infimum of the
        root page: we cannot discard the lock structs on the root page */

        lock_update_root_raise(new_block, root_block);

        /* Create a memory heap where the node pointer is stored */
        heap = mem_heap_create(100);

        rec = page_rec_get_next(page_get_infimum_rec(new_page));
        new_page_no = buf_block_get_page_no(new_block);

        /* Build the node pointer (= node key and page address) for the
        child */

        node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
                                             level);
        /* The node pointer must be marked as the predefined minimum record,
        as there is no lower alphabetical limit to records in the leftmost
        node of a level: */
        dtuple_set_info_bits(node_ptr,
                             dtuple_get_info_bits(node_ptr)
                             | REC_INFO_MIN_REC_FLAG);

        /* Rebuild the root page to get free space */
        if (UNIV_LIKELY_NULL(root_page_zip)) {
                page_create_zip(root_block, index, level + 1, mtr);
        } else {
                page_create(root_block, mtr, dict_table_is_comp(index->table));
                btr_page_set_level(root, NULL, level + 1, mtr);
        }

        /* Set the next node and previous node fields, although
        they should already have been set.  The previous node field
        must be FIL_NULL if root_page_zip != NULL, because the
        REC_INFO_MIN_REC_FLAG (of the first user record) will be
        set if and only if btr_page_get_prev() == FIL_NULL. */
        btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
        btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);

        root_block->check_index_page_at_flush = TRUE;

        page_cursor = btr_cur_get_page_cur(cursor);

        /* Insert node pointer to the root */

        page_cur_set_before_first(root_block, page_cursor);

        node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
                                             index, 0, mtr);

        /* The root page should only contain the node pointer
        to new_page at this point.  Thus, the data should fit. */
        ut_a(node_ptr_rec);

        /* Free the memory heap */
        mem_heap_free(heap);

        /* We play safe and reset the free bits for the new page */

#if 0
        fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
#endif

        if (!dict_index_is_clust(index)) {
                ibuf_reset_free_bits(new_block);
        }

        /* Reposition the cursor to the child node */
        page_cur_search(new_block, index, tuple,
                        PAGE_CUR_LE, page_cursor);

        /* Split the child and insert tuple */
        return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
}

/*****************************************************************
Decides if the page should be split at the convergence point of inserts
converging to the left. */
UNIV_INTERN
ibool
btr_page_get_split_rec_to_left(
/*===========================*/
                                /* out: TRUE if split recommended */
        btr_cur_t*      cursor, /* in: cursor at which to insert */
        rec_t**         split_rec) /* out: if split recommended,
                                the first record on upper half page,
                                or NULL if tuple to be inserted should
                                be first */
{
        page_t* page;
        rec_t*  insert_point;
        rec_t*  infimum;

        page = btr_cur_get_page(cursor);
        insert_point = btr_cur_get_rec(cursor);

        if (page_header_get_ptr(page, PAGE_LAST_INSERT)
            == page_rec_get_next(insert_point)) {

                infimum = page_get_infimum_rec(page);

                /* If the convergence is in the middle of a page, include also
                the record immediately before the new insert to the upper
                page. Otherwise, we could repeatedly move from page to page
                lots of records smaller than the convergence point. */

                if (infimum != insert_point
                    && page_rec_get_next(infimum) != insert_point) {

                        *split_rec = insert_point;
                } else {
                        *split_rec = page_rec_get_next(insert_point);
                }

                return(TRUE);
        }

        return(FALSE);
}
1269
 
 
1270
/*****************************************************************
 
1271
Decides if the page should be split at the convergence point of inserts
 
1272
converging to the right. */
 
1273
UNIV_INTERN
 
1274
ibool
 
1275
btr_page_get_split_rec_to_right(
 
1276
/*============================*/
 
1277
                                /* out: TRUE if split recommended */
 
1278
        btr_cur_t*      cursor, /* in: cursor at which to insert */
 
1279
        rec_t**         split_rec) /* out: if split recommended,
 
1280
                                the first record on upper half page,
 
1281
                                or NULL if tuple to be inserted should
 
1282
                                be first */
 
1283
{
 
1284
        page_t* page;
 
1285
        rec_t*  insert_point;
 
1286
 
 
1287
        page = btr_cur_get_page(cursor);
 
1288
        insert_point = btr_cur_get_rec(cursor);
 
1289
 
 
1290
        /* We use eager heuristics: if the new insert would be right after
 
1291
        the previous insert on the same page, we assume that there is a
 
1292
        pattern of sequential inserts here. */
 
1293
 
 
1294
        if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
 
1295
                        == insert_point)) {
 
1296
 
 
1297
                rec_t*  next_rec;
 
1298
 
 
1299
                next_rec = page_rec_get_next(insert_point);
 
1300
 
 
1301
                if (page_rec_is_supremum(next_rec)) {
 
1302
split_at_new:
 
1303
                        /* Split at the new record to insert */
 
1304
                        *split_rec = NULL;
 
1305
                } else {
 
1306
                        rec_t*  next_next_rec = page_rec_get_next(next_rec);
 
1307
                        if (page_rec_is_supremum(next_next_rec)) {
 
1308
 
 
1309
                                goto split_at_new;
 
1310
                        }
 
1311
 
 
1312
                        /* If there are >= 2 user records up from the insert
 
1313
                        point, split all but 1 off. We want to keep one because
 
1314
                        then sequential inserts can use the adaptive hash
 
1315
                        index, as they can do the necessary checks of the right
 
1316
                        search position just by looking at the records on this
 
1317
                        page. */
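                        /* For example (record names purely illustrative):
                        if the page ends
                        ..., insert_point, A, B, supremum
                        and the new record goes right after insert_point,
                        we split in front of B: record A stays on this
                        page, so the next sequential insert can still
                        verify its search position here, while B moves
                        to the new page. */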
 
1318
 
 
1319
                        *split_rec = next_next_rec;
 
1320
                }
 
1321
 
 
1322
                return(TRUE);
 
1323
        }
 
1324
 
 
1325
        return(FALSE);
 
1326
}
 
1327
 
 
1328
/*****************************************************************
 
1329
Calculates a split record such that the tuple will certainly fit on
 
1330
its half-page when the split is performed. We assume in this function
 
1331
only that the cursor page has at least one user record. */
 
1332
static
 
1333
rec_t*
 
1334
btr_page_get_sure_split_rec(
 
1335
/*========================*/
 
1336
                                /* out: split record, or NULL if tuple
 
1337
                                will be the first record on upper half-page */
 
1338
        btr_cur_t*      cursor, /* in: cursor at which insert should be made */
 
1339
        const dtuple_t* tuple,  /* in: tuple to insert */
 
1340
        ulint           n_ext)  /* in: number of externally stored columns */
 
1341
{
 
1342
        page_t*         page;
 
1343
        page_zip_des_t* page_zip;
 
1344
        ulint           insert_size;
 
1345
        ulint           free_space;
 
1346
        ulint           total_data;
 
1347
        ulint           total_n_recs;
 
1348
        ulint           total_space;
 
1349
        ulint           incl_data;
 
1350
        rec_t*          ins_rec;
 
1351
        rec_t*          rec;
 
1352
        rec_t*          next_rec;
 
1353
        ulint           n;
 
1354
        mem_heap_t*     heap;
 
1355
        ulint*          offsets;
 
1356
 
 
1357
        page = btr_cur_get_page(cursor);
 
1358
 
 
1359
        insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
 
1360
        free_space  = page_get_free_space_of_empty(page_is_comp(page));
 
1361
 
 
1362
        page_zip = btr_cur_get_page_zip(cursor);
 
1363
        if (UNIV_LIKELY_NULL(page_zip)) {
 
1364
                /* Estimate the free space of an empty compressed page. */
 
1365
                ulint   free_space_zip = page_zip_empty_size(
 
1366
                        cursor->index->n_fields,
 
1367
                        page_zip_get_size(page_zip));
 
1368
 
 
1369
                if (UNIV_LIKELY(free_space > (ulint) free_space_zip)) {
 
1370
                        free_space = (ulint) free_space_zip;
 
1371
                }
 
1372
        }
 
1373
 
 
1374
        /* free_space is now the free space of a created new page */
 
1375
 
 
1376
        total_data   = page_get_data_size(page) + insert_size;
 
1377
        total_n_recs = page_get_n_recs(page) + 1;
 
1378
        ut_ad(total_n_recs >= 2);
 
1379
        total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);
 
1380
 
 
1381
        n = 0;
 
1382
        incl_data = 0;
 
1383
        ins_rec = btr_cur_get_rec(cursor);
 
1384
        rec = page_get_infimum_rec(page);
 
1385
 
 
1386
        heap = NULL;
 
1387
        offsets = NULL;
 
1388
 
 
1389
        /* We start by including records in the left half. Once the
 
1390
        space they reserve exceeds half of total_space, the records
 
1391
        included so far go to the left page, provided that they fit
 
1392
        there and that something is still left over for the right page;
 
1393
        otherwise the last included record becomes the first record on
 
1394
        the right half-page. */
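        /* Worked example (figures illustrative only): with total_space
        around 16000 bytes and each record reserving roughly 200 bytes
        including its directory slot, the loop below stops after about
        40 records, i.e. as soon as the records included so far reserve
        at least total_space / 2 = 8000 bytes. */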
 
1395
 
 
1396
        do {
 
1397
                /* Decide the next record to include */
 
1398
                if (rec == ins_rec) {
 
1399
                        rec = NULL;     /* NULL denotes that tuple is
 
1400
                                        now included */
 
1401
                } else if (rec == NULL) {
 
1402
                        rec = page_rec_get_next(ins_rec);
 
1403
                } else {
 
1404
                        rec = page_rec_get_next(rec);
 
1405
                }
 
1406
 
 
1407
                if (rec == NULL) {
 
1408
                        /* Include tuple */
 
1409
                        incl_data += insert_size;
 
1410
                } else {
 
1411
                        offsets = rec_get_offsets(rec, cursor->index,
 
1412
                                                  offsets, ULINT_UNDEFINED,
 
1413
                                                  &heap);
 
1414
                        incl_data += rec_offs_size(offsets);
 
1415
                }
 
1416
 
 
1417
                n++;
 
1418
        } while (incl_data + page_dir_calc_reserved_space(n)
 
1419
                 < total_space / 2);
 
1420
 
 
1421
        if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
 
1422
                /* The next record will be the first on
 
1423
                the right half page if it is not the
 
1424
                supremum record of page */
 
1425
 
 
1426
                if (rec == ins_rec) {
 
1427
                        rec = NULL;
 
1428
 
 
1429
                        goto func_exit;
 
1430
                } else if (rec == NULL) {
 
1431
                        next_rec = page_rec_get_next(ins_rec);
 
1432
                } else {
 
1433
                        next_rec = page_rec_get_next(rec);
 
1434
                }
 
1435
                ut_ad(next_rec);
 
1436
                if (!page_rec_is_supremum(next_rec)) {
 
1437
                        rec = next_rec;
 
1438
                }
 
1439
        }
 
1440
 
 
1441
func_exit:
 
1442
        if (UNIV_LIKELY_NULL(heap)) {
 
1443
                mem_heap_free(heap);
 
1444
        }
 
1445
        return(rec);
 
1446
}
 
1447
 
 
1448
/*****************************************************************
 
1449
Returns TRUE if the insert fits on the appropriate half-page with the
 
1450
chosen split_rec. */
 
1451
static
 
1452
ibool
 
1453
btr_page_insert_fits(
 
1454
/*=================*/
 
1455
                                /* out: TRUE if fits */
 
1456
        btr_cur_t*      cursor, /* in: cursor at which insert
 
1457
                                should be made */
 
1458
        const rec_t*    split_rec,/* in: suggestion for first record
 
1459
                                on upper half-page, or NULL if
 
1460
                                tuple to be inserted should be first */
 
1461
        const ulint*    offsets,/* in: rec_get_offsets(
 
1462
                                split_rec, cursor->index) */
 
1463
        const dtuple_t* tuple,  /* in: tuple to insert */
 
1464
        ulint           n_ext,  /* in: number of externally stored columns */
 
1465
        mem_heap_t*     heap)   /* in: temporary memory heap */
 
1466
{
 
1467
        page_t*         page;
 
1468
        ulint           insert_size;
 
1469
        ulint           free_space;
 
1470
        ulint           total_data;
 
1471
        ulint           total_n_recs;
 
1472
        const rec_t*    rec;
 
1473
        const rec_t*    end_rec;
 
1474
        ulint*          offs;
 
1475
 
 
1476
        page = btr_cur_get_page(cursor);
 
1477
 
 
1478
        ut_ad(!split_rec == !offsets);
 
1479
        ut_ad(!offsets
 
1480
              || !page_is_comp(page) == !rec_offs_comp(offsets));
 
1481
        ut_ad(!offsets
 
1482
              || rec_offs_validate(split_rec, cursor->index, offsets));
 
1483
 
 
1484
        insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
 
1485
        free_space  = page_get_free_space_of_empty(page_is_comp(page));
 
1486
 
 
1487
        /* free_space is now the free space of a created new page */
 
1488
 
 
1489
        total_data   = page_get_data_size(page) + insert_size;
 
1490
        total_n_recs = page_get_n_recs(page) + 1;
 
1491
 
 
1492
        /* We determine which records (from rec to end_rec, not including
 
1493
        end_rec) will end up on the other half page from tuple when it is
 
1494
        inserted. */
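        /* The three cases below: if split_rec is NULL, the tuple becomes
        the first record on the upper half and the other half holds
        everything from the first user record up to and including the
        cursor record; if the tuple sorts at or above split_rec, the other
        half is [first user record, split_rec); otherwise the tuple goes
        below split_rec and the other half is [split_rec, supremum). */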
 
1495
 
 
1496
        if (split_rec == NULL) {
 
1497
                rec = page_rec_get_next(page_get_infimum_rec(page));
 
1498
                end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
 
1499
 
 
1500
        } else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
 
1501
 
 
1502
                rec = page_rec_get_next(page_get_infimum_rec(page));
 
1503
                end_rec = split_rec;
 
1504
        } else {
 
1505
                rec = split_rec;
 
1506
                end_rec = page_get_supremum_rec(page);
 
1507
        }
 
1508
 
 
1509
        if (total_data + page_dir_calc_reserved_space(total_n_recs)
 
1510
            <= free_space) {
 
1511
 
 
1512
                /* Ok, there will be enough available space on the
 
1513
                half page where the tuple is inserted */
 
1514
 
 
1515
                return(TRUE);
 
1516
        }
 
1517
 
 
1518
        offs = NULL;
 
1519
 
 
1520
        while (rec != end_rec) {
 
1521
                /* In this loop we calculate the amount of reserved
 
1522
                space after rec is removed from page. */
 
1523
 
 
1524
                offs = rec_get_offsets(rec, cursor->index, offs,
 
1525
                                       ULINT_UNDEFINED, &heap);
 
1526
 
 
1527
                total_data -= rec_offs_size(offs);
 
1528
                total_n_recs--;
 
1529
 
 
1530
                if (total_data + page_dir_calc_reserved_space(total_n_recs)
 
1531
                    <= free_space) {
 
1532
 
 
1533
                        /* Ok, there will be enough available space on the
 
1534
                        half page where the tuple is inserted */
 
1535
 
 
1536
                        return(TRUE);
 
1537
                }
 
1538
 
 
1539
                rec = page_rec_get_next_const(rec);
 
1540
        }
 
1541
 
 
1542
        return(FALSE);
 
1543
}
 
1544
 
 
1545
/***********************************************************
 
1546
Inserts a data tuple into a tree on a non-leaf level. It is assumed
 
1547
that mtr holds an x-latch on the tree. */
 
1548
UNIV_INTERN
 
1549
void
 
1550
btr_insert_on_non_leaf_level(
 
1551
/*=========================*/
 
1552
        dict_index_t*   index,  /* in: index */
 
1553
        ulint           level,  /* in: level, must be > 0 */
 
1554
        dtuple_t*       tuple,  /* in: the record to be inserted */
 
1555
        mtr_t*          mtr)    /* in: mtr */
 
1556
{
 
1557
        big_rec_t*      dummy_big_rec;
 
1558
        btr_cur_t       cursor;
 
1559
        ulint           err;
 
1560
        rec_t*          rec;
 
1561
 
 
1562
        ut_ad(level > 0);
 
1563
 
 
1564
        btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
 
1565
                                    BTR_CONT_MODIFY_TREE,
 
1566
                                    &cursor, 0, mtr);
 
1567
 
 
1568
        err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
 
1569
                                         | BTR_KEEP_SYS_FLAG
 
1570
                                         | BTR_NO_UNDO_LOG_FLAG,
 
1571
                                         &cursor, tuple, &rec,
 
1572
                                         &dummy_big_rec, 0, NULL, mtr);
 
1573
        ut_a(err == DB_SUCCESS);
 
1574
}
 
1575
 
 
1576
/******************************************************************
 
1577
Attaches the halves of an index page on the appropriate level in an
 
1578
index tree. */
 
1579
static
 
1580
void
 
1581
btr_attach_half_pages(
 
1582
/*==================*/
 
1583
        dict_index_t*   index,          /* in: the index tree */
 
1584
        buf_block_t*    block,          /* in/out: page to be split */
 
1585
        rec_t*          split_rec,      /* in: first record on upper
 
1586
                                        half page */
 
1587
        buf_block_t*    new_block,      /* in/out: the new half page */
 
1588
        ulint           direction,      /* in: FSP_UP or FSP_DOWN */
 
1589
        mtr_t*          mtr)            /* in: mtr */
 
1590
{
 
1591
        ulint           space;
 
1592
        ulint           zip_size;
 
1593
        ulint           prev_page_no;
 
1594
        ulint           next_page_no;
 
1595
        ulint           level;
 
1596
        page_t*         page            = buf_block_get_frame(block);
 
1597
        page_t*         lower_page;
 
1598
        page_t*         upper_page;
 
1599
        ulint           lower_page_no;
 
1600
        ulint           upper_page_no;
 
1601
        page_zip_des_t* lower_page_zip;
 
1602
        page_zip_des_t* upper_page_zip;
 
1603
        dtuple_t*       node_ptr_upper;
 
1604
        mem_heap_t*     heap;
 
1605
 
 
1606
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
1607
        ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
 
1608
 
 
1609
        /* Create a memory heap where the data tuple is stored */
 
1610
        heap = mem_heap_create(1024);
 
1611
 
 
1612
        /* Based on split direction, decide upper and lower pages */
 
1613
        if (direction == FSP_DOWN) {
 
1614
 
 
1615
                btr_cur_t       cursor;
 
1616
                ulint*          offsets;
 
1617
 
 
1618
                lower_page = buf_block_get_frame(new_block);
 
1619
                lower_page_no = buf_block_get_page_no(new_block);
 
1620
                lower_page_zip = buf_block_get_page_zip(new_block);
 
1621
                upper_page = buf_block_get_frame(block);
 
1622
                upper_page_no = buf_block_get_page_no(block);
 
1623
                upper_page_zip = buf_block_get_page_zip(block);
 
1624
 
 
1625
                /* Look up the node pointer to the page in its father */
 
1626
                offsets = btr_page_get_father_block(NULL, heap, index,
 
1627
                                                    block, mtr, &cursor);
 
1628
 
 
1629
                /* Replace the address of the old child node (= page) with the
 
1630
                address of the new lower half */
 
1631
 
 
1632
                btr_node_ptr_set_child_page_no(
 
1633
                        btr_cur_get_rec(&cursor),
 
1634
                        btr_cur_get_page_zip(&cursor),
 
1635
                        offsets, lower_page_no, mtr);
 
1636
                mem_heap_empty(heap);
 
1637
        } else {
 
1638
                lower_page = buf_block_get_frame(block);
 
1639
                lower_page_no = buf_block_get_page_no(block);
 
1640
                lower_page_zip = buf_block_get_page_zip(block);
 
1641
                upper_page = buf_block_get_frame(new_block);
 
1642
                upper_page_no = buf_block_get_page_no(new_block);
 
1643
                upper_page_zip = buf_block_get_page_zip(new_block);
 
1644
        }
 
1645
 
 
1646
        /* Get the level of the split pages */
 
1647
        level = btr_page_get_level(buf_block_get_frame(block), mtr);
 
1648
 
 
1649
        /* Build the node pointer (= node key and page address) for the upper
 
1650
        half */
 
1651
 
 
1652
        node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
 
1653
                                                   upper_page_no, heap, level);
 
1654
 
 
1655
        /* Insert it next to the pointer to the lower half. Note that this
 
1656
        may generate recursion leading to a split on the higher level. */
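        /* (The recursion goes through btr_insert_on_non_leaf_level():
        its pessimistic insert may in turn have to split the parent
        page.) */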
 
1657
 
 
1658
        btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
 
1659
 
 
1660
        /* Free the memory heap */
 
1661
        mem_heap_free(heap);
 
1662
 
 
1663
        /* Get the previous and next pages of page */
 
1664
 
 
1665
        prev_page_no = btr_page_get_prev(page, mtr);
 
1666
        next_page_no = btr_page_get_next(page, mtr);
 
1667
        space = buf_block_get_space(block);
 
1668
        zip_size = buf_block_get_zip_size(block);
 
1669
 
 
1670
        /* Update page links of the level */
 
1671
 
 
1672
        if (prev_page_no != FIL_NULL) {
 
1673
                buf_block_t*    prev_block = btr_block_get(space, zip_size,
 
1674
                                                           prev_page_no,
 
1675
                                                           RW_X_LATCH, mtr);
 
1676
#ifdef UNIV_BTR_DEBUG
 
1677
                ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
 
1678
                ut_a(btr_page_get_next(prev_block->frame, mtr)
 
1679
                     == buf_block_get_page_no(block));
 
1680
#endif /* UNIV_BTR_DEBUG */
 
1681
 
 
1682
                btr_page_set_next(buf_block_get_frame(prev_block),
 
1683
                                  buf_block_get_page_zip(prev_block),
 
1684
                                  lower_page_no, mtr);
 
1685
        }
 
1686
 
 
1687
        if (next_page_no != FIL_NULL) {
 
1688
                buf_block_t*    next_block = btr_block_get(space, zip_size,
 
1689
                                                           next_page_no,
 
1690
                                                           RW_X_LATCH, mtr);
 
1691
#ifdef UNIV_BTR_DEBUG
 
1692
                ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
 
1693
                ut_a(btr_page_get_prev(next_block->frame, mtr)
 
1694
                     == page_get_page_no(page));
 
1695
#endif /* UNIV_BTR_DEBUG */
 
1696
 
 
1697
                btr_page_set_prev(buf_block_get_frame(next_block),
 
1698
                                  buf_block_get_page_zip(next_block),
 
1699
                                  upper_page_no, mtr);
 
1700
        }
 
1701
 
 
1702
        btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
 
1703
        btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
 
1704
        btr_page_set_level(lower_page, lower_page_zip, level, mtr);
 
1705
 
 
1706
        btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
 
1707
        btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
 
1708
        btr_page_set_level(upper_page, upper_page_zip, level, mtr);
 
1709
}
 
1710
 
 
1711
/*****************************************************************
 
1712
Splits an index page into halves and inserts the tuple. It is assumed
 
1713
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch
 
1714
is released within this function! NOTE that the operation of this
 
1715
function must always succeed, as we cannot reverse it: therefore
 
1716
enough free disk space must be guaranteed to be available before
 
1717
this function is called. */
 
1718
UNIV_INTERN
 
1719
rec_t*
 
1720
btr_page_split_and_insert(
 
1721
/*======================*/
 
1722
                                /* out: inserted record; NOTE: the tree
 
1723
                                x-latch is released! NOTE: 2 free disk
 
1724
                                pages must be available! */
 
1725
        btr_cur_t*      cursor, /* in: cursor at which to insert; when the
 
1726
                                function returns, the cursor is positioned
 
1727
                                on the predecessor of the inserted record */
 
1728
        const dtuple_t* tuple,  /* in: tuple to insert */
 
1729
        ulint           n_ext,  /* in: number of externally stored columns */
 
1730
        mtr_t*          mtr)    /* in: mtr */
 
1731
{
 
1732
        buf_block_t*    block;
 
1733
        page_t*         page;
 
1734
        page_zip_des_t* page_zip;
 
1735
        ulint           page_no;
 
1736
        byte            direction;
 
1737
        ulint           hint_page_no;
 
1738
        buf_block_t*    new_block;
 
1739
        page_t*         new_page;
 
1740
        page_zip_des_t* new_page_zip;
 
1741
        rec_t*          split_rec;
 
1742
        buf_block_t*    left_block;
 
1743
        buf_block_t*    right_block;
 
1744
        buf_block_t*    insert_block;
 
1745
        page_t*         insert_page;
 
1746
        page_cur_t*     page_cursor;
 
1747
        rec_t*          first_rec;
 
1748
        byte*           buf = 0; /* remove warning */
 
1749
        rec_t*          move_limit;
 
1750
        ibool           insert_will_fit;
 
1751
        ibool           insert_left;
 
1752
        ulint           n_iterations = 0;
 
1753
        rec_t*          rec;
 
1754
        mem_heap_t*     heap;
 
1755
        ulint           n_uniq;
 
1756
        ulint*          offsets;
 
1757
 
 
1758
        heap = mem_heap_create(1024);
 
1759
        n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
 
1760
func_start:
 
1761
        mem_heap_empty(heap);
 
1762
        offsets = NULL;
 
1763
 
 
1764
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
 
1765
                                MTR_MEMO_X_LOCK));
 
1766
#ifdef UNIV_SYNC_DEBUG
 
1767
        ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
 
1768
#endif /* UNIV_SYNC_DEBUG */
 
1769
 
 
1770
        block = btr_cur_get_block(cursor);
 
1771
        page = buf_block_get_frame(block);
 
1772
        page_zip = buf_block_get_page_zip(block);
 
1773
 
 
1774
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
1775
        ut_ad(page_get_n_recs(page) >= 1);
 
1776
 
 
1777
        page_no = buf_block_get_page_no(block);
 
1778
 
 
1779
        /* 1. Decide the split record; split_rec == NULL means that the
 
1780
        tuple to be inserted should be the first record on the upper
 
1781
        half-page */
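        /* Decision order: on a retry (n_iterations > 0) pick a split
        record that is guaranteed to leave room for the tuple; otherwise
        prefer a split to the right when there is a pattern of sequential
        inserts, then a split to the left for inserts converging to the
        left, and finally fall back to splitting at the middle record of
        the page. */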
 
1782
 
 
1783
        if (n_iterations > 0) {
 
1784
                direction = FSP_UP;
 
1785
                hint_page_no = page_no + 1;
 
1786
                split_rec = btr_page_get_sure_split_rec(cursor, tuple, n_ext);
 
1787
 
 
1788
        } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
 
1789
                direction = FSP_UP;
 
1790
                hint_page_no = page_no + 1;
 
1791
 
 
1792
        } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
 
1793
                direction = FSP_DOWN;
 
1794
                hint_page_no = page_no - 1;
 
1795
        } else {
 
1796
                direction = FSP_UP;
 
1797
                hint_page_no = page_no + 1;
 
1798
                split_rec = page_get_middle_rec(page);
 
1799
        }
 
1800
 
 
1801
        /* 2. Allocate a new page to the index */
 
1802
        new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
 
1803
                                   btr_page_get_level(page, mtr), mtr);
 
1804
        new_page = buf_block_get_frame(new_block);
 
1805
        new_page_zip = buf_block_get_page_zip(new_block);
 
1806
        btr_page_create(new_block, new_page_zip, cursor->index,
 
1807
                        btr_page_get_level(page, mtr), mtr);
 
1808
 
 
1809
        /* 3. Calculate the first record on the upper half-page, and the
 
1810
        first record (move_limit) on the original page which ends up on the
 
1811
        upper half */
 
1812
 
 
1813
        if (split_rec) {
 
1814
                first_rec = move_limit = split_rec;
 
1815
 
 
1816
                offsets = rec_get_offsets(split_rec, cursor->index, offsets,
 
1817
                                          n_uniq, &heap);
 
1818
 
 
1819
                insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
 
1820
 
 
1821
                if (UNIV_UNLIKELY(!insert_left && new_page_zip
 
1822
                                  && n_iterations > 0)) {
 
1823
                        /* If a compressed page has already been split,
 
1824
                        avoid further splits by inserting the record
 
1825
                        to an empty page. */
 
1826
                        split_rec = NULL;
 
1827
                        goto insert_right;
 
1828
                }
 
1829
        } else {
 
1830
insert_right:
 
1831
                insert_left = FALSE;
 
1832
                buf = mem_alloc(rec_get_converted_size(cursor->index,
 
1833
                                                       tuple, n_ext));
 
1834
 
 
1835
                first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
 
1836
                                                      tuple, n_ext);
 
1837
                move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
 
1838
        }
 
1839
 
 
1840
        /* 4. Do first the modifications in the tree structure */
 
1841
 
 
1842
        btr_attach_half_pages(cursor->index, block,
 
1843
                              first_rec, new_block, direction, mtr);
 
1844
 
 
1845
        /* If the split is made on the leaf level and the insert will fit
 
1846
        on the appropriate half-page, we may release the tree x-latch.
 
1847
        We can then move the records after releasing the tree latch,
 
1848
        thus reducing the tree latch contention. */
 
1849
 
 
1850
        if (split_rec) {
 
1851
                insert_will_fit = !new_page_zip
 
1852
                        && btr_page_insert_fits(cursor, split_rec,
 
1853
                                                offsets, tuple, n_ext, heap);
 
1854
        } else {
 
1855
                mem_free(buf);
 
1856
                insert_will_fit = !new_page_zip
 
1857
                        && btr_page_insert_fits(cursor, NULL,
 
1858
                                                NULL, tuple, n_ext, heap);
 
1859
        }
 
1860
 
 
1861
        if (insert_will_fit && page_is_leaf(page)) {
 
1862
 
 
1863
                mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
 
1864
                                 MTR_MEMO_X_LOCK);
 
1865
        }
 
1866
 
 
1867
        /* 5. Then move the records to the new page */
 
1868
        if (direction == FSP_DOWN) {
 
1869
                /*              fputs("Split left\n", stderr); */
 
1870
 
 
1871
                if (UNIV_UNLIKELY
 
1872
                    (!page_move_rec_list_start(new_block, block, move_limit,
 
1873
                                               cursor->index, mtr))) {
 
1874
                        /* For some reason, compressing new_page failed,
 
1875
                        even though it should contain fewer records than
 
1876
                        the original page.  Copy the page byte for byte
 
1877
                        and then delete the records from both pages
 
1878
                        as appropriate.  Deleting will always succeed. */
 
1879
                        ut_a(new_page_zip);
 
1880
 
 
1881
                        page_zip_copy(new_page_zip, new_page,
 
1882
                                      page_zip, page, cursor->index, mtr);
 
1883
                        page_delete_rec_list_end(move_limit - page + new_page,
 
1884
                                                 new_block, cursor->index,
 
1885
                                                 ULINT_UNDEFINED,
 
1886
                                                 ULINT_UNDEFINED, mtr);
 
1887
                        page_delete_rec_list_start(move_limit, block,
 
1888
                                                   cursor->index, mtr);
 
1889
                }
 
1890
 
 
1891
                left_block = new_block;
 
1892
                right_block = block;
 
1893
 
 
1894
                lock_update_split_left(right_block, left_block);
 
1895
        } else {
 
1896
                /*              fputs("Split right\n", stderr); */
 
1897
 
 
1898
                if (UNIV_UNLIKELY
 
1899
                    (!page_move_rec_list_end(new_block, block, move_limit,
 
1900
                                             cursor->index, mtr))) {
 
1901
                        /* For some reason, compressing new_page failed,
 
1902
                        even though it should contain fewer records than
 
1903
                        the original page.  Copy the page byte for byte
 
1904
                        and then delete the records from both pages
 
1905
                        as appropriate.  Deleting will always succeed. */
 
1906
                        ut_a(new_page_zip);
 
1907
 
 
1908
                        page_zip_copy(new_page_zip, new_page,
 
1909
                                      page_zip, page, cursor->index, mtr);
 
1910
                        page_delete_rec_list_start(move_limit - page
 
1911
                                                   + new_page, new_block,
 
1912
                                                   cursor->index, mtr);
 
1913
                        page_delete_rec_list_end(move_limit, block,
 
1914
                                                 cursor->index,
 
1915
                                                 ULINT_UNDEFINED,
 
1916
                                                 ULINT_UNDEFINED, mtr);
 
1917
                }
 
1918
 
 
1919
                left_block = block;
 
1920
                right_block = new_block;
 
1921
 
 
1922
                lock_update_split_right(right_block, left_block);
 
1923
        }
 
1924
 
 
1925
#ifdef UNIV_ZIP_DEBUG
 
1926
        if (UNIV_LIKELY_NULL(page_zip)) {
 
1927
                ut_a(page_zip_validate(page_zip, page));
 
1928
                ut_a(page_zip_validate(new_page_zip, new_page));
 
1929
        }
 
1930
#endif /* UNIV_ZIP_DEBUG */
 
1931
 
 
1932
        /* At this point, split_rec, move_limit and first_rec may point
 
1933
        to garbage on the old page. */
 
1934
 
 
1935
        /* 6. The split and the tree modification is now completed. Decide the
 
1936
        page where the tuple should be inserted */
 
1937
 
 
1938
        if (insert_left) {
 
1939
                insert_block = left_block;
 
1940
        } else {
 
1941
                insert_block = right_block;
 
1942
        }
 
1943
 
 
1944
        insert_page = buf_block_get_frame(insert_block);
 
1945
 
 
1946
        /* 7. Reposition the cursor for insert and try insertion */
 
1947
        page_cursor = btr_cur_get_page_cur(cursor);
 
1948
 
 
1949
        page_cur_search(insert_block, cursor->index, tuple,
 
1950
                        PAGE_CUR_LE, page_cursor);
 
1951
 
 
1952
        rec = page_cur_tuple_insert(page_cursor, tuple,
 
1953
                                    cursor->index, n_ext, mtr);
 
1954
 
 
1955
#ifdef UNIV_ZIP_DEBUG
 
1956
        {
 
1957
                page_zip_des_t* insert_page_zip
 
1958
                        = buf_block_get_page_zip(insert_block);
 
1959
                ut_a(!insert_page_zip
 
1960
                     || page_zip_validate(insert_page_zip, insert_page));
 
1961
        }
 
1962
#endif /* UNIV_ZIP_DEBUG */
 
1963
 
 
1964
        if (UNIV_LIKELY(rec != NULL)) {
 
1965
 
 
1966
                goto func_exit;
 
1967
        }
 
1968
 
 
1969
        /* 8. If insert did not fit, try page reorganization */
 
1970
 
 
1971
        if (UNIV_UNLIKELY
 
1972
            (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
 
1973
 
 
1974
                goto insert_failed;
 
1975
        }
 
1976
 
 
1977
        page_cur_search(insert_block, cursor->index, tuple,
 
1978
                        PAGE_CUR_LE, page_cursor);
 
1979
        rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
 
1980
                                    n_ext, mtr);
 
1981
 
 
1982
        if (UNIV_UNLIKELY(rec == NULL)) {
 
1983
                /* The insert did not fit on the page: loop back to the
 
1984
                start of the function for a new split */
 
1985
insert_failed:
 
1986
                /* We play it safe and reset the free bits for new_page */
 
1987
                if (!dict_index_is_clust(cursor->index)) {
 
1988
                        ibuf_reset_free_bits(new_block);
 
1989
                }
 
1990
 
 
1991
                /* fprintf(stderr, "Split second round %lu\n",
 
1992
                page_get_page_no(page)); */
 
1993
                n_iterations++;
 
1994
                ut_ad(n_iterations < 2
 
1995
                      || buf_block_get_page_zip(insert_block));
 
1996
                ut_ad(!insert_will_fit);
 
1997
 
 
1998
                goto func_start;
 
1999
        }
 
2000
 
 
2001
func_exit:
 
2002
        /* The insert fit on the page: update the free bits for the
 
2003
        left and right pages in the same mtr */
 
2004
 
 
2005
        if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
 
2006
                ibuf_update_free_bits_for_two_pages_low(
 
2007
                        buf_block_get_zip_size(left_block),
 
2008
                        left_block, right_block, mtr);
 
2009
        }
 
2010
 
 
2011
#if 0
 
2012
        fprintf(stderr, "Split and insert done %lu %lu\n",
 
2013
                buf_block_get_page_no(left_block),
 
2014
                buf_block_get_page_no(right_block));
 
2015
#endif
 
2016
 
 
2017
        ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
 
2018
        ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
 
2019
 
 
2020
        mem_heap_free(heap);
 
2021
        return(rec);
 
2022
}
 
2023
 
 
2024
/*****************************************************************
 
2025
Removes a page from the level list of pages. */
 
2026
static
 
2027
void
 
2028
btr_level_list_remove(
 
2029
/*==================*/
 
2030
        ulint           space,  /* in: space where removed */
 
2031
        ulint           zip_size,/* in: compressed page size in bytes
 
2032
                                or 0 for uncompressed pages */
 
2033
        page_t*         page,   /* in: page to remove */
 
2034
        mtr_t*          mtr)    /* in: mtr */
 
2035
{
 
2036
        ulint   prev_page_no;
 
2037
        ulint   next_page_no;
 
2038
 
 
2039
        ut_ad(page && mtr);
 
2040
        ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
 
2041
        ut_ad(space == page_get_space_id(page));
 
2042
        /* Get the previous and next page numbers of page */
 
2043
 
 
2044
        prev_page_no = btr_page_get_prev(page, mtr);
 
2045
        next_page_no = btr_page_get_next(page, mtr);
 
2046
 
 
2047
        /* Update page links of the level */
 
2048
 
 
2049
        if (prev_page_no != FIL_NULL) {
 
2050
                buf_block_t*    prev_block
 
2051
                        = btr_block_get(space, zip_size, prev_page_no,
 
2052
                                        RW_X_LATCH, mtr);
 
2053
                page_t*         prev_page
 
2054
                        = buf_block_get_frame(prev_block);
 
2055
#ifdef UNIV_BTR_DEBUG
 
2056
                ut_a(page_is_comp(prev_page) == page_is_comp(page));
 
2057
                ut_a(btr_page_get_next(prev_page, mtr)
 
2058
                     == page_get_page_no(page));
 
2059
#endif /* UNIV_BTR_DEBUG */
 
2060
 
 
2061
                btr_page_set_next(prev_page,
 
2062
                                  buf_block_get_page_zip(prev_block),
 
2063
                                  next_page_no, mtr);
 
2064
        }
 
2065
 
 
2066
        if (next_page_no != FIL_NULL) {
 
2067
                buf_block_t*    next_block
 
2068
                        = btr_block_get(space, zip_size, next_page_no,
 
2069
                                        RW_X_LATCH, mtr);
 
2070
                page_t*         next_page
 
2071
                        = buf_block_get_frame(next_block);
 
2072
#ifdef UNIV_BTR_DEBUG
 
2073
                ut_a(page_is_comp(next_page) == page_is_comp(page));
 
2074
                ut_a(btr_page_get_prev(next_page, mtr)
 
2075
                     == page_get_page_no(page));
 
2076
#endif /* UNIV_BTR_DEBUG */
 
2077
 
 
2078
                btr_page_set_prev(next_page,
 
2079
                                  buf_block_get_page_zip(next_block),
 
2080
                                  prev_page_no, mtr);
 
2081
        }
 
2082
}
 
2083
 
 
2084
/********************************************************************
 
2085
Writes the redo log record for setting an index record as the predefined
 
2086
minimum record. */
 
2087
UNIV_INLINE
 
2088
void
 
2089
btr_set_min_rec_mark_log(
 
2090
/*=====================*/
 
2091
        rec_t*  rec,    /* in: record */
 
2092
        byte    type,   /* in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
 
2093
        mtr_t*  mtr)    /* in: mtr */
 
2094
{
 
2095
        mlog_write_initial_log_record(rec, type, mtr);
 
2096
 
 
2097
        /* Write rec offset as a 2-byte ulint */
 
2098
        mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
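        /* The complete redo record is thus the initial header written by
        mlog_write_initial_log_record() followed by this 2-byte page
        offset of rec; btr_parse_set_min_rec_mark() below consumes exactly
        those two bytes. */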
 
2099
}
 
2100
 
 
2101
/********************************************************************
 
2102
Parses the redo log record for setting an index record as the predefined
 
2103
minimum record. */
 
2104
UNIV_INTERN
 
2105
byte*
 
2106
btr_parse_set_min_rec_mark(
 
2107
/*=======================*/
 
2108
                        /* out: end of log record or NULL */
 
2109
        byte*   ptr,    /* in: buffer */
 
2110
        byte*   end_ptr,/* in: buffer end */
 
2111
        ulint   comp,   /* in: nonzero=compact page format */
 
2112
        page_t* page,   /* in: page or NULL */
 
2113
        mtr_t*  mtr)    /* in: mtr or NULL */
 
2114
{
 
2115
        rec_t*  rec;
 
2116
 
 
2117
        if (end_ptr < ptr + 2) {
 
2118
 
 
2119
                return(NULL);
 
2120
        }
 
2121
 
 
2122
        if (page) {
 
2123
                ut_a(!page_is_comp(page) == !comp);
 
2124
 
 
2125
                rec = page + mach_read_from_2(ptr);
 
2126
 
 
2127
                btr_set_min_rec_mark(rec, mtr);
 
2128
        }
 
2129
 
 
2130
        return(ptr + 2);
 
2131
}
 
2132
 
 
2133
/********************************************************************
 
2134
Sets a record as the predefined minimum record. */
 
2135
UNIV_INTERN
 
2136
void
 
2137
btr_set_min_rec_mark(
 
2138
/*=================*/
 
2139
        rec_t*  rec,    /* in: record */
 
2140
        mtr_t*  mtr)    /* in: mtr */
 
2141
{
 
2142
        ulint   info_bits;
 
2143
 
 
2144
        if (UNIV_LIKELY(page_rec_is_comp(rec))) {
 
2145
                info_bits = rec_get_info_bits(rec, TRUE);
 
2146
 
 
2147
                rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
 
2148
 
 
2149
                btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
 
2150
        } else {
 
2151
                info_bits = rec_get_info_bits(rec, FALSE);
 
2152
 
 
2153
                rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
 
2154
 
 
2155
                btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
 
2156
        }
 
2157
}
 
2158
 
 
2159
/*****************************************************************
 
2160
Deletes the node pointer to a page on the upper level. */
 
2161
UNIV_INTERN
 
2162
void
 
2163
btr_node_ptr_delete(
 
2164
/*================*/
 
2165
        dict_index_t*   index,  /* in: index tree */
 
2166
        buf_block_t*    block,  /* in: page whose node pointer is deleted */
 
2167
        mtr_t*          mtr)    /* in: mtr */
 
2168
{
 
2169
        btr_cur_t       cursor;
 
2170
        ibool           compressed;
 
2171
        ulint           err;
 
2172
 
 
2173
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
2174
 
 
2175
        /* Delete node pointer on father page */
 
2176
        btr_page_get_father(index, block, mtr, &cursor);
 
2177
 
 
2178
        compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, FALSE,
 
2179
                                                mtr);
 
2180
        ut_a(err == DB_SUCCESS);
 
2181
 
 
2182
        if (!compressed) {
 
2183
                btr_cur_compress_if_useful(&cursor, mtr);
 
2184
        }
 
2185
}
 
2186
 
 
2187
/*****************************************************************
 
2188
If the page is the only one on its level, this function moves its records to the
 
2189
father page, thus reducing the tree height. */
 
2190
static
 
2191
void
 
2192
btr_lift_page_up(
 
2193
/*=============*/
 
2194
        dict_index_t*   index,  /* in: index tree */
 
2195
        buf_block_t*    block,  /* in: page which is the only one on its level;
 
2196
                                must not be empty: use
 
2197
                                btr_discard_only_page_on_level if the last
 
2198
                                record from the page should be removed */
 
2199
        mtr_t*          mtr)    /* in: mtr */
 
2200
{
 
2201
        buf_block_t*    father_block;
 
2202
        page_t*         father_page;
 
2203
        ulint           page_level;
 
2204
        page_zip_des_t* father_page_zip;
 
2205
        page_t*         page            = buf_block_get_frame(block);
 
2206
        ulint           root_page_no;
 
2207
        buf_block_t*    blocks[BTR_MAX_LEVELS];
 
2208
        ulint           n_blocks;       /* last used index in blocks[] */
 
2209
        ulint           i;
 
2210
 
 
2211
        ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
 
2212
        ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
 
2213
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
2214
 
 
2215
        page_level = btr_page_get_level(page, mtr);
 
2216
        root_page_no = dict_index_get_page(index);
 
2217
 
 
2218
        {
 
2219
                btr_cur_t       cursor;
 
2220
                mem_heap_t*     heap    = mem_heap_create(100);
 
2221
                ulint*          offsets;
 
2222
                buf_block_t*    b;
 
2223
 
 
2224
                offsets = btr_page_get_father_block(NULL, heap, index,
 
2225
                                                    block, mtr, &cursor);
 
2226
                father_block = btr_cur_get_block(&cursor);
 
2227
                father_page_zip = buf_block_get_page_zip(father_block);
 
2228
                father_page = buf_block_get_frame(father_block);
 
2229
 
 
2230
                n_blocks = 0;
 
2231
 
 
2232
                /* Store all ancestor pages so we can reset their
 
2233
                levels later on.  We have to do all the searches on
 
2234
                the tree now because later on, after we've replaced
 
2235
                the first level, the tree is in an inconsistent state
 
2236
                and cannot be searched. */
 
2237
                for (b = father_block;
 
2238
                     buf_block_get_page_no(b) != root_page_no; ) {
 
2239
                        ut_a(n_blocks < BTR_MAX_LEVELS);
 
2240
 
 
2241
                        offsets = btr_page_get_father_block(offsets, heap,
 
2242
                                                            index, b,
 
2243
                                                            mtr, &cursor);
 
2244
 
 
2245
                        blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
 
2246
                }
 
2247
 
 
2248
                mem_heap_free(heap);
 
2249
        }
 
2250
 
 
2251
        btr_search_drop_page_hash_index(block);
 
2252
 
 
2253
        /* Make the father empty */
 
2254
        btr_page_empty(father_block, father_page_zip, mtr, index);
 
2255
        /* Set the level before inserting records, because
 
2256
        page_zip_compress() requires that the first user record
 
2257
        on a non-leaf page has the min_rec_mark set. */
 
2258
        btr_page_set_level(father_page, father_page_zip, page_level, mtr);
 
2259
 
 
2260
        /* Copy the records to the father page one by one. */
 
2261
        if (UNIV_UNLIKELY
 
2262
            (!page_copy_rec_list_end(father_block, block,
 
2263
                                     page_get_infimum_rec(page),
 
2264
                                     index, mtr))) {
 
2265
                const page_zip_des_t*   page_zip
 
2266
                        = buf_block_get_page_zip(block);
 
2267
                ut_a(father_page_zip);
 
2268
                ut_a(page_zip);
 
2269
 
 
2270
                /* Copy the page byte for byte. */
 
2271
                page_zip_copy(father_page_zip, father_page,
 
2272
                              page_zip, page, index, mtr);
 
2273
        }
 
2274
 
 
2275
        lock_update_copy_and_discard(father_block, block);
 
2276
 
 
2277
        /* Go upward to root page, decrementing levels by one. */
 
2278
        for (i = 0; i < n_blocks; i++, page_level++) {
 
2279
                page_t* page = buf_block_get_frame(blocks[i]);
 
2280
 
 
2281
                ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
 
2282
 
 
2283
                btr_page_set_level(page, buf_block_get_page_zip(blocks[i]),
 
2284
                                   page_level, mtr);
 
2285
        }
 
2286
 
 
2287
        /* Free the file page */
 
2288
        btr_page_free(index, block, mtr);
 
2289
 
 
2290
        /* We play it safe and reset the free bits for the father */
 
2291
        if (!dict_index_is_clust(index)) {
 
2292
                ibuf_reset_free_bits(father_block);
 
2293
        }
 
2294
        ut_ad(page_validate(father_page, index));
 
2295
        ut_ad(btr_check_node_ptr(index, father_block, mtr));
 
2296
}
 
2297
 
 
2298
/*****************************************************************
 
2299
Tries to merge the page first with its immediate left brother, if such a
 
2300
brother exists, and the node pointers to the current page and to the brother
 
2301
reside on the same page. If the left brother does not satisfy these
 
2302
conditions, looks at the right brother. If the page is the only one on that
 
2303
level, it lifts the records of the page to the father page, thus reducing the
 
2304
tree height. It is assumed that mtr holds an x-latch on the tree and on the
 
2305
page. If cursor is on the leaf level, mtr must also hold x-latches to the
 
2306
brothers, if they exist. */
 
2307
UNIV_INTERN
 
2308
ibool
 
2309
btr_compress(
 
2310
/*=========*/
 
2311
                                /* out: TRUE on success */
 
2312
        btr_cur_t*      cursor, /* in: cursor on the page to merge or lift;
 
2313
                                the page must not be empty: in record delete
 
2314
                                use btr_discard_page if the page would become
 
2315
                                empty */
 
2316
        mtr_t*          mtr)    /* in: mtr */
 
2317
{
 
2318
        dict_index_t*   index;
 
2319
        ulint           space;
 
2320
        ulint           zip_size;
 
2321
        ulint           left_page_no;
 
2322
        ulint           right_page_no;
 
2323
        buf_block_t*    merge_block;
 
2324
        page_t*         merge_page;
 
2325
        page_zip_des_t* merge_page_zip;
 
2326
        ibool           is_left;
 
2327
        buf_block_t*    block;
 
2328
        page_t*         page;
 
2329
        btr_cur_t       father_cursor;
 
2330
        mem_heap_t*     heap;
 
2331
        ulint*          offsets;
 
2332
        ulint           data_size;
 
2333
        ulint           n_recs;
 
2334
        ulint           max_ins_size;
 
2335
        ulint           max_ins_size_reorg;
 
2336
        ulint           level;
 
2337
 
 
2338
        block = btr_cur_get_block(cursor);
 
2339
        page = btr_cur_get_page(cursor);
 
2340
        index = btr_cur_get_index(cursor);
 
2341
        ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
 
2342
 
 
2343
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
 
2344
                                MTR_MEMO_X_LOCK));
 
2345
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
2346
        level = btr_page_get_level(page, mtr);
 
2347
        space = dict_index_get_space(index);
 
2348
        zip_size = dict_table_zip_size(index->table);
 
2349
 
 
2350
        left_page_no = btr_page_get_prev(page, mtr);
 
2351
        right_page_no = btr_page_get_next(page, mtr);
 
2352
 
 
2353
#if 0
 
2354
        fprintf(stderr, "Merge left page %lu right %lu \n",
 
2355
                left_page_no, right_page_no);
 
2356
#endif
 
2357
 
 
2358
        heap = mem_heap_create(100);
 
2359
        offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
 
2360
                                            &father_cursor);
 
2361
 
 
2362
        /* Decide the page to which we try to merge and which will inherit
 
2363
        the locks */
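        /* The left brother is preferred; if there is none, the right
        brother is used, and if the page has no brothers at all it must be
        the only page on its level, so its records are lifted to the
        father page instead. */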
 
2364
 
 
2365
        is_left = left_page_no != FIL_NULL;
 
2366
 
 
2367
        if (is_left) {
 
2368
 
 
2369
                merge_block = btr_block_get(space, zip_size, left_page_no,
 
2370
                                            RW_X_LATCH, mtr);
 
2371
                merge_page = buf_block_get_frame(merge_block);
 
2372
#ifdef UNIV_BTR_DEBUG
 
2373
                ut_a(btr_page_get_next(merge_page, mtr)
 
2374
                     == buf_block_get_page_no(block));
 
2375
#endif /* UNIV_BTR_DEBUG */
 
2376
        } else if (right_page_no != FIL_NULL) {
 
2377
 
 
2378
                merge_block = btr_block_get(space, zip_size, right_page_no,
 
2379
                                            RW_X_LATCH, mtr);
 
2380
                merge_page = buf_block_get_frame(merge_block);
 
2381
#ifdef UNIV_BTR_DEBUG
 
2382
                ut_a(btr_page_get_prev(merge_page, mtr)
 
2383
                     == buf_block_get_page_no(block));
 
2384
#endif /* UNIV_BTR_DEBUG */
 
2385
        } else {
 
2386
                /* The page is the only one on the level, lift the records
 
2387
                to the father */
 
2388
                btr_lift_page_up(index, block, mtr);
 
2389
                mem_heap_free(heap);
 
2390
                return(TRUE);
 
2391
        }
 
2392
 
 
2393
        n_recs = page_get_n_recs(page);
 
2394
        data_size = page_get_data_size(page);
 
2395
#ifdef UNIV_BTR_DEBUG
 
2396
        ut_a(page_is_comp(merge_page) == page_is_comp(page));
 
2397
#endif /* UNIV_BTR_DEBUG */
 
2398
 
 
2399
        max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
 
2400
                merge_page, n_recs);
 
2401
        if (data_size > max_ins_size_reorg) {
 
2402
 
 
2403
                /* No space for merge */
 
2404
err_exit:
 
2405
                /* We play it safe and reset the free bits. */
 
2406
                if (zip_size
 
2407
                    && page_is_leaf(merge_page)
 
2408
                    && !dict_index_is_clust(index)) {
 
2409
                        ibuf_reset_free_bits(merge_block);
 
2410
                }
 
2411
 
 
2412
                mem_heap_free(heap);
 
2413
                return(FALSE);
 
2414
        }
 
2415
 
 
2416
        ut_ad(page_validate(merge_page, index));
 
2417
 
 
2418
        max_ins_size = page_get_max_insert_size(merge_page, n_recs);
 
2419
 
 
2420
        if (UNIV_UNLIKELY(data_size > max_ins_size)) {
 
2421
 
 
2422
                /* We have to reorganize merge_page */
 
2423
 
 
2424
                if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
 
2425
                                                       index, mtr))) {
 
2426
 
 
2427
                        goto err_exit;
 
2428
                }
 
2429
 
 
2430
                max_ins_size = page_get_max_insert_size(merge_page, n_recs);
 
2431
 
 
2432
                ut_ad(page_validate(merge_page, index));
 
2433
                ut_ad(max_ins_size == max_ins_size_reorg);
 
2434
 
 
2435
                if (UNIV_UNLIKELY(data_size > max_ins_size)) {
 
2436
 
 
2437
                        /* Add fault tolerance, though this should
 
2438
                        never happen */
 
2439
 
 
2440
                        goto err_exit;
 
2441
                }
 
2442
        }
 
2443
 
 
2444
        merge_page_zip = buf_block_get_page_zip(merge_block);
 
2445
#ifdef UNIV_ZIP_DEBUG
 
2446
        if (UNIV_LIKELY_NULL(merge_page_zip)) {
 
2447
                const page_zip_des_t*   page_zip
 
2448
                        = buf_block_get_page_zip(block);
 
2449
                ut_a(page_zip);
 
2450
                ut_a(page_zip_validate(merge_page_zip, merge_page));
 
2451
                ut_a(page_zip_validate(page_zip, page));
 
2452
        }
 
2453
#endif /* UNIV_ZIP_DEBUG */
 
2454
 
 
2455
        /* Move records to the merge page */
 
2456
        if (is_left) {
 
2457
                rec_t*  orig_pred = page_copy_rec_list_start(
 
2458
                        merge_block, block, page_get_supremum_rec(page),
 
2459
                        index, mtr);
 
2460
 
 
2461
                if (UNIV_UNLIKELY(!orig_pred)) {
 
2462
                        goto err_exit;
                }

                btr_search_drop_page_hash_index(block);

                /* Remove the page from the level list */
                btr_level_list_remove(space, zip_size, page, mtr);

                btr_node_ptr_delete(index, block, mtr);
                lock_update_merge_left(merge_block, orig_pred, block);
        } else {
                rec_t*          orig_succ;
#ifdef UNIV_BTR_DEBUG
                byte            fil_page_prev[4];
#endif /* UNIV_BTR_DEBUG */

                if (UNIV_LIKELY_NULL(merge_page_zip)) {
                        /* The function page_zip_compress(), which will be
                        invoked by page_copy_rec_list_end() below,
                        requires that FIL_PAGE_PREV be FIL_NULL.
                        Clear the field, but prepare to restore it. */
#ifdef UNIV_BTR_DEBUG
                        memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
#endif /* UNIV_BTR_DEBUG */
#if FIL_NULL != 0xffffffff
# error "FIL_NULL != 0xffffffff"
#endif
                        memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
                }

                orig_succ = page_copy_rec_list_end(merge_block, block,
                                                   page_get_infimum_rec(page),
                                                   cursor->index, mtr);

                if (UNIV_UNLIKELY(!orig_succ)) {
                        ut_a(merge_page_zip);
#ifdef UNIV_BTR_DEBUG
                        /* FIL_PAGE_PREV was restored from merge_page_zip. */
                        ut_a(!memcmp(fil_page_prev,
                                     merge_page + FIL_PAGE_PREV, 4));
#endif /* UNIV_BTR_DEBUG */
                        goto err_exit;
                }

                btr_search_drop_page_hash_index(block);

#ifdef UNIV_BTR_DEBUG
                if (UNIV_LIKELY_NULL(merge_page_zip)) {
                        /* Restore FIL_PAGE_PREV in order to avoid an assertion
                        failure in btr_level_list_remove(), which will set
                        the field again to FIL_NULL.  Even though this makes
                        merge_page and merge_page_zip inconsistent for a
                        split second, it is harmless, because the pages
                        are X-latched. */
                        memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
                }
#endif /* UNIV_BTR_DEBUG */

                /* Remove the page from the level list */
                btr_level_list_remove(space, zip_size, page, mtr);

                /* Replace the address of the old child node (= page) with the
                address of the merge page to the right */

                btr_node_ptr_set_child_page_no(
                        btr_cur_get_rec(&father_cursor),
                        btr_cur_get_page_zip(&father_cursor),
                        offsets, right_page_no, mtr);
                btr_node_ptr_delete(index, merge_block, mtr);

                lock_update_merge_right(merge_block, orig_succ, block);
        }

        mem_heap_free(heap);

        if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
                /* Update the free bits of the B-tree page in the
                insert buffer bitmap.  This has to be done in a
                separate mini-transaction that is committed before the
                main mini-transaction.  We cannot update the insert
                buffer bitmap in this mini-transaction, because
                btr_compress() can be invoked recursively without
                committing the mini-transaction in between.  Since
                insert buffer bitmap pages have a lower rank than
                B-tree pages, we must not access other pages in the
                same mini-transaction after accessing an insert buffer
                bitmap page. */

                /* The free bits in the insert buffer bitmap must
                never exceed the free space on a page.  It is safe to
                decrement or reset the bits in the bitmap in a
                mini-transaction that is committed before the
                mini-transaction that affects the free space. */

                /* It is unsafe to increment the bits in a separately
                committed mini-transaction, because in crash recovery,
                the free bits could momentarily be set too high. */

                if (zip_size) {
                        /* Because the free bits may be incremented
                        and we cannot update the insert buffer bitmap
                        in the same mini-transaction, the only safe
                        thing we can do here is the pessimistic
                        approach: reset the free bits. */
                        ibuf_reset_free_bits(merge_block);
                } else {
                        /* On uncompressed pages, the free bits will
                        never increase here.  Thus, it is safe to
                        write the bits accurately in a separate
                        mini-transaction. */
                        ibuf_update_free_bits_if_full(merge_block,
                                                      UNIV_PAGE_SIZE,
                                                      ULINT_UNDEFINED);
                }
        }

        ut_ad(page_validate(merge_page, index));

        /* Free the file page */
        btr_page_free(index, block, mtr);

        ut_ad(btr_check_node_ptr(index, merge_block, mtr));
        return(TRUE);
}
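/* [Editorial sketch -- not part of the original btr0btr.c.]  The comment
blocks at the end of btr_compress() state a rule for maintaining the insert
buffer bitmap from a separately committed mini-transaction: the bits may be
lowered or reset there, but never raised, or crash recovery could briefly
see more free space than the page really has.  The helper below is
hypothetical; it only restates the resulting two-way decision in one place,
using the same calls as above. */
static void
btr_compress_free_bits_sketch(
        buf_block_t*    merge_block,    /* in: leaf that received the records */
        ulint           zip_size)       /* in: compressed page size, or 0 */
{
        if (zip_size) {
                /* Compressed page: the exact value might be higher than
                what the bitmap currently says, so only a reset is safe. */
                ibuf_reset_free_bits(merge_block);
        } else {
                /* Uncompressed page: the free space can only have shrunk
                here, so writing the exact value cannot raise the bits. */
                ibuf_update_free_bits_if_full(merge_block, UNIV_PAGE_SIZE,
                                              ULINT_UNDEFINED);
        }
}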
 
/*****************************************************************
Discards a page that is the only page on its level. */
static
void
btr_discard_only_page_on_level(
/*===========================*/
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: page which is the only one on its level */
 
        mtr_t*          mtr)    /* in: mtr */
{
        btr_cur_t       father_cursor;
        buf_block_t*    father_block;
        page_t*         father_page;
        page_zip_des_t* father_page_zip;
        page_t*         page            = buf_block_get_frame(block);
        ulint           page_level;

        ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
        ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        btr_search_drop_page_hash_index(block);

        btr_page_get_father(index, block, mtr, &father_cursor);
        father_block = btr_cur_get_block(&father_cursor);
        father_page_zip = buf_block_get_page_zip(father_block);
        father_page = buf_block_get_frame(father_block);

        page_level = btr_page_get_level(page, mtr);

        lock_update_discard(father_block, PAGE_HEAP_NO_SUPREMUM, block);

        btr_page_set_level(father_page, father_page_zip, page_level, mtr);

        /* Free the file page */
        btr_page_free(index, block, mtr);

        if (UNIV_LIKELY(buf_block_get_page_no(father_block)
                        == dict_index_get_page(index))) {
                /* The father is the root page */

                btr_page_empty(father_block, father_page_zip, mtr, index);

                /* We play safe and reset the free bits for the father */
                if (!dict_index_is_clust(index)) {
                        ibuf_reset_free_bits(father_block);
                }
        } else {
                ut_ad(page_get_n_recs(father_page) == 1);

                btr_discard_only_page_on_level(index, father_block, mtr);
        }
}

/*****************************************************************
Discards a page from a B-tree. This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time. This cannot
be used for the root page, which is allowed to be empty. */
UNIV_INTERN
void
btr_discard_page(
/*=============*/
        btr_cur_t*      cursor, /* in: cursor on the page to discard: not on
                                the root page */
        mtr_t*          mtr)    /* in: mtr */
{
        dict_index_t*   index;
        ulint           space;
        ulint           zip_size;
        ulint           left_page_no;
        ulint           right_page_no;
        buf_block_t*    merge_block;
        page_t*         merge_page;
        buf_block_t*    block;
        page_t*         page;
        rec_t*          node_ptr;

        block = btr_cur_get_block(cursor);
        index = btr_cur_get_index(cursor);

        ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
                                MTR_MEMO_X_LOCK));
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        space = dict_index_get_space(index);
        zip_size = dict_table_zip_size(index->table);

        /* Decide the page which will inherit the locks */

        left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
        right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);

        if (left_page_no != FIL_NULL) {
                merge_block = btr_block_get(space, zip_size, left_page_no,
                                            RW_X_LATCH, mtr);
                merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
                ut_a(btr_page_get_next(merge_page, mtr)
                     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
        } else if (right_page_no != FIL_NULL) {
                merge_block = btr_block_get(space, zip_size, right_page_no,
                                            RW_X_LATCH, mtr);
                merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
                ut_a(btr_page_get_prev(merge_page, mtr)
                     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
        } else {
                btr_discard_only_page_on_level(index, block, mtr);

                return;
        }

        page = buf_block_get_frame(block);
        ut_a(page_is_comp(merge_page) == page_is_comp(page));
        btr_search_drop_page_hash_index(block);

        if (left_page_no == FIL_NULL && !page_is_leaf(page)) {

                /* We have to mark the leftmost node pointer on the right
                side page as the predefined minimum record */
                node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));

                ut_ad(page_rec_is_user_rec(node_ptr));

                /* This will make page_zip_validate() fail on merge_page
                until btr_level_list_remove() completes.  This is harmless,
                because everything will take place within a single
                mini-transaction and because writing to the redo log
                is an atomic operation (performed by mtr_commit()). */
                btr_set_min_rec_mark(node_ptr, mtr);
        }

        btr_node_ptr_delete(index, block, mtr);

        /* Remove the page from the level list */
        btr_level_list_remove(space, zip_size, page, mtr);
#ifdef UNIV_ZIP_DEBUG
        {
                page_zip_des_t* merge_page_zip
                        = buf_block_get_page_zip(merge_block);
                ut_a(!merge_page_zip
                     || page_zip_validate(merge_page_zip, merge_page));
        }
#endif /* UNIV_ZIP_DEBUG */

        if (left_page_no != FIL_NULL) {
                lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
                                    block);
        } else {
                lock_update_discard(merge_block,
                                    lock_get_min_heap_no(merge_block),
                                    block);
        }

        /* Free the file page */
        btr_page_free(index, block, mtr);

        ut_ad(btr_check_node_ptr(index, merge_block, mtr));
}
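/* [Editorial sketch -- not part of the original btr0btr.c.]  btr_discard_page()
above picks the sibling that absorbs the discarded page and inherits its
record locks: the left sibling if there is one, otherwise the right sibling,
otherwise the page is the only one on its level and the tree is shrunk via
btr_discard_only_page_on_level() instead.  The enum and helper below are
hypothetical and only restate that three-way decision. */
enum btr_discard_target_sketch {
        BTR_DISCARD_TO_LEFT,    /* locks move next to the left page's supremum */
        BTR_DISCARD_TO_RIGHT,   /* locks move next to the right page's first rec */
        BTR_DISCARD_ONLY_PAGE   /* no sibling: shrink the tree instead */
};

static enum btr_discard_target_sketch
btr_discard_target(
        ulint   left_page_no,   /* in: FIL_PAGE_PREV of the page */
        ulint   right_page_no)  /* in: FIL_PAGE_NEXT of the page */
{
        if (left_page_no != FIL_NULL) {
                return(BTR_DISCARD_TO_LEFT);
        } else if (right_page_no != FIL_NULL) {
                return(BTR_DISCARD_TO_RIGHT);
        } else {
                return(BTR_DISCARD_ONLY_PAGE);
        }
}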
 
#ifdef UNIV_BTR_PRINT
/*****************************************************************
Prints size info of a B-tree. */
UNIV_INTERN
void
btr_print_size(
/*===========*/
        dict_index_t*   index)  /* in: index tree */
{
        page_t*         root;
        fseg_header_t*  seg;
        mtr_t           mtr;

        if (dict_index_is_ibuf(index)) {
                fputs("Sorry, cannot print info of an ibuf tree:"
                      " use ibuf functions\n", stderr);

                return;
        }

        mtr_start(&mtr);

        root = btr_root_get(index, &mtr);

        seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

        fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
        fseg_print(seg, &mtr);

        if (!(index->type & DICT_UNIVERSAL)) {

                seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

                fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
                fseg_print(seg, &mtr);
        }

        mtr_commit(&mtr);
}

/****************************************************************
Prints recursively index tree pages. */
static
void
btr_print_recursive(
/*================*/
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: index page */
        ulint           width,  /* in: print this many entries from start
                                and end */
        mem_heap_t**    heap,   /* in/out: heap for rec_get_offsets() */
        ulint**         offsets,/* in/out: buffer for rec_get_offsets() */
        mtr_t*          mtr)    /* in: mtr */
{
        const page_t*   page    = buf_block_get_frame(block);
        page_cur_t      cursor;
        ulint           n_recs;
        ulint           i       = 0;
        mtr_t           mtr2;

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
                (ulong) btr_page_get_level(page, mtr),
                (ulong) buf_block_get_page_no(block));

        page_print(block, index, width, width);

        n_recs = page_get_n_recs(page);

        page_cur_set_before_first(block, &cursor);
        page_cur_move_to_next(&cursor);

        while (!page_cur_is_after_last(&cursor)) {

                if (page_is_leaf(page)) {

                        /* If this is the leaf level, do nothing */

                } else if ((i <= width) || (i >= n_recs - width)) {

                        const rec_t*    node_ptr;

                        mtr_start(&mtr2);

                        node_ptr = page_cur_get_rec(&cursor);

                        *offsets = rec_get_offsets(node_ptr, index, *offsets,
                                                   ULINT_UNDEFINED, heap);
                        btr_print_recursive(index,
                                            btr_node_ptr_get_child(node_ptr,
                                                                   index,
                                                                   *offsets,
                                                                   &mtr2),
                                            width, heap, offsets, &mtr2);
                        mtr_commit(&mtr2);
                }

                page_cur_move_to_next(&cursor);
                i++;
        }
}

/******************************************************************
Prints directories and other info of all nodes in the tree. */
UNIV_INTERN
void
btr_print_index(
/*============*/
        dict_index_t*   index,  /* in: index */
        ulint           width)  /* in: print this many entries from start
                                and end */
{
        mtr_t           mtr;
        buf_block_t*    root;
        mem_heap_t*     heap    = NULL;
        ulint           offsets_[REC_OFFS_NORMAL_SIZE];
        ulint*          offsets = offsets_;
        rec_offs_init(offsets_);

        fputs("--------------------------\n"
              "INDEX TREE PRINT\n", stderr);

        mtr_start(&mtr);

        root = btr_root_block_get(index, &mtr);

        btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
        if (UNIV_LIKELY_NULL(heap)) {
                mem_heap_free(heap);
        }

        mtr_commit(&mtr);

        btr_validate_index(index, NULL);
}
#endif /* UNIV_BTR_PRINT */

#ifdef UNIV_DEBUG
/****************************************************************
Checks that the node pointer to a page is appropriate. */
UNIV_INTERN
ibool
btr_check_node_ptr(
/*===============*/
                                /* out: TRUE */
        dict_index_t*   index,  /* in: index tree */
        buf_block_t*    block,  /* in: index page */
        mtr_t*          mtr)    /* in: mtr */
{
        mem_heap_t*     heap;
        dtuple_t*       tuple;
        ulint*          offsets;
        btr_cur_t       cursor;
        page_t*         page = buf_block_get_frame(block);

        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
        if (dict_index_get_page(index) == buf_block_get_page_no(block)) {

                return(TRUE);
        }

        heap = mem_heap_create(256);
        offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
                                            &cursor);

        if (page_is_leaf(page)) {

                goto func_exit;
        }

        tuple = dict_index_build_node_ptr(
                index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
                btr_page_get_level(page, mtr));

        ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
func_exit:
        mem_heap_free(heap);

        return(TRUE);
}
#endif /* UNIV_DEBUG */
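/* [Editorial sketch -- not part of the original btr0btr.c.]  btr_check_node_ptr()
above builds a node pointer from the page's first user record and its level,
and requires it to compare equal to the record actually stored in the father
(the leaf case is skipped, as in the code).  The toy model below, using only
standard C, shows the shape of that check; the struct and names are
hypothetical, and memcmp() is already available in this file. */
struct toy_node_ptr {
        char            key_prefix[8];  /* unique prefix of the child's first record */
        unsigned long   child_page_no;  /* file page number of the child */
};

static int
toy_check_node_ptr(
        const struct toy_node_ptr*      node_ptr,        /* in: entry in the father */
        const char*                     child_first_key, /* in: child's first key, 8 bytes */
        unsigned long                   child_page_no)   /* in: child's page number */
{
        return(node_ptr->child_page_no == child_page_no
               && !memcmp(node_ptr->key_prefix, child_first_key,
                          sizeof node_ptr->key_prefix));
}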
 
/****************************************************************
Display identification information for a record. */
static
void
btr_index_rec_validate_report(
/*==========================*/
        const page_t*           page,   /* in: index page */
        const rec_t*            rec,    /* in: index record */
        const dict_index_t*     index)  /* in: index */
{
        fputs("InnoDB: Record in ", stderr);
        dict_index_name_print(stderr, NULL, index);
        fprintf(stderr, ", page %lu, at offset %lu\n",
                page_get_page_no(page), (ulint) page_offset(rec));
}

/****************************************************************
Checks the size and number of fields in a record based on the definition of
the index. */
UNIV_INTERN
ibool
btr_index_rec_validate(
/*===================*/
                                                /* out: TRUE if ok */
        const rec_t*            rec,            /* in: index record */
        const dict_index_t*     index,          /* in: index */
        ibool                   dump_on_error)  /* in: TRUE if the function
                                                should print hex dump of record
                                                and page on error */
{
        ulint           len;
        ulint           n;
        ulint           i;
        const page_t*   page;
        mem_heap_t*     heap    = NULL;
        ulint           offsets_[REC_OFFS_NORMAL_SIZE];
        ulint*          offsets = offsets_;
        rec_offs_init(offsets_);

        page = page_align(rec);

        if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
                /* The insert buffer index tree can contain records from any
                other index: we cannot check the number of fields or
                their length */

                return(TRUE);
        }

        if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
                          != dict_table_is_comp(index->table))) {
                btr_index_rec_validate_report(page, rec, index);
                fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
                        (ulong) !!page_is_comp(page),
                        (ulong) dict_table_is_comp(index->table));

                return(FALSE);
        }

        n = dict_index_get_n_fields(index);

        if (!page_is_comp(page)
            && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
                btr_index_rec_validate_report(page, rec, index);
                fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
                        (ulong) rec_get_n_fields_old(rec), (ulong) n);

                if (dump_on_error) {
                        buf_page_print(page, 0);

                        fputs("InnoDB: corrupt record ", stderr);
                        rec_print_old(stderr, rec);
                        putc('\n', stderr);
                }
                return(FALSE);
        }

        offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

        for (i = 0; i < n; i++) {
                ulint   fixed_size = dict_col_get_fixed_size(
                        dict_index_get_nth_col(index, i));

                rec_get_nth_field_offs(offsets, i, &len);

                /* Note that if fixed_size != 0, it equals the
                length of a fixed-size column in the clustered index.
                A prefix index of the column is of fixed, but different
                length.  When fixed_size == 0, prefix_len is the maximum
                length of the prefix index column. */

                if ((dict_index_get_nth_field(index, i)->prefix_len == 0
                     && len != UNIV_SQL_NULL && fixed_size
                     && len != fixed_size)
                    || (dict_index_get_nth_field(index, i)->prefix_len > 0
                        && len != UNIV_SQL_NULL
                        && len
                        > dict_index_get_nth_field(index, i)->prefix_len)) {

                        btr_index_rec_validate_report(page, rec, index);
                        fprintf(stderr,
                                "InnoDB: field %lu len is %lu,"
                                " should be %lu\n",
                                (ulong) i, (ulong) len, (ulong) fixed_size);

                        if (dump_on_error) {
                                buf_page_print(page, 0);

                                fputs("InnoDB: corrupt record ", stderr);
                                rec_print_new(stderr, rec, offsets);
                                putc('\n', stderr);
                        }
                        if (UNIV_LIKELY_NULL(heap)) {
                                mem_heap_free(heap);
                        }
                        return(FALSE);
                }
        }

        if (UNIV_LIKELY_NULL(heap)) {
                mem_heap_free(heap);
        }
        return(TRUE);
}
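/* [Editorial sketch -- not part of the original btr0btr.c.]  The loop in
btr_index_rec_validate() above rejects a field when (a) it stores the full
column (prefix_len == 0), the column has a fixed size, the value is not SQL
NULL and the stored length differs from that fixed size, or (b) it stores a
column prefix (prefix_len > 0), the value is not SQL NULL and the stored
length exceeds the prefix length.  The hypothetical helper below isolates
that predicate. */
static ibool
btr_rec_field_len_ok(
        ulint   len,            /* in: stored field length, or UNIV_SQL_NULL */
        ulint   fixed_size,     /* in: fixed column size, or 0 if variable */
        ulint   prefix_len)     /* in: prefix length, or 0 for a full column */
{
        if (len == UNIV_SQL_NULL) {
                /* SQL NULL values are never length-checked above. */
                return(TRUE);
        }
        if (prefix_len == 0) {
                /* Full column: a fixed-size column must match exactly. */
                return((fixed_size == 0 || len == fixed_size) ? TRUE : FALSE);
        }
        /* Column prefix: the stored length may not exceed the prefix. */
        return((len <= prefix_len) ? TRUE : FALSE);
}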
 
/****************************************************************
Checks the size and number of fields in records based on the definition of
the index. */
static
ibool
btr_index_page_validate(
/*====================*/
                                /* out: TRUE if ok */
        buf_block_t*    block,  /* in: index page */
        dict_index_t*   index)  /* in: index */
{
        page_cur_t      cur;
        ibool           ret     = TRUE;

        page_cur_set_before_first(block, &cur);
        page_cur_move_to_next(&cur);

        for (;;) {
                if (page_cur_is_after_last(&cur)) {

                        break;
                }

                if (!btr_index_rec_validate(cur.rec, index, TRUE)) {

                        return(FALSE);
                }

                page_cur_move_to_next(&cur);
        }

        return(ret);
}
 
/****************************************************************
Report an error on one page of an index tree. */
static
void
btr_validate_report1(
/*=================*/
        dict_index_t*           index,  /* in: index */
        ulint                   level,  /* in: B-tree level */
        const buf_block_t*      block)  /* in: index page */
{
        fprintf(stderr, "InnoDB: Error in page %lu of ",
                buf_block_get_page_no(block));
        dict_index_name_print(stderr, NULL, index);
        if (level) {
                fprintf(stderr, ", index tree level %lu", level);
        }
        putc('\n', stderr);
}

/****************************************************************
Report an error on two pages of an index tree. */
static
void
btr_validate_report2(
/*=================*/
        const dict_index_t*     index,  /* in: index */
        ulint                   level,  /* in: B-tree level */
        const buf_block_t*      block1, /* in: first index page */
        const buf_block_t*      block2) /* in: second index page */
{
        fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
                buf_block_get_page_no(block1),
                buf_block_get_page_no(block2));
        dict_index_name_print(stderr, NULL, index);
        if (level) {
                fprintf(stderr, ", index tree level %lu", level);
        }
        putc('\n', stderr);
}
 
/****************************************************************
Validates index tree level. */
static
ibool
btr_validate_level(
/*===============*/
                                /* out: TRUE if ok */
        dict_index_t*   index,  /* in: index tree */
        trx_t*          trx,    /* in: transaction or NULL */
        ulint           level)  /* in: level number */
{
        ulint           space;
        ulint           zip_size;
        buf_block_t*    block;
        page_t*         page;
        buf_block_t*    right_block = 0; /* remove warning */
        page_t*         right_page = 0; /* remove warning */
        page_t*         father_page;
        btr_cur_t       node_cur;
        btr_cur_t       right_node_cur;
        rec_t*          rec;
        ulint           right_page_no;
        ulint           left_page_no;
        page_cur_t      cursor;
        dtuple_t*       node_ptr_tuple;
        ibool           ret     = TRUE;
        mtr_t           mtr;
        mem_heap_t*     heap    = mem_heap_create(256);
        ulint*          offsets = NULL;
        ulint*          offsets2= NULL;
#ifdef UNIV_ZIP_DEBUG
        page_zip_des_t* page_zip;
#endif /* UNIV_ZIP_DEBUG */

        mtr_start(&mtr);

        mtr_x_lock(dict_index_get_lock(index), &mtr);

        block = btr_root_block_get(index, &mtr);
        page = buf_block_get_frame(block);

        space = dict_index_get_space(index);
        zip_size = dict_table_zip_size(index->table);

        while (level != btr_page_get_level(page, &mtr)) {
                const rec_t*    node_ptr;

                ut_a(space == buf_block_get_space(block));
                ut_a(space == page_get_space_id(page));
#ifdef UNIV_ZIP_DEBUG
                page_zip = buf_block_get_page_zip(block);
                ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
                ut_a(!page_is_leaf(page));

                page_cur_set_before_first(block, &cursor);
                page_cur_move_to_next(&cursor);

                node_ptr = page_cur_get_rec(&cursor);
                offsets = rec_get_offsets(node_ptr, index, offsets,
                                          ULINT_UNDEFINED, &heap);
                block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
                page = buf_block_get_frame(block);
        }

        /* Now we are on the desired level. Loop through the pages on that
        level. */
loop:
        if (trx_is_interrupted(trx)) {
                mtr_commit(&mtr);
                mem_heap_free(heap);
                return(ret);
        }
        mem_heap_empty(heap);
        offsets = offsets2 = NULL;
        mtr_x_lock(dict_index_get_lock(index), &mtr);

#ifdef UNIV_ZIP_DEBUG
        page_zip = buf_block_get_page_zip(block);
        ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

        /* Check ordering etc. of records */

        if (!page_validate(page, index)) {
                btr_validate_report1(index, level, block);

                ret = FALSE;
        } else if (level == 0) {
                /* We are on level 0. Check that the records have the right
                number of fields, and field lengths are right. */

                if (!btr_index_page_validate(block, index)) {

                        ret = FALSE;
                }
        }

        ut_a(btr_page_get_level(page, &mtr) == level);

        right_page_no = btr_page_get_next(page, &mtr);
        left_page_no = btr_page_get_prev(page, &mtr);

        ut_a(page_get_n_recs(page) > 0 || (level == 0
                                           && page_get_page_no(page)
                                           == dict_index_get_page(index)));

        if (right_page_no != FIL_NULL) {
                const rec_t*    right_rec;
                right_block = btr_block_get(space, zip_size, right_page_no,
                                            RW_X_LATCH, &mtr);
                right_page = buf_block_get_frame(right_block);
                if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
                                  != page_get_page_no(page))) {
                        btr_validate_report2(index, level, block, right_block);
                        fputs("InnoDB: broken FIL_PAGE_NEXT"
                              " or FIL_PAGE_PREV links\n", stderr);
                        buf_page_print(page, 0);
                        buf_page_print(right_page, 0);

                        ret = FALSE;
                }

                if (UNIV_UNLIKELY(page_is_comp(right_page)
                                  != page_is_comp(page))) {
                        btr_validate_report2(index, level, block, right_block);
                        fputs("InnoDB: 'compact' flag mismatch\n", stderr);
                        buf_page_print(page, 0);
                        buf_page_print(right_page, 0);

                        ret = FALSE;

                        goto node_ptr_fails;
                }

                rec = page_rec_get_prev(page_get_supremum_rec(page));
                right_rec = page_rec_get_next(page_get_infimum_rec(
                                                      right_page));
                offsets = rec_get_offsets(rec, index,
                                          offsets, ULINT_UNDEFINED, &heap);
                offsets2 = rec_get_offsets(right_rec, index,
                                           offsets2, ULINT_UNDEFINED, &heap);
                if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
                                              offsets, offsets2,
                                              index) >= 0)) {

                        btr_validate_report2(index, level, block, right_block);

                        fputs("InnoDB: records in wrong order"
                              " on adjacent pages\n", stderr);

                        buf_page_print(page, 0);
                        buf_page_print(right_page, 0);

                        fputs("InnoDB: record ", stderr);
                        rec = page_rec_get_prev(page_get_supremum_rec(page));
                        rec_print(stderr, rec, index);
                        putc('\n', stderr);
                        fputs("InnoDB: record ", stderr);
                        rec = page_rec_get_next(
                                page_get_infimum_rec(right_page));
                        rec_print(stderr, rec, index);
                        putc('\n', stderr);

                        ret = FALSE;
                }
        }

        if (level > 0 && left_page_no == FIL_NULL) {
                ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
                             page_rec_get_next(page_get_infimum_rec(page)),
                             page_is_comp(page)));
        }

        if (buf_block_get_page_no(block) != dict_index_get_page(index)) {

                /* Check father node pointers */

                rec_t*  node_ptr;

                offsets = btr_page_get_father_block(offsets, heap, index,
                                                    block, &mtr, &node_cur);
                father_page = btr_cur_get_page(&node_cur);
                node_ptr = btr_cur_get_rec(&node_cur);

                btr_cur_position(
                        index, page_rec_get_prev(page_get_supremum_rec(page)),
                        block, &node_cur);
                offsets = btr_page_get_father_node_ptr(offsets, heap,
                                                       &node_cur, &mtr);

                if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
                    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
                                                                    offsets)
                                     != buf_block_get_page_no(block))) {

                        btr_validate_report1(index, level, block);

                        fputs("InnoDB: node pointer to the page is wrong\n",
                              stderr);

                        buf_page_print(father_page, 0);
                        buf_page_print(page, 0);

                        fputs("InnoDB: node ptr ", stderr);
                        rec_print(stderr, node_ptr, index);

                        rec = btr_cur_get_rec(&node_cur);
                        fprintf(stderr, "\n"
                                "InnoDB: node ptr child page n:o %lu\n",
                                (ulong) btr_node_ptr_get_child_page_no(
                                        rec, offsets));

                        fputs("InnoDB: record on page ", stderr);
                        rec_print_new(stderr, rec, offsets);
                        putc('\n', stderr);
                        ret = FALSE;

                        goto node_ptr_fails;
                }

                if (!page_is_leaf(page)) {
                        node_ptr_tuple = dict_index_build_node_ptr(
                                index,
                                page_rec_get_next(page_get_infimum_rec(page)),
                                0, heap, btr_page_get_level(page, &mtr));

                        if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
                                           offsets)) {
                                const rec_t* first_rec = page_rec_get_next(
                                        page_get_infimum_rec(page));

                                btr_validate_report1(index, level, block);

                                buf_page_print(father_page, 0);
                                buf_page_print(page, 0);

                                fputs("InnoDB: Error: node ptrs differ"
                                      " on levels > 0\n"
                                      "InnoDB: node ptr ", stderr);
                                rec_print_new(stderr, node_ptr, offsets);
                                fputs("InnoDB: first rec ", stderr);
                                rec_print(stderr, first_rec, index);
                                putc('\n', stderr);
                                ret = FALSE;

                                goto node_ptr_fails;
                        }
                }

                if (left_page_no == FIL_NULL) {
                        ut_a(node_ptr == page_rec_get_next(
                                     page_get_infimum_rec(father_page)));
                        ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
                }

                if (right_page_no == FIL_NULL) {
                        ut_a(node_ptr == page_rec_get_prev(
                                     page_get_supremum_rec(father_page)));
                        ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
                } else {
                        const rec_t*    right_node_ptr
                                = page_rec_get_next(node_ptr);

                        offsets = btr_page_get_father_block(
                                offsets, heap, index, right_block,
                                &mtr, &right_node_cur);
                        if (right_node_ptr
                            != page_get_supremum_rec(father_page)) {

                                if (btr_cur_get_rec(&right_node_cur)
                                    != right_node_ptr) {
                                        ret = FALSE;
                                        fputs("InnoDB: node pointer to"
                                              " the right page is wrong\n",
                                              stderr);

                                        btr_validate_report1(index, level,
                                                             block);

                                        buf_page_print(father_page, 0);
                                        buf_page_print(page, 0);
                                        buf_page_print(right_page, 0);
                                }
                        } else {
                                page_t* right_father_page
                                        = btr_cur_get_page(&right_node_cur);

                                if (btr_cur_get_rec(&right_node_cur)
                                    != page_rec_get_next(
                                            page_get_infimum_rec(
                                                    right_father_page))) {
                                        ret = FALSE;
                                        fputs("InnoDB: node pointer 2 to"
                                              " the right page is wrong\n",
                                              stderr);

                                        btr_validate_report1(index, level,
                                                             block);

                                        buf_page_print(father_page, 0);
                                        buf_page_print(right_father_page, 0);
                                        buf_page_print(page, 0);
                                        buf_page_print(right_page, 0);
                                }

                                if (page_get_page_no(right_father_page)
                                    != btr_page_get_next(father_page, &mtr)) {

                                        ret = FALSE;
                                        fputs("InnoDB: node pointer 3 to"
                                              " the right page is wrong\n",
                                              stderr);

                                        btr_validate_report1(index, level,
                                                             block);

                                        buf_page_print(father_page, 0);
                                        buf_page_print(right_father_page, 0);
                                        buf_page_print(page, 0);
                                        buf_page_print(right_page, 0);
                                }
                        }
                }
        }

node_ptr_fails:
        /* Commit the mini-transaction to release the latch on 'page'.
        Re-acquire the latch on right_page, which will become 'page'
        on the next loop.  The page has already been checked. */
        mtr_commit(&mtr);

        if (right_page_no != FIL_NULL) {
                mtr_start(&mtr);

                block = btr_block_get(space, zip_size, right_page_no,
                                      RW_X_LATCH, &mtr);
                page = buf_block_get_frame(block);

                goto loop;
        }

        mem_heap_free(heap);
        return(ret);
}
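/* [Editorial sketch -- not part of the original btr0btr.c.]  Among the checks
performed by btr_validate_level() above is that the FIL_PAGE_PREV/FIL_PAGE_NEXT
fields form a consistent doubly linked list on each level: if a page has a
right neighbour, that neighbour's PREV pointer must point back at the page,
and both pages must agree on the 'compact' record format.  The hypothetical
helper below isolates that pairwise check, using only calls that appear above. */
static ibool
btr_siblings_consistent(
        page_t* page,           /* in: a page on the level */
        page_t* right_page,     /* in: the page named by its FIL_PAGE_NEXT */
        mtr_t*  mtr)            /* in: mtr holding x-latches on both pages */
{
        if (btr_page_get_prev(right_page, mtr) != page_get_page_no(page)) {
                /* Broken FIL_PAGE_NEXT/FIL_PAGE_PREV links. */
                return(FALSE);
        }
        /* Both pages must use the same record format. */
        return((page_is_comp(right_page) == page_is_comp(page))
               ? TRUE : FALSE);
}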
 
/******************************************************************
Checks the consistency of an index tree. */
UNIV_INTERN
ibool
btr_validate_index(
/*===============*/
                                /* out: TRUE if ok */
        dict_index_t*   index,  /* in: index */
        trx_t*          trx)    /* in: transaction or NULL */
{
        mtr_t   mtr;
        page_t* root;
        ulint   i;
        ulint   n;

        mtr_start(&mtr);
        mtr_x_lock(dict_index_get_lock(index), &mtr);

        root = btr_root_get(index, &mtr);
        n = btr_page_get_level(root, &mtr);

        for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
                if (!btr_validate_level(index, trx, n - i)) {

                        mtr_commit(&mtr);

                        return(FALSE);
                }
        }

        mtr_commit(&mtr);

        return(TRUE);
}
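/* [Editorial sketch -- not part of the original btr0btr.c.]  btr_validate_index()
above validates the levels from the root downwards: with n being the root
level, the loop visits levels n, n - 1, ..., 0 and stops early if a level
fails or the transaction is interrupted.  A hypothetical caller, mirroring
the call already made in btr_print_index(), might look like this. */
static void
btr_validate_index_usage_sketch(
        dict_index_t*   index)  /* in: index to check */
{
        /* A NULL trx means the validation cannot be interrupted. */
        if (!btr_validate_index(index, NULL)) {
                fputs("InnoDB: index tree failed validation\n", stderr);
        }
}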