/******************************************************
The B-tree

(c) 1994-1996 Innobase Oy

Created 6/2/1994 Heikki Tuuri
*******************************************************/

#include "btr0btr.h"

#ifdef UNIV_NONINL
#include "btr0btr.ic"
#endif

#include "fsp0fsp.h"
#include "page0page.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#include "trx0trx.h"

/*
Latching strategy of the InnoDB B-tree
--------------------------------------
A tree latch protects all non-leaf nodes of the tree. Each node of a tree
also has a latch of its own.

A B-tree operation normally first acquires an S-latch on the tree. It
searches down the tree and releases the tree latch when it has the
leaf node latch. To save CPU time we do not acquire any latch on
non-leaf nodes of the tree during a search; those pages are only
buffer-fixed.

If an operation needs to restructure the tree, it acquires an X-latch on
the tree before searching to a leaf node. If it needs, for example, to
split a leaf,
(1) InnoDB decides the split point in the leaf,
(2) allocates a new page,
(3) inserts the appropriate node pointer to the first non-leaf level,
(4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.

Node pointers
-------------
Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. In the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1 if there is
a next node pointer on the level, and P1 is its prefix.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to the leftmost node on any level of the tree.

File page allocation
--------------------
In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
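
/* Illustrative example (not from the original source): suppose a non-leaf
level holds two node pointers with prefixes P = 'A' and P1 = 'K'. Then the
child page of the 'A' pointer may hold records >= 'A' and < 'K', while the
child page of the 'K' pointer holds records >= 'K'. The node pointer to the
leftmost child on each level does not carry a meaningful prefix: it is
marked with the predefined minimum record bit instead. */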

/****************************************************************
Returns the upper level node pointer to a page. It is assumed that
mtr holds an x-latch on the tree. */
static
rec_t*
btr_page_get_father_node_ptr(
/*=========================*/
				/* out: pointer to node pointer record */
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page: must contain at least one
				user record */
	mtr_t*		mtr);	/* in: mtr */
/*****************************************************************
Empties an index page. */
static
void
btr_page_empty(
/*===========*/
	page_t*	page,	/* in: page to be emptied */
	mtr_t*	mtr);	/* in: mtr */
/*****************************************************************
Returns TRUE if the insert fits on the appropriate half-page
with the chosen split_rec. */
static
ibool
btr_page_insert_fits(
/*=================*/
					/* out: TRUE if fits */
	btr_cur_t*	cursor,		/* in: cursor at which insert
					should be made */
	rec_t*		split_rec,	/* in: suggestion for first record
					on upper half-page, or NULL if
					tuple should be first */
	const ulint*	offsets,	/* in: rec_get_offsets(
					split_rec, cursor->index) */
	dtuple_t*	tuple,		/* in: tuple to insert */
	mem_heap_t*	heap);		/* in: temporary memory heap */

/******************************************************************
Gets the root node of a tree and x-latches it. */

page_t*
btr_root_get(
/*=========*/
				/* out: root page, x-latched */
	dict_index_t*	index,	/* in: index tree */
	mtr_t*		mtr)	/* in: mtr */
{
	ulint	space;
	ulint	root_page_no;
	page_t*	root;

	space = dict_index_get_space(index);
	root_page_no = dict_index_get_page(index);

	root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr);
	ut_a((ibool)!!page_is_comp(root) == dict_table_is_comp(index->table));

	return(root);
}
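
/* Illustrative sketch (not part of the original file): how a caller might
fetch the root page inside its own mini-transaction. It assumes the usual
pattern of latching the index through dict_index_get_lock() before
descending into the tree. */
#if 0
{
	mtr_t	mtr;
	page_t*	root;

	mtr_start(&mtr);
	mtr_x_lock(dict_index_get_lock(index), &mtr);

	root = btr_root_get(index, &mtr);
	/* ... read or modify the tree while the latches are held ... */

	mtr_commit(&mtr);
}
#endif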

/*****************************************************************
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor. */

rec_t*
btr_get_prev_user_rec(
/*==================*/
			/* out: previous user record, NULL if there is none */
	rec_t*	rec,	/* in: record on leaf level */
	mtr_t*	mtr)	/* in: mtr holding a latch on the page, and if
			needed, also on the previous page */
{
	page_t*	page;
	page_t*	prev_page;
	ulint	prev_page_no;
	ulint	space;

	if (!page_rec_is_infimum(rec)) {

		rec_t*	prev_rec = page_rec_get_prev(rec);

		if (!page_rec_is_infimum(prev_rec)) {

			return(prev_rec);
		}
	}

	page = buf_frame_align(rec);
	prev_page_no = btr_page_get_prev(page, mtr);
	space = buf_frame_get_space_id(page);

	if (prev_page_no != FIL_NULL) {

		prev_page = buf_page_get_with_no_latch(space, prev_page_no,
						       mtr);
		/* The caller must already have a latch on the sibling page */
		ut_ad((mtr_memo_contains(mtr, buf_block_align(prev_page),
					 MTR_MEMO_PAGE_S_FIX))
		      || (mtr_memo_contains(mtr, buf_block_align(prev_page),
					    MTR_MEMO_PAGE_X_FIX)));
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(prev_page, mtr)
		     == buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
	}

	return(NULL);
}

/*****************************************************************
Gets pointer to the next user record in the tree. It is assumed that the
caller has appropriate latches on the page and its neighbor. */

rec_t*
btr_get_next_user_rec(
/*==================*/
			/* out: next user record, NULL if there is none */
	rec_t*	rec,	/* in: record on leaf level */
	mtr_t*	mtr)	/* in: mtr holding a latch on the page, and if
			needed, also on the next page */
{
	page_t*	page;
	page_t*	next_page;
	ulint	next_page_no;
	ulint	space;

	if (!page_rec_is_supremum(rec)) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (!page_rec_is_supremum(next_rec)) {

			return(next_rec);
		}
	}

	page = buf_frame_align(rec);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_frame_get_space_id(page);

	if (next_page_no != FIL_NULL) {

		next_page = buf_page_get_with_no_latch(space, next_page_no,
						       mtr);
		/* The caller must already have a latch on the sibling page */
		ut_ad((mtr_memo_contains(mtr, buf_block_align(next_page),
					 MTR_MEMO_PAGE_S_FIX))
		      || (mtr_memo_contains(mtr, buf_block_align(next_page),
					    MTR_MEMO_PAGE_X_FIX)));
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(next_page, mtr)
		     == buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		ut_a(page_is_comp(next_page) == page_is_comp(page));
		return(page_rec_get_next(page_get_infimum_rec(next_page)));
	}

	return(NULL);
}

/******************************************************************
Creates a new index page (not the root, and also not
used in page reorganization). */
static
void
btr_page_create(
/*============*/
	page_t*		page,	/* in: page to be created */
	dict_index_t*	index,	/* in: index */
	mtr_t*		mtr)	/* in: mtr */
{
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	page_create(page, mtr, dict_table_is_comp(index->table));
	buf_block_align(page)->check_index_page_at_flush = TRUE;

	btr_page_set_index_id(page, index->id, mtr);
}

/******************************************************************
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages! */
static
page_t*
btr_page_alloc_for_ibuf(
/*====================*/
				/* out: newly allocated page, x-latched */
	dict_index_t*	index,	/* in: index tree */
	mtr_t*		mtr)	/* in: mtr */
{
	fil_addr_t	node_addr;
	page_t*		root;
	page_t*		new_page;

	root = btr_root_get(index, mtr);

	node_addr = flst_get_first(root + PAGE_HEADER
				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
	ut_a(node_addr.page != FIL_NULL);

	new_page = buf_page_get(dict_index_get_space(index), node_addr.page,
				RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
	buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */

	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
		    mtr);
	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));

	return(new_page);
}

/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! */

page_t*
btr_page_alloc(
/*===========*/
					/* out: newly allocated page,
					x-latched; NULL if out of space */
	dict_index_t*	index,		/* in: index */
	ulint		hint_page_no,	/* in: hint of a good page */
	byte		file_direction,	/* in: direction where a possible
					page split is made */
	ulint		level,		/* in: level where the page is placed
					in the tree */
	mtr_t*		mtr)		/* in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;

	if (index->type & DICT_IBUF) {

		return(btr_page_alloc_for_ibuf(index, mtr));
	}

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	/* Parameter TRUE below states that the caller has made the
	reservation for free extents, and thus we know that a page can
	be allocated: */

	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
						   file_direction, TRUE, mtr);
	if (new_page_no == FIL_NULL) {

		return(NULL);
	}

	new_page = buf_page_get(dict_index_get_space(index), new_page_no,
				RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
	buf_page_dbg_add_level(new_page, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */

	return(new_page);
}

/******************************************************************
Gets the number of pages in a B-tree. */

ulint
btr_get_size(
/*=========*/
				/* out: number of pages */
	dict_index_t*	index,	/* in: index */
	ulint		flag)	/* in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		n;
	ulint		dummy;
	mtr_t		mtr;

	mtr_start(&mtr);

	mtr_s_lock(dict_index_get_lock(index), &mtr);

	root = btr_root_get(index, &mtr);

	if (flag == BTR_N_LEAF_PAGES) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		fseg_n_reserved_pages(seg_header, &n, &mtr);

	} else if (flag == BTR_TOTAL_SIZE) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

		n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);

		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
	} else {
		ut_error;
	}

	mtr_commit(&mtr);

	return(n);
}
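
/* Illustrative sketch (not part of the original file): estimating the size
of an index in pages with btr_get_size(). BTR_N_LEAF_PAGES counts only the
leaf file segment; BTR_TOTAL_SIZE counts both file segments of the tree. */
#if 0
{
	ulint	n_leaf  = btr_get_size(index, BTR_N_LEAF_PAGES);
	ulint	n_total = btr_get_size(index, BTR_TOTAL_SIZE);

	fprintf(stderr, "InnoDB: index %s: %lu leaf pages, %lu pages total\n",
		index->name, (ulong) n_leaf, (ulong) n_total);
}
#endif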

/******************************************************************
Frees a page used in an ibuf tree. Puts the page on the free list of the
ibuf tree. */
static
void
btr_page_free_for_ibuf(
/*===================*/
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */
	mtr_t*		mtr)	/* in: mtr */
{
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	root = btr_root_get(index, mtr);

	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		       page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));
}

/******************************************************************
Frees a file page used in an index tree. Can also be used for (BLOB)
external storage pages, because the page level 0 can be given as an
argument. */

void
btr_page_free_low(
/*==============*/
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */
	ulint		level,	/* in: page level */
	mtr_t*		mtr)	/* in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		space;
	ulint		page_no;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	/* The page becomes invalid for optimistic searches: increment the
	frame modify clock */

	buf_frame_modify_clock_inc(page);

	if (index->type & DICT_IBUF) {

		btr_page_free_for_ibuf(index, page, mtr);

		return;
	}

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	space = buf_frame_get_space_id(page);
	page_no = buf_frame_get_page_no(page);

	fseg_free_page(seg_header, space, page_no, mtr);
}

/******************************************************************
Frees a file page used in an index tree. NOTE: this cannot be used to free
field external storage (BLOB) pages, because the page must contain info on
its level. */

void
btr_page_free(
/*==========*/
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */
	mtr_t*		mtr)	/* in: mtr */
{
	ulint		level;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	level = btr_page_get_level(page, mtr);

	btr_page_free_low(index, page, level, mtr);
}

/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
btr_node_ptr_set_child_page_no(
/*===========================*/
	rec_t*		rec,	/* in: node pointer record */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	ulint		page_no,/* in: child node address */
	mtr_t*		mtr)	/* in: mtr */
{
	byte*	field;
	ulint	len;

	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(0 < btr_page_get_level(buf_frame_align(rec), mtr));
	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

	/* The child address is in the last field */
	field = rec_get_nth_field(rec, offsets,
				  rec_offs_n_fields(offsets) - 1, &len);

	ut_ad(len == 4);

	mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
}

/****************************************************************
Returns the child page of a node pointer and x-latches it. */
static
page_t*
btr_node_ptr_get_child(
/*===================*/
				/* out: child page, x-latched */
	rec_t*		node_ptr,/* in: node pointer */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/* in: mtr */
{
	ulint	page_no;
	ulint	space;
	page_t*	page;

	ut_ad(rec_offs_validate(node_ptr, NULL, offsets));
	space = buf_frame_get_space_id(node_ptr);
	page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

	page = btr_page_get(space, page_no, RW_X_LATCH, mtr);

	return(page);
}

/****************************************************************
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree. */
static
rec_t*
btr_page_get_father_for_rec(
/*========================*/
				/* out: pointer to node pointer record,
				its page x-latched */
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page: must contain at least one
				user record */
	rec_t*		user_rec,/* in: user record on page */
	mtr_t*		mtr)	/* in: mtr */
{
	mem_heap_t*	heap;
	dtuple_t*	tuple;
	btr_cur_t	cursor;
	rec_t*		node_ptr;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_a(page_rec_is_user_rec(user_rec));

	ut_ad(dict_index_get_page(index) != buf_frame_get_page_no(page));

	heap = mem_heap_create(100);

	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap,
					  btr_page_get_level(page, mtr));

	btr_cur_search_to_nth_level(index,
				    btr_page_get_level(page, mtr) + 1,
				    tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE, &cursor, 0, mtr);

	node_ptr = btr_cur_get_rec(&cursor);
	offsets = rec_get_offsets(node_ptr, index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
			  != buf_frame_get_page_no(page))) {
		rec_t*	print_rec;
		fputs("InnoDB: Dump of the child page:\n", stderr);
		buf_page_print(buf_frame_align(page));
		fputs("InnoDB: Dump of the parent page:\n", stderr);
		buf_page_print(buf_frame_align(node_ptr));

		fputs("InnoDB: Corruption of an index tree: table ", stderr);
		ut_print_name(stderr, NULL, TRUE, index->table_name);
		fputs(", index ", stderr);
		ut_print_name(stderr, NULL, FALSE, index->name);
		fprintf(stderr, ",\n"
			"InnoDB: father ptr page no %lu, child page no %lu\n",
			(ulong)
			btr_node_ptr_get_child_page_no(node_ptr, offsets),
			(ulong) buf_frame_get_page_no(page));
		print_rec = page_rec_get_next(page_get_infimum_rec(page));
		offsets = rec_get_offsets(print_rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		page_rec_print(print_rec, offsets);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		page_rec_print(node_ptr, offsets);

		fputs("InnoDB: You should dump + drop + reimport the table"
		      " to fix the\n"
		      "InnoDB: corruption. If the crash happens at "
		      "the database startup, see\n"
		      "InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
		      "forcing-recovery.html about\n"
		      "InnoDB: forcing recovery. "
		      "Then dump + drop + reimport.\n", stderr);
	}

	ut_a(btr_node_ptr_get_child_page_no(node_ptr, offsets)
	     == buf_frame_get_page_no(page));
	mem_heap_free(heap);

	return(node_ptr);
}

/****************************************************************
Returns the upper level node pointer to a page. It is assumed that
mtr holds an x-latch on the tree. */
static
rec_t*
btr_page_get_father_node_ptr(
/*=========================*/
				/* out: pointer to node pointer record */
	dict_index_t*	index,	/* in: index tree */
	page_t*		page,	/* in: page: must contain at least one
				user record */
	mtr_t*		mtr)	/* in: mtr */
{
	return(btr_page_get_father_for_rec(
		       index, page,
		       page_rec_get_next(page_get_infimum_rec(page)), mtr));
}
642
643
/****************************************************************
644
Creates the root node for a new index tree. */
645
646
ulint
647
btr_create(
648
/*=======*/
649
			/* out: page number of the created root, FIL_NULL if
650
			did not succeed */
651
	ulint	type,	/* in: type of the index */
652
	ulint	space,	/* in: space where created */
653
	dulint	index_id,/* in: index id */
654
	ulint	comp,	/* in: nonzero=compact page format */
655
	mtr_t*	mtr)	/* in: mini-transaction handle */
656
{
657
	ulint		page_no;
658
	buf_frame_t*	ibuf_hdr_frame;
659
	buf_frame_t*	frame;
660
	page_t*		page;
661
662
	/* Create the two new segments (one, in the case of an ibuf tree) for
663
	the index tree; the segment headers are put on the allocated root page
664
	(for an ibuf tree, not in the root, but on a separate ibuf header
665
	page) */
666
667
	if (type & DICT_IBUF) {
668
		/* Allocate first the ibuf header page */
669
		ibuf_hdr_frame = fseg_create(
670
			space, 0, IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
671
672
#ifdef UNIV_SYNC_DEBUG
673
		buf_page_dbg_add_level(ibuf_hdr_frame, SYNC_TREE_NODE_NEW);
674
#endif /* UNIV_SYNC_DEBUG */
675
		ut_ad(buf_frame_get_page_no(ibuf_hdr_frame)
676
		      == IBUF_HEADER_PAGE_NO);
677
		/* Allocate then the next page to the segment: it will be the
678
		tree root page */
679
680
		page_no = fseg_alloc_free_page(ibuf_hdr_frame + IBUF_HEADER
681
					       + IBUF_TREE_SEG_HEADER,
682
					       IBUF_TREE_ROOT_PAGE_NO,
683
					       FSP_UP, mtr);
684
		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
685
686
		frame = buf_page_get(space, page_no, RW_X_LATCH, mtr);
687
	} else {
688
		frame = fseg_create(space, 0, PAGE_HEADER + PAGE_BTR_SEG_TOP,
689
				    mtr);
690
	}
691
692
	if (frame == NULL) {
693
694
		return(FIL_NULL);
695
	}
696
697
	page_no = buf_frame_get_page_no(frame);
698
699
#ifdef UNIV_SYNC_DEBUG
700
	buf_page_dbg_add_level(frame, SYNC_TREE_NODE_NEW);
701
#endif /* UNIV_SYNC_DEBUG */
702
703
	if (type & DICT_IBUF) {
704
		/* It is an insert buffer tree: initialize the free list */
705
706
		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
707
708
		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
709
	} else {
710
		/* It is a non-ibuf tree: create a file segment for leaf
711
		pages */
712
		fseg_create(space, page_no, PAGE_HEADER + PAGE_BTR_SEG_LEAF,
713
			    mtr);
714
		/* The fseg create acquires a second latch on the page,
715
		therefore we must declare it: */
716
#ifdef UNIV_SYNC_DEBUG
717
		buf_page_dbg_add_level(frame, SYNC_TREE_NODE_NEW);
718
#endif /* UNIV_SYNC_DEBUG */
719
	}
720
721
	/* Create a new index page on the the allocated segment page */
722
	page = page_create(frame, mtr, comp);
723
	buf_block_align(page)->check_index_page_at_flush = TRUE;
724
725
	/* Set the index id of the page */
726
	btr_page_set_index_id(page, index_id, mtr);
727
728
	/* Set the level of the new index page */
729
	btr_page_set_level(page, 0, mtr);
730
731
	/* Set the next node and previous node fields */
732
	btr_page_set_next(page, FIL_NULL, mtr);
733
	btr_page_set_prev(page, FIL_NULL, mtr);
734
735
	/* We reset the free bits for the page to allow creation of several
736
	trees in the same mtr, otherwise the latch on a bitmap page would
737
	prevent it because of the latching order */
738
739
	ibuf_reset_free_bits_with_type(type, page);
740
741
	/* In the following assertion we test that two records of maximum
742
	allowed size fit on the root page: this fact is needed to ensure
743
	correctness of split algorithms */
744
745
	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
746
747
	return(page_no);
748
}

/****************************************************************
Frees a B-tree except the root page, which MUST be freed after this
by calling btr_free_root. */

void
btr_free_but_not_root(
/*==================*/
	ulint	space,		/* in: space where created */
	ulint	root_page_no)	/* in: root page number */
{
	ibool	finished;
	page_t*	root;
	mtr_t	mtr;

leaf_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);

	/* NOTE: page hash indexes are dropped when a page is freed inside
	fsp0fsp. */

	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
				  &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto leaf_loop;
	}
top_loop:
	mtr_start(&mtr);

	root = btr_page_get(space, root_page_no, RW_X_LATCH, &mtr);

	finished = fseg_free_step_not_header(
		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto top_loop;
	}
}

/****************************************************************
Frees the B-tree root page. The rest of the tree MUST already have been
freed. */

void
btr_free_root(
/*==========*/
	ulint	space,		/* in: space where created */
	ulint	root_page_no,	/* in: root page number */
	mtr_t*	mtr)		/* in: a mini-transaction which has already
				been started */
{
	ibool	finished;
	page_t*	root;

	root = btr_page_get(space, root_page_no, RW_X_LATCH, mtr);

	btr_search_drop_page_hash_index(root);
top_loop:
	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	if (!finished) {

		goto top_loop;
	}
}
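
/* Illustrative sketch (not part of the original file): the order in which a
whole tree is freed. btr_free_but_not_root() runs its own mini-transactions
step by step; the root itself is freed last, inside a mini-transaction
provided by the caller. */
#if 0
{
	mtr_t	mtr;

	btr_free_but_not_root(space, root_page_no);

	mtr_start(&mtr);
	btr_free_root(space, root_page_no, &mtr);
	mtr_commit(&mtr);
}
#endif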

/*****************************************************************
Reorganizes an index page. */
static
void
btr_page_reorganize_low(
/*====================*/
	ibool		recovery,/* in: TRUE if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	page_t*		page,	/* in: page to be reorganized */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr */
{
	page_t*	new_page;
	ulint	log_mode;
	ulint	data_size1;
	ulint	data_size2;
	ulint	max_ins_size1;
	ulint	max_ins_size2;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

	/* Write the log record */
	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
				  ? MLOG_COMP_PAGE_REORGANIZE
				  : MLOG_PAGE_REORGANIZE, 0);

	/* Turn logging off */
	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	new_page = buf_frame_alloc();

	/* Copy the old page to temporary space */
	buf_frame_copy(new_page, page);

	if (!recovery) {
		btr_search_drop_page_hash_index(page);
	}

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(page, mtr, page_is_comp(page));
	buf_block_align(page)->check_index_page_at_flush = TRUE;

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(page, new_page,
					page_get_infimum_rec(new_page),
					index, mtr);
	/* Copy max trx id to recreated page */
	page_set_max_trx_id(page, page_get_max_trx_id(new_page));

	if (!recovery) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(page, new_page);
	}

	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
		buf_page_print(page);
		buf_page_print(new_page);
		fprintf(stderr,
			"InnoDB: Error: page old data size %lu"
			" new data size %lu\n"
			"InnoDB: Error: page old max ins size %lu"
			" new max ins size %lu\n"
			"InnoDB: Submit a detailed bug report"
			" to http://bugs.mysql.com\n",
			(unsigned long) data_size1, (unsigned long) data_size2,
			(unsigned long) max_ins_size1,
			(unsigned long) max_ins_size2);
	}

	buf_frame_free(new_page);

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);
}

/*****************************************************************
Reorganizes an index page. */

void
btr_page_reorganize(
/*================*/
	page_t*		page,	/* in: page to be reorganized */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr */
{
	btr_page_reorganize_low(FALSE, page, index, mtr);
}

/***************************************************************
Parses a redo log record of reorganizing a page. */

byte*
btr_parse_page_reorganize(
/*======================*/
				/* out: end of log record or NULL */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr __attribute__((unused)),
				/* in: buffer end */
	dict_index_t*	index,	/* in: record descriptor */
	page_t*		page,	/* in: page or NULL */
	mtr_t*		mtr)	/* in: mtr or NULL */
{
	ut_ad(ptr && end_ptr);

	/* The record is empty, except for the record initial part */

	if (page) {
		btr_page_reorganize_low(TRUE, page, index, mtr);
	}

	return(ptr);
}

/*****************************************************************
Empties an index page. */
static
void
btr_page_empty(
/*===========*/
	page_t*	page,	/* in: page to be emptied */
	mtr_t*	mtr)	/* in: mtr */
{
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	btr_search_drop_page_hash_index(page);

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(page, mtr, page_is_comp(page));
	buf_block_align(page)->check_index_page_at_flush = TRUE;
}

/*****************************************************************
Makes the tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called. */

rec_t*
btr_root_raise_and_insert(
/*======================*/
				/* out: inserted record */
	btr_cur_t*	cursor,	/* in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	dtuple_t*	tuple,	/* in: tuple to insert */
	mtr_t*		mtr)	/* in: mtr */
{
	dict_index_t*	index;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;
	rec_t*		rec;
	mem_heap_t*	heap;
	dtuple_t*	node_ptr;
	ulint		level;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;

	root = btr_cur_get_page(cursor);
	index = btr_cur_get_index(cursor);

	ut_ad(dict_index_get_page(index) == buf_frame_get_page_no(root));
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(root),
				MTR_MEMO_PAGE_X_FIX));
	btr_search_drop_page_hash_index(root);

	/* Allocate a new page to the tree. Root splitting is done by first
	moving the root records to the new page, emptying the root, putting
	a node pointer to the new page, and then splitting the new page. */

	new_page = btr_page_alloc(index, 0, FSP_NO_DIR,
				  btr_page_get_level(root, mtr), mtr);

	btr_page_create(new_page, index, mtr);

	level = btr_page_get_level(root, mtr);

	/* Set the levels of the new index page and root page */
	btr_page_set_level(new_page, level, mtr);
	btr_page_set_level(root, level + 1, mtr);

	/* Set the next node and previous node fields of the new page */
	btr_page_set_next(new_page, FIL_NULL, mtr);
	btr_page_set_prev(new_page, FIL_NULL, mtr);

	/* Move the records from the root to the new page */

	page_move_rec_list_end(new_page, root, page_get_infimum_rec(root),
			       index, mtr);
	/* If this is a pessimistic insert which is actually done to
	perform a pessimistic update then we have stored the lock
	information of the record to be inserted on the infimum of the
	root page: we cannot discard the lock structs on the root page */

	lock_update_root_raise(new_page, root);

	/* Create a memory heap where the node pointer is stored */
	heap = mem_heap_create(100);

	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_frame_get_page_no(new_page);

	/* Build the node pointer (= node key and page address) for the
	child */

	node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
					     level);
	/* Reorganize the root to get free space */
	btr_page_reorganize(root, index, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert the node pointer into the root */

	page_cur_set_before_first(root, page_cursor);

	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
					     index, mtr);

	ut_ad(node_ptr_rec);

	/* The node pointer must be marked as the predefined minimum record,
	as there is no lower alphabetical limit to records in the leftmost
	node of a level: */

	btr_set_min_rec_mark(node_ptr_rec, page_is_comp(root), mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* We play safe and reset the free bits for the new page */

#if 0
	fprintf(stderr, "Root raise new page no %lu\n",
		buf_frame_get_page_no(new_page));
#endif

	ibuf_reset_free_bits(index, new_page);
	/* Reposition the cursor to the child node */
	page_cur_search(new_page, index, tuple,
			PAGE_CUR_LE, page_cursor);

	/* Split the child and insert tuple */
	return(btr_page_split_and_insert(cursor, tuple, mtr));
}

/*****************************************************************
Decides if the page should be split at the convergence point of inserts
converging to the left. */

ibool
btr_page_get_split_rec_to_left(
/*===========================*/
				/* out: TRUE if split recommended */
	btr_cur_t*	cursor,	/* in: cursor at which to insert */
	rec_t**		split_rec) /* out: if split recommended,
				the first record on upper half page,
				or NULL if tuple to be inserted should
				be first */
{
	page_t*	page;
	rec_t*	insert_point;
	rec_t*	infimum;

	page = btr_cur_get_page(cursor);
	insert_point = btr_cur_get_rec(cursor);

	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
	    == page_rec_get_next(insert_point)) {

		infimum = page_get_infimum_rec(page);

		/* If the convergence is in the middle of a page, also include
		the record immediately before the new insert in the upper
		page. Otherwise, we could repeatedly move from page to page
		lots of records smaller than the convergence point. */

		if (infimum != insert_point
		    && page_rec_get_next(infimum) != insert_point) {

			*split_rec = insert_point;
		} else {
			*split_rec = page_rec_get_next(insert_point);
		}

		return(TRUE);
	}

	return(FALSE);
}

/*****************************************************************
Decides if the page should be split at the convergence point of inserts
converging to the right. */

ibool
btr_page_get_split_rec_to_right(
/*============================*/
				/* out: TRUE if split recommended */
	btr_cur_t*	cursor,	/* in: cursor at which to insert */
	rec_t**		split_rec) /* out: if split recommended,
				the first record on upper half page,
				or NULL if tuple to be inserted should
				be first */
{
	page_t*	page;
	rec_t*	insert_point;

	page = btr_cur_get_page(cursor);
	insert_point = btr_cur_get_rec(cursor);

	/* We use eager heuristics: if the new insert would be right after
	the previous insert on the same page, we assume that there is a
	pattern of sequential inserts here. */

	if (UNIV_LIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
			== insert_point)) {

		rec_t*	next_rec;

		next_rec = page_rec_get_next(insert_point);

		if (page_rec_is_supremum(next_rec)) {
split_at_new:
			/* Split at the new record to insert */
			*split_rec = NULL;
		} else {
			rec_t*	next_next_rec = page_rec_get_next(next_rec);
			if (page_rec_is_supremum(next_next_rec)) {

				goto split_at_new;
			}

			/* If there are >= 2 user records up from the insert
			point, split all but 1 off. We want to keep one because
			then sequential inserts can use the adaptive hash
			index, as they can do the necessary checks of the right
			search position just by looking at the records on this
			page. */

			*split_rec = next_next_rec;
		}

		return(TRUE);
	}

	return(FALSE);
}

/*****************************************************************
Calculates a split record such that the tuple will certainly fit on
its half-page when the split is performed. We assume in this function
only that the cursor page has at least one user record. */
static
rec_t*
btr_page_get_sure_split_rec(
/*========================*/
					/* out: split record, or NULL if
					tuple will be the first record on
					upper half-page */
	btr_cur_t*	cursor,		/* in: cursor at which insert
					should be made */
	dtuple_t*	tuple)		/* in: tuple to insert */
{
	page_t*	page;
	ulint	insert_size;
	ulint	free_space;
	ulint	total_data;
	ulint	total_n_recs;
	ulint	total_space;
	ulint	incl_data;
	rec_t*	ins_rec;
	rec_t*	rec;
	rec_t*	next_rec;
	ulint	n;
	mem_heap_t* heap;
	ulint*	offsets;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(cursor->index, tuple);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);

	n = 0;
	incl_data = 0;
	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	heap = NULL;
	offsets = NULL;

	/* We start to include records in the left half, and when the
	space reserved by them exceeds half of total_space, the split
	point has been found: if the included records also fit on the
	left page, the first record not yet included becomes the first
	record on the right half-page (unless it is the supremum);
	otherwise the last included record itself becomes the first
	record on the right half-page. */

	for (;;) {
		/* Decide the next record to include */
		if (rec == ins_rec) {
			rec = NULL;	/* NULL denotes that tuple is
					now included */
		} else if (rec == NULL) {
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			/* Include tuple */
			incl_data += insert_size;
		} else {
			offsets = rec_get_offsets(rec, cursor->index,
						  offsets, ULINT_UNDEFINED,
						  &heap);
			incl_data += rec_offs_size(offsets);
		}

		n++;

		if (incl_data + page_dir_calc_reserved_space(n)
		    >= total_space / 2) {

			if (incl_data + page_dir_calc_reserved_space(n)
			    <= free_space) {
				/* The next record will be the first on
				the right half page if it is not the
				supremum record of page */

				if (rec == ins_rec) {
					rec = NULL;

					goto func_exit;
				} else if (rec == NULL) {
					next_rec = page_rec_get_next(ins_rec);
				} else {
					next_rec = page_rec_get_next(rec);
				}
				ut_ad(next_rec);
				if (!page_rec_is_supremum(next_rec)) {
					rec = next_rec;
				}
			}

func_exit:
			if (UNIV_LIKELY_NULL(heap)) {
				mem_heap_free(heap);
			}
			return(rec);
		}
	}
}

/*****************************************************************
Returns TRUE if the insert fits on the appropriate half-page with the
chosen split_rec. */
static
ibool
btr_page_insert_fits(
/*=================*/
					/* out: TRUE if fits */
	btr_cur_t*	cursor,		/* in: cursor at which insert
					should be made */
	rec_t*		split_rec,	/* in: suggestion for first record
					on upper half-page, or NULL if
					tuple to be inserted should be first */
	const ulint*	offsets,	/* in: rec_get_offsets(
					split_rec, cursor->index) */
	dtuple_t*	tuple,		/* in: tuple to insert */
	mem_heap_t*	heap)		/* in: temporary memory heap */
{
	page_t*	page;
	ulint	insert_size;
	ulint	free_space;
	ulint	total_data;
	ulint	total_n_recs;
	rec_t*	rec;
	rec_t*	end_rec;
	ulint*	offs;

	page = btr_cur_get_page(cursor);

	ut_ad(!split_rec == !offsets);
	ut_ad(!offsets
	      || !page_is_comp(page) == !rec_offs_comp(offsets));
	ut_ad(!offsets
	      || rec_offs_validate(split_rec, cursor->index, offsets));

	insert_size = rec_get_converted_size(cursor->index, tuple);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;

	/* We determine which records (from rec to end_rec, not including
	end_rec) will end up on the other half page from tuple when it is
	inserted. */

	if (split_rec == NULL) {
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));

	} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {

		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = split_rec;
	} else {
		rec = split_rec;
		end_rec = page_get_supremum_rec(page);
	}

	if (total_data + page_dir_calc_reserved_space(total_n_recs)
	    <= free_space) {

		/* Ok, there will be enough available space on the
		half page where the tuple is inserted */

		return(TRUE);
	}

	offs = NULL;

	while (rec != end_rec) {
		/* In this loop we calculate the amount of reserved
		space after rec is removed from page. */

		offs = rec_get_offsets(rec, cursor->index, offs,
				       ULINT_UNDEFINED, &heap);

		total_data -= rec_offs_size(offs);
		total_n_recs--;

		if (total_data + page_dir_calc_reserved_space(total_n_recs)
		    <= free_space) {

			/* Ok, there will be enough available space on the
			half page where the tuple is inserted */

			return(TRUE);
		}

		rec = page_rec_get_next(rec);
	}

	return(FALSE);
}

/***********************************************************
Inserts a data tuple into a tree at a non-leaf level. It is assumed
that mtr holds an x-latch on the tree. */

void
btr_insert_on_non_leaf_level(
/*=========================*/
	dict_index_t*	index,	/* in: index */
	ulint		level,	/* in: level, must be > 0 */
	dtuple_t*	tuple,	/* in: the record to be inserted */
	mtr_t*		mtr)	/* in: mtr */
{
	big_rec_t*	dummy_big_rec;
	btr_cur_t	cursor;
	ulint		err;
	rec_t*		rec;

	ut_ad(level > 0);

	btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE,
				    &cursor, 0, mtr);

	err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
					 | BTR_KEEP_SYS_FLAG
					 | BTR_NO_UNDO_LOG_FLAG,
					 &cursor, tuple, &rec,
					 &dummy_big_rec, NULL, mtr);
	ut_a(err == DB_SUCCESS);
}

/******************************************************************
Attaches the halves of an index page on the appropriate level in an
index tree. */
static
void
btr_attach_half_pages(
/*==================*/
	dict_index_t*	index,		/* in: the index tree */
	page_t*		page,		/* in: page to be split */
	rec_t*		split_rec,	/* in: first record on upper
					half page */
	page_t*		new_page,	/* in: the new half page */
	ulint		direction,	/* in: FSP_UP or FSP_DOWN */
	mtr_t*		mtr)		/* in: mtr */
{
	ulint		space;
	rec_t*		node_ptr;
	page_t*		prev_page;
	page_t*		next_page;
	ulint		prev_page_no;
	ulint		next_page_no;
	ulint		level;
	page_t*		lower_page;
	page_t*		upper_page;
	ulint		lower_page_no;
	ulint		upper_page_no;
	dtuple_t*	node_ptr_upper;
	mem_heap_t*	heap;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(new_page),
				MTR_MEMO_PAGE_X_FIX));
	ut_a(page_is_comp(page) == page_is_comp(new_page));

	/* Create a memory heap where the data tuple is stored */
	heap = mem_heap_create(1024);

	/* Based on split direction, decide upper and lower pages */
	if (direction == FSP_DOWN) {

		lower_page_no = buf_frame_get_page_no(new_page);
		upper_page_no = buf_frame_get_page_no(page);
		lower_page = new_page;
		upper_page = page;

		/* Look up the index for the node pointer to page */
		node_ptr = btr_page_get_father_node_ptr(index, page, mtr);

		/* Replace the address of the old child node (= page) with the
		address of the new lower half */

		btr_node_ptr_set_child_page_no(node_ptr,
					       rec_get_offsets(
						       node_ptr, index,
						       NULL, ULINT_UNDEFINED,
						       &heap),
					       lower_page_no, mtr);
		mem_heap_empty(heap);
	} else {
		lower_page_no = buf_frame_get_page_no(page);
		upper_page_no = buf_frame_get_page_no(new_page);
		lower_page = page;
		upper_page = new_page;
	}

	/* Get the level of the split pages */
	level = btr_page_get_level(page, mtr);

	/* Build the node pointer (= node key and page address) for the upper
	half */

	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
						   upper_page_no, heap, level);

	/* Insert it next to the pointer to the lower half. Note that this
	may generate recursion leading to a split on the higher level. */

	btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* Get the previous and next pages of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_frame_get_space_id(page);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {

		prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr);
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(prev_page, mtr)
		     == buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(prev_page, lower_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {

		next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr);
		ut_a(page_is_comp(next_page) == page_is_comp(page));

		btr_page_set_prev(next_page, upper_page_no, mtr);
	}

	btr_page_set_prev(lower_page, prev_page_no, mtr);
	btr_page_set_next(lower_page, upper_page_no, mtr);
	btr_page_set_level(lower_page, level, mtr);

	btr_page_set_prev(upper_page, lower_page_no, mtr);
	btr_page_set_next(upper_page, next_page_no, mtr);
	btr_page_set_level(upper_page, level, mtr);
}

/*****************************************************************
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch on the index tree. NOTE: the tree x-latch
is released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore
enough free disk space must be guaranteed to be available before
this function is called. */

rec_t*
btr_page_split_and_insert(
/*======================*/
				/* out: inserted record; NOTE: the tree
				x-latch is released! NOTE: 2 free disk
				pages must be available! */
	btr_cur_t*	cursor,	/* in: cursor at which to insert; when the
				function returns, the cursor is positioned
				on the predecessor of the inserted record */
	dtuple_t*	tuple,	/* in: tuple to insert */
	mtr_t*		mtr)	/* in: mtr */
{
	page_t*		page;
	ulint		page_no;
	byte		direction;
	ulint		hint_page_no;
	page_t*		new_page;
	rec_t*		split_rec;
	page_t*		left_page;
	page_t*		right_page;
	page_t*		insert_page;
	page_cur_t*	page_cursor;
	rec_t*		first_rec;
	byte*		buf = 0; /* remove warning */
	rec_t*		move_limit;
	ibool		insert_will_fit;
	ulint		n_iterations = 0;
	rec_t*		rec;
	mem_heap_t*	heap;
	ulint		n_uniq;
	ulint*		offsets;

	heap = mem_heap_create(1024);
	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
func_start:
	mem_heap_empty(heap);
	offsets = NULL;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */

	page = btr_cur_get_page(cursor);

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(page_get_n_recs(page) >= 2);

	page_no = buf_frame_get_page_no(page);

	/* 1. Decide the split record; split_rec == NULL means that the
	tuple to be inserted should be the first record on the upper
	half-page */

	if (n_iterations > 0) {
		direction = FSP_UP;
		hint_page_no = page_no + 1;
		split_rec = btr_page_get_sure_split_rec(cursor, tuple);

	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
		direction = FSP_DOWN;
		hint_page_no = page_no - 1;
	} else {
		direction = FSP_UP;
		hint_page_no = page_no + 1;
		split_rec = page_get_middle_rec(page);
	}

	/* 2. Allocate a new page to the index */
	new_page = btr_page_alloc(cursor->index, hint_page_no, direction,
				  btr_page_get_level(page, mtr), mtr);
	btr_page_create(new_page, cursor->index, mtr);

	/* 3. Calculate the first record on the upper half-page, and the
	first record (move_limit) on original page which ends up on the
	upper half */

	if (split_rec != NULL) {
		first_rec = split_rec;
		move_limit = split_rec;
	} else {
		buf = mem_alloc(rec_get_converted_size(cursor->index, tuple));

		first_rec = rec_convert_dtuple_to_rec(buf,
						      cursor->index, tuple);
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	}

	/* 4. First do the modifications in the tree structure */

	btr_attach_half_pages(cursor->index, page, first_rec,
			      new_page, direction, mtr);

	if (split_rec == NULL) {
		mem_free(buf);
	}

	/* If the split is made on the leaf level and the insert will fit
	on the appropriate half-page, we may release the tree x-latch.
	We can then move the records after releasing the tree latch,
	thus reducing the tree latch contention. */

	if (split_rec) {
		offsets = rec_get_offsets(split_rec, cursor->index, offsets,
					  n_uniq, &heap);

		insert_will_fit = btr_page_insert_fits(cursor,
						       split_rec, offsets,
						       tuple, heap);
	} else {
		insert_will_fit = btr_page_insert_fits(cursor,
						       NULL, NULL,
						       tuple, heap);
	}

	if (insert_will_fit && (btr_page_get_level(page, mtr) == 0)) {

		mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
				 MTR_MEMO_X_LOCK);
	}
1682
1683
	/* 5. Move then the records to the new page */
1684
	if (direction == FSP_DOWN) {
1685
		/*		fputs("Split left\n", stderr); */
1686
1687
		page_move_rec_list_start(new_page, page, move_limit,
1688
					 cursor->index, mtr);
1689
		left_page = new_page;
1690
		right_page = page;
1691
1692
		lock_update_split_left(right_page, left_page);
1693
	} else {
1694
		/*		fputs("Split right\n", stderr); */
1695
1696
		page_move_rec_list_end(new_page, page, move_limit,
1697
				       cursor->index, mtr);
1698
		left_page = page;
1699
		right_page = new_page;
1700
1701
		lock_update_split_right(right_page, left_page);
1702
	}
1703
1704
	/* 6. The split and the tree modification is now completed. Decide the
1705
	page where the tuple should be inserted */
1706
1707
	if (split_rec == NULL) {
1708
		insert_page = right_page;
1709
1710
	} else {
1711
		offsets = rec_get_offsets(first_rec, cursor->index,
1712
					  offsets, n_uniq, &heap);
1713
1714
		if (cmp_dtuple_rec(tuple, first_rec, offsets) >= 0) {
1715
1716
			insert_page = right_page;
1717
		} else {
1718
			insert_page = left_page;
1719
		}
1720
	}
1721
1722
	/* 7. Reposition the cursor for insert and try insertion */
1723
	page_cursor = btr_cur_get_page_cur(cursor);
1724
1725
	page_cur_search(insert_page, cursor->index, tuple,
1726
			PAGE_CUR_LE, page_cursor);
1727
1728
	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr);
1729
1730
	if (rec != NULL) {
1731
		/* Insert fit on the page: update the free bits for the
1732
		left and right pages in the same mtr */
1733
1734
		ibuf_update_free_bits_for_two_pages_low(cursor->index,
1735
							left_page,
1736
							right_page, mtr);
1737
		/* fprintf(stderr, "Split and insert done %lu %lu\n",
1738
		buf_frame_get_page_no(left_page),
1739
		buf_frame_get_page_no(right_page)); */
1740
		mem_heap_free(heap);
1741
		return(rec);
1742
	}
1743
1744
	/* 8. If insert did not fit, try page reorganization */
1745
1746
	btr_page_reorganize(insert_page, cursor->index, mtr);
1747
1748
	page_cur_search(insert_page, cursor->index, tuple,
1749
			PAGE_CUR_LE, page_cursor);
1750
	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr);
1751
1752
	if (rec == NULL) {
1753
		/* The insert did not fit on the page: loop back to the
1754
		start of the function for a new split */
1755
1756
		/* We play safe and reset the free bits for new_page */
1757
		ibuf_reset_free_bits(cursor->index, new_page);
1758
1759
		/* fprintf(stderr, "Split second round %lu\n",
1760
		buf_frame_get_page_no(page)); */
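		/* Debug accounting for the retry: the assertions below
		check that this is the first retry and that the fit
		estimate above did not claim the insert would succeed. */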
1761
		n_iterations++;
1762
		ut_ad(n_iterations < 2);
1763
		ut_ad(!insert_will_fit);
1764
1765
		goto func_start;
1766
	}
1767
1768
	/* Insert fit on the page: update the free bits for the
1769
	left and right pages in the same mtr */
1770
1771
	ibuf_update_free_bits_for_two_pages_low(cursor->index, left_page,
1772
						right_page, mtr);
1773
#if 0
1774
	fprintf(stderr, "Split and insert done %lu %lu\n",
1775
		buf_frame_get_page_no(left_page),
1776
		buf_frame_get_page_no(right_page));
1777
#endif
1778
1779
	ut_ad(page_validate(left_page, cursor->index));
1780
	ut_ad(page_validate(right_page, cursor->index));
1781
1782
	mem_heap_free(heap);
1783
	return(rec);
1784
}
1785
1786
/*****************************************************************
1787
Removes a page from the level list of pages. */
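/* The pages of one tree level are chained into a doubly linked list
through the FIL_PAGE_PREV and FIL_PAGE_NEXT fields of their page headers;
this function unlinks 'page' by patching the links of its neighbours. */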
1788
static
1789
void
1790
btr_level_list_remove(
1791
/*==================*/
1792
	page_t*		page,	/* in: page to remove */
1793
	mtr_t*		mtr)	/* in: mtr */
1794
{
1795
	ulint	space;
1796
	ulint	prev_page_no;
1797
	page_t*	prev_page;
1798
	ulint	next_page_no;
1799
	page_t*	next_page;
1800
1801
	ut_ad(page && mtr);
1802
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1803
				MTR_MEMO_PAGE_X_FIX));
1804
	/* Get the previous and next page numbers of page */
1805
1806
	prev_page_no = btr_page_get_prev(page, mtr);
1807
	next_page_no = btr_page_get_next(page, mtr);
1808
	space = buf_frame_get_space_id(page);
1809
1810
	/* Update page links of the level */
1811
1812
	if (prev_page_no != FIL_NULL) {
1813
1814
		prev_page = btr_page_get(space, prev_page_no, RW_X_LATCH, mtr);
1815
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
1816
#ifdef UNIV_BTR_DEBUG
1817
		ut_a(btr_page_get_next(prev_page, mtr)
1818
		     == buf_frame_get_page_no(page));
1819
#endif /* UNIV_BTR_DEBUG */
1820
1821
		btr_page_set_next(prev_page, next_page_no, mtr);
1822
	}
1823
1824
	if (next_page_no != FIL_NULL) {
1825
1826
		next_page = btr_page_get(space, next_page_no, RW_X_LATCH, mtr);
1827
		ut_a(page_is_comp(next_page) == page_is_comp(page));
1828
#ifdef UNIV_BTR_DEBUG
1829
		ut_a(btr_page_get_prev(next_page, mtr)
1830
		     == buf_frame_get_page_no(page));
1831
#endif /* UNIV_BTR_DEBUG */
1832
1833
		btr_page_set_prev(next_page, prev_page_no, mtr);
1834
	}
1835
}
1836
1837
/********************************************************************
1838
Writes the redo log record for setting an index record as the predefined
1839
minimum record. */
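/* The log record consists of the usual initial log record header (type
MLOG_REC_MIN_MARK or MLOG_COMP_REC_MIN_MARK) followed by the 2-byte
offset of the record within its page, as parsed back in
btr_parse_set_min_rec_mark() below. */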
1840
UNIV_INLINE
1841
void
1842
btr_set_min_rec_mark_log(
1843
/*=====================*/
1844
	rec_t*	rec,	/* in: record */
1845
	ulint	comp,	/* nonzero=compact record format */
1846
	mtr_t*	mtr)	/* in: mtr */
1847
{
1848
	mlog_write_initial_log_record(
1849
		rec, comp ? MLOG_COMP_REC_MIN_MARK : MLOG_REC_MIN_MARK, mtr);
1850
1851
	/* Write rec offset as a 2-byte ulint */
1852
	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
1853
}
1854
1855
/********************************************************************
1856
Parses the redo log record for setting an index record as the predefined
1857
minimum record. */
1858
1859
byte*
1860
btr_parse_set_min_rec_mark(
1861
/*=======================*/
1862
			/* out: end of log record or NULL */
1863
	byte*	ptr,	/* in: buffer */
1864
	byte*	end_ptr,/* in: buffer end */
1865
	ulint	comp,	/* in: nonzero=compact page format */
1866
	page_t*	page,	/* in: page or NULL */
1867
	mtr_t*	mtr)	/* in: mtr or NULL */
1868
{
1869
	rec_t*	rec;
1870
1871
	if (end_ptr < ptr + 2) {
1872
1873
		return(NULL);
1874
	}
1875
1876
	if (page) {
1877
		ut_a(!page_is_comp(page) == !comp);
1878
1879
		rec = page + mach_read_from_2(ptr);
1880
1881
		btr_set_min_rec_mark(rec, comp, mtr);
1882
	}
1883
1884
	return(ptr + 2);
1885
}
1886
1887
/********************************************************************
1888
Sets a record as the predefined minimum record. */
1889
1890
void
1891
btr_set_min_rec_mark(
1892
/*=================*/
1893
	rec_t*	rec,	/* in: record */
1894
	ulint	comp,	/* in: nonzero=compact page format */
1895
	mtr_t*	mtr)	/* in: mtr */
1896
{
1897
	ulint	info_bits;
1898
1899
	info_bits = rec_get_info_bits(rec, comp);
1900
1901
	rec_set_info_bits(rec, comp, info_bits | REC_INFO_MIN_REC_FLAG);
1902
1903
	btr_set_min_rec_mark_log(rec, comp, mtr);
1904
}
1905
1906
/*****************************************************************
1907
Deletes the node pointer to a page from the level above it. */
1908
1909
void
1910
btr_node_ptr_delete(
1911
/*================*/
1912
	dict_index_t*	index,	/* in: index tree */
1913
	page_t*		page,	/* in: page whose node pointer is deleted */
1914
	mtr_t*		mtr)	/* in: mtr */
1915
{
1916
	rec_t*		node_ptr;
1917
	btr_cur_t	cursor;
1918
	ibool		compressed;
1919
	ulint		err;
1920
1921
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1922
				MTR_MEMO_PAGE_X_FIX));
1923
	/* Delete node pointer on father page */
1924
1925
	node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
1926
1927
	btr_cur_position(index, node_ptr, &cursor);
1928
	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor, FALSE,
1929
						mtr);
1930
	ut_a(err == DB_SUCCESS);
1931
1932
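	/* The return value of btr_cur_pessimistic_delete() tells whether
	the delete already compressed the father page; if it did not, try
	a separate compression, since removing the node pointer may have
	left the page underfilled. */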
	if (!compressed) {
1933
		btr_cur_compress_if_useful(&cursor, mtr);
1934
	}
1935
}
1936
1937
/*****************************************************************
1938
If the page is the only one on its level, this function moves its records to the
1939
father page, thus reducing the tree height. */
1940
static
1941
void
1942
btr_lift_page_up(
1943
/*=============*/
1944
	dict_index_t*	index,	/* in: index tree */
1945
	page_t*		page,	/* in: page which is the only one on its level;
1946
				must not be empty: use
1947
				btr_discard_only_page_on_level if the last
1948
				record from the page should be removed */
1949
	mtr_t*		mtr)	/* in: mtr */
1950
{
1951
	page_t*		father_page;
1952
	page_t*		iter_page;
1953
	page_t*		pages[BTR_MAX_LEVELS];
1954
	ulint		page_level;
1955
	ulint		root_page_no;
1956
	ulint		ancestors;
1957
	ulint		i;
1958
1959
	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
1960
	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
1961
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
1962
				MTR_MEMO_PAGE_X_FIX));
1963
	father_page = buf_frame_align(
1964
		btr_page_get_father_node_ptr(index, page, mtr));
1965
1966
	page_level = btr_page_get_level(page, mtr);
1967
	root_page_no = dict_index_get_page(index);
1968
1969
	ancestors = 1;
1970
	pages[0] = father_page;
1971
1972
	/* Store all ancestor pages so we can reset their levels later on.
1973
	We have to do all the searches on the tree now because later on,
1974
	after we've replaced the first level, the tree is in an inconsistent
1975
	state and cannot be searched. */
1976
	iter_page = father_page;
1977
	for (;;) {
1978
		if (buf_block_get_page_no(buf_block_align(iter_page))
1979
		    == root_page_no) {
1980
1981
			break;
1982
		}
1983
1984
		ut_a(ancestors < BTR_MAX_LEVELS);
1985
1986
		iter_page = buf_frame_align(
1987
			btr_page_get_father_node_ptr(index, iter_page, mtr));
1988
1989
		pages[ancestors++] = iter_page;
1990
	}
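	/* pages[0 .. ancestors - 1] now holds the ancestors of 'page'
	from its father up to and including the root. */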
1991
1992
	btr_search_drop_page_hash_index(page);
1993
1994
	/* Make the father empty */
1995
	btr_page_empty(father_page, mtr);
1996
1997
	/* Move records to the father */
1998
	page_copy_rec_list_end(father_page, page, page_get_infimum_rec(page),
1999
			       index, mtr);
2000
	lock_update_copy_and_discard(father_page, page);
2001
2002
	/* Go upward to root page, decreasing levels by one. */
2003
	for (i = 0; i < ancestors; i++) {
2004
		iter_page = pages[i];
2005
2006
		ut_ad(btr_page_get_level(iter_page, mtr) == (page_level + 1));
2007
2008
		btr_page_set_level(iter_page, page_level, mtr);
2009
		page_level++;
2010
	}
2011
2012
	/* Free the file page */
2013
	btr_page_free(index, page, mtr);
2014
2015
	/* We play safe and reset the free bits for the father */
2016
	ibuf_reset_free_bits(index, father_page);
2017
	ut_ad(page_validate(father_page, index));
2018
	ut_ad(btr_check_node_ptr(index, father_page, mtr));
2019
}
2020
2021
/*****************************************************************
2022
Tries to merge the page first to its immediate left brother if such a
2023
brother exists, and the node pointers to the current page and to the brother
2024
reside on the same page. If the left brother does not satisfy these
2025
conditions, looks at the right brother. If the page is the only one on that
2026
level, lifts the records of the page to the father page, thus reducing the
2027
tree height. It is assumed that mtr holds an x-latch on the tree and on the
2028
page. If the cursor is on the leaf level, mtr must also hold x-latches on the
2029
brothers, if they exist. NOTE: it is assumed that the caller has reserved
2030
enough free extents so that the compression will always succeed if done! */
2031
2032
void
2033
btr_compress(
2034
/*=========*/
2035
	btr_cur_t*	cursor,	/* in: cursor on the page to merge or lift;
2036
				the page must not be empty: when deleting a record,
2037
				use btr_discard_page if the page would become
2038
				empty */
2039
	mtr_t*		mtr)	/* in: mtr */
2040
{
2041
	dict_index_t*	index;
2042
	ulint		space;
2043
	ulint		left_page_no;
2044
	ulint		right_page_no;
2045
	page_t*		merge_page;
2046
	page_t*		father_page;
2047
	ibool		is_left;
2048
	page_t*		page;
2049
	rec_t*		orig_pred;
2050
	rec_t*		orig_succ;
2051
	rec_t*		node_ptr;
2052
	ulint		data_size;
2053
	ulint		n_recs;
2054
	ulint		max_ins_size;
2055
	ulint		max_ins_size_reorg;
2056
	ulint		level;
2057
	ulint		comp;
2058
2059
	page = btr_cur_get_page(cursor);
2060
	index = btr_cur_get_index(cursor);
2061
	comp = page_is_comp(page);
2062
	ut_a((ibool)!!comp == dict_table_is_comp(index->table));
2063
2064
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2065
				MTR_MEMO_X_LOCK));
2066
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2067
				MTR_MEMO_PAGE_X_FIX));
2068
	level = btr_page_get_level(page, mtr);
2069
	space = dict_index_get_space(index);
2070
2071
	left_page_no = btr_page_get_prev(page, mtr);
2072
	right_page_no = btr_page_get_next(page, mtr);
2073
2074
#if 0
2075
	fprintf(stderr, "Merge left page %lu right %lu \n",
2076
		left_page_no, right_page_no);
2077
#endif
2078
2079
	node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
2080
	ut_ad(!comp || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
2081
	father_page = buf_frame_align(node_ptr);
2082
	ut_a(comp == page_is_comp(father_page));
2083
2084
	/* Decide the page to which we try to merge and which will inherit
2085
	the locks */
2086
2087
	is_left = left_page_no != FIL_NULL;
2088
2089
	if (is_left) {
2090
2091
		merge_page = btr_page_get(space, left_page_no, RW_X_LATCH,
2092
					  mtr);
2093
#ifdef UNIV_BTR_DEBUG
2094
		ut_a(btr_page_get_next(merge_page, mtr)
2095
		     == buf_frame_get_page_no(page));
2096
#endif /* UNIV_BTR_DEBUG */
2097
	} else if (right_page_no != FIL_NULL) {
2098
2099
		merge_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2100
					  mtr);
2101
#ifdef UNIV_BTR_DEBUG
2102
		ut_a(btr_page_get_prev(merge_page, mtr)
2103
		     == buf_frame_get_page_no(page));
2104
#endif /* UNIV_BTR_DEBUG */
2105
	} else {
2106
		/* The page is the only one on the level, lift the records
2107
		to the father */
2108
		btr_lift_page_up(index, page, mtr);
2109
2110
		return;
2111
	}
2112
2113
	n_recs = page_get_n_recs(page);
2114
	data_size = page_get_data_size(page);
2115
	ut_a(page_is_comp(merge_page) == comp);
2116
2117
	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
2118
		merge_page, n_recs);
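	/* The merge is attempted only if all the user data on 'page'
	would fit into merge_page even in the best case, i.e. after a
	reorganization of merge_page. */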
2119
	if (data_size > max_ins_size_reorg) {
2120
2121
		/* No space for merge */
2122
2123
		return;
2124
	}
2125
2126
	ut_ad(page_validate(merge_page, index));
2127
2128
	max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2129
2130
	if (data_size > max_ins_size) {
2131
2132
		/* We have to reorganize merge_page */
2133
2134
		btr_page_reorganize(merge_page, index, mtr);
2135
2136
		max_ins_size = page_get_max_insert_size(merge_page, n_recs);
2137
2138
		ut_ad(page_validate(merge_page, index));
2139
		ut_ad(page_get_max_insert_size(merge_page, n_recs)
2140
		      == max_ins_size_reorg);
2141
	}
2142
2143
	if (data_size > max_ins_size) {
2144
2145
		/* Add fault tolerance, though this should never happen */
2146
2147
		return;
2148
	}
2149
2150
	btr_search_drop_page_hash_index(page);
2151
2152
	/* Remove the page from the level list */
2153
	btr_level_list_remove(page, mtr);
2154
2155
	if (is_left) {
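		/* When merging to the left brother, the brother keeps its
		own node pointer in the father, so only the node pointer
		to 'page' has to be removed. */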
2156
		btr_node_ptr_delete(index, page, mtr);
2157
	} else {
2158
		mem_heap_t*	heap		= NULL;
2159
		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2160
		*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2161
		/* Replace the address of the old child node (= page) with the
2162
		address of the merge page to the right */
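		/* The node pointer of 'page' is reused for the enlarged
		right page: its prefix stays valid because merge_page will
		begin with the records of 'page' after the move below, so
		only the child page number needs to be changed; the right
		brother's own node pointer is deleted instead. */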
2163
2164
		btr_node_ptr_set_child_page_no(node_ptr,
2165
					       rec_get_offsets(
2166
						       node_ptr, index,
2167
						       offsets_,
2168
						       ULINT_UNDEFINED,
2169
						       &heap),
2170
					       right_page_no, mtr);
2171
		if (UNIV_LIKELY_NULL(heap)) {
2172
			mem_heap_free(heap);
2173
		}
2174
		btr_node_ptr_delete(index, merge_page, mtr);
2175
	}
2176
2177
	/* Move records to the merge page */
2178
	if (is_left) {
2179
		orig_pred = page_rec_get_prev(
2180
			page_get_supremum_rec(merge_page));
2181
		page_copy_rec_list_start(merge_page, page,
2182
					 page_get_supremum_rec(page),
2183
					 index, mtr);
2184
2185
		lock_update_merge_left(merge_page, orig_pred, page);
2186
	} else {
2187
		orig_succ = page_rec_get_next(
2188
			page_get_infimum_rec(merge_page));
2189
		page_copy_rec_list_end(merge_page, page,
2190
				       page_get_infimum_rec(page),
2191
				       index, mtr);
2192
2193
		lock_update_merge_right(orig_succ, page);
2194
	}
2195
2196
	/* We have added new records to merge_page: update its free bits */
2197
	ibuf_update_free_bits_if_full(index, merge_page,
2198
				      UNIV_PAGE_SIZE, ULINT_UNDEFINED);
2199
2200
	ut_ad(page_validate(merge_page, index));
2201
2202
	/* Free the file page */
2203
	btr_page_free(index, page, mtr);
2204
2205
	ut_ad(btr_check_node_ptr(index, merge_page, mtr));
2206
}
2207
2208
/*****************************************************************
2209
Discards a page that is the only page on its level. */
2210
static
2211
void
2212
btr_discard_only_page_on_level(
2213
/*===========================*/
2214
	dict_index_t*	index,	/* in: index tree */
2215
	page_t*		page,	/* in: page which is the only one on its level */
2216
	mtr_t*		mtr)	/* in: mtr */
2217
{
2218
	rec_t*	node_ptr;
2219
	page_t*	father_page;
2220
	ulint	page_level;
2221
2222
	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
2223
	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
2224
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2225
				MTR_MEMO_PAGE_X_FIX));
2226
	btr_search_drop_page_hash_index(page);
2227
2228
	node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
2229
	father_page = buf_frame_align(node_ptr);
2230
2231
	page_level = btr_page_get_level(page, mtr);
2232
2233
	lock_update_discard(page_get_supremum_rec(father_page), page);
2234
2235
	btr_page_set_level(father_page, page_level, mtr);
2236
2237
	/* Free the file page */
2238
	btr_page_free(index, page, mtr);
2239
2240
	if (buf_frame_get_page_no(father_page) == dict_index_get_page(index)) {
2241
		/* The father is the root page */
2242
2243
		btr_page_empty(father_page, mtr);
2244
2245
		/* We play safe and reset the free bits for the father */
2246
		ibuf_reset_free_bits(index, father_page);
2247
	} else {
2248
		ut_ad(page_get_n_recs(father_page) == 1);
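		/* The father now contains only the node pointer to the
		freed page, so it has itself become the only page on its
		level and is discarded recursively; the recursion stops at
		the root, which is simply emptied above. */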
2249
2250
		btr_discard_only_page_on_level(index, father_page, mtr);
2251
	}
2252
}
2253
2254
/*****************************************************************
2255
Discards a page from a B-tree. This is used to remove the last record from
2256
a B-tree page: the whole page must be removed at the same time. This cannot
2257
be used for the root page, which is allowed to be empty. */
2258
2259
void
2260
btr_discard_page(
2261
/*=============*/
2262
	btr_cur_t*	cursor,	/* in: cursor on the page to discard: not on
2263
				the root page */
2264
	mtr_t*		mtr)	/* in: mtr */
2265
{
2266
	dict_index_t*	index;
2267
	ulint		space;
2268
	ulint		left_page_no;
2269
	ulint		right_page_no;
2270
	page_t*		merge_page;
2271
	page_t*		page;
2272
	rec_t*		node_ptr;
2273
2274
	page = btr_cur_get_page(cursor);
2275
	index = btr_cur_get_index(cursor);
2276
2277
	ut_ad(dict_index_get_page(index) != buf_frame_get_page_no(page));
2278
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2279
				MTR_MEMO_X_LOCK));
2280
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2281
				MTR_MEMO_PAGE_X_FIX));
2282
	space = dict_index_get_space(index);
2283
2284
	/* Decide the page which will inherit the locks */
2285
2286
	left_page_no = btr_page_get_prev(page, mtr);
2287
	right_page_no = btr_page_get_next(page, mtr);
2288
2289
	if (left_page_no != FIL_NULL) {
2290
		merge_page = btr_page_get(space, left_page_no, RW_X_LATCH,
2291
					  mtr);
2292
#ifdef UNIV_BTR_DEBUG
2293
		ut_a(btr_page_get_next(merge_page, mtr)
2294
		     == buf_frame_get_page_no(page));
2295
#endif /* UNIV_BTR_DEBUG */
2296
	} else if (right_page_no != FIL_NULL) {
2297
		merge_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2298
					  mtr);
2299
#ifdef UNIV_BTR_DEBUG
2300
		ut_a(btr_page_get_prev(merge_page, mtr)
2301
		     == buf_frame_get_page_no(page));
2302
#endif /* UNIV_BTR_DEBUG */
2303
	} else {
2304
		btr_discard_only_page_on_level(index, page, mtr);
2305
2306
		return;
2307
	}
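	/* Unlike in btr_compress(), no records are moved to merge_page:
	it is only used as the page that inherits the lock information of
	the discarded page (and, when a leftmost non-leaf page is
	discarded, the minimum record mark below). */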
2308
2309
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
2310
	btr_search_drop_page_hash_index(page);
2311
2312
	if (left_page_no == FIL_NULL && btr_page_get_level(page, mtr) > 0) {
2313
2314
		/* We have to mark the leftmost node pointer on the right
2315
		side page as the predefined minimum record */
2316
2317
		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
2318
2319
		ut_ad(page_rec_is_user_rec(node_ptr));
2320
2321
		btr_set_min_rec_mark(node_ptr, page_is_comp(merge_page), mtr);
2322
	}
2323
2324
	btr_node_ptr_delete(index, page, mtr);
2325
2326
	/* Remove the page from the level list */
2327
	btr_level_list_remove(page, mtr);
2328
2329
	if (left_page_no != FIL_NULL) {
2330
		lock_update_discard(page_get_supremum_rec(merge_page), page);
2331
	} else {
2332
		lock_update_discard(page_rec_get_next(
2333
					    page_get_infimum_rec(merge_page)),
2334
				    page);
2335
	}
2336
2337
	/* Free the file page */
2338
	btr_page_free(index, page, mtr);
2339
2340
	ut_ad(btr_check_node_ptr(index, merge_page, mtr));
2341
}
2342
2343
#ifdef UNIV_BTR_PRINT
2344
/*****************************************************************
2345
Prints size info of a B-tree. */
2346
2347
void
2348
btr_print_size(
2349
/*===========*/
2350
	dict_index_t*	index)	/* in: index tree */
2351
{
2352
	page_t*		root;
2353
	fseg_header_t*	seg;
2354
	mtr_t		mtr;
2355
2356
	if (index->type & DICT_IBUF) {
2357
		fputs("Sorry, cannot print info of an ibuf tree:"
2358
		      " use ibuf functions\n", stderr);
2359
2360
		return;
2361
	}
2362
2363
	mtr_start(&mtr);
2364
2365
	root = btr_root_get(index, &mtr);
2366
2367
	seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
2368
2369
	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
2370
	fseg_print(seg, &mtr);
2371
2372
	if (!(index->type & DICT_UNIVERSAL)) {
2373
2374
		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
2375
2376
		fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
2377
		fseg_print(seg, &mtr);
2378
	}
2379
2380
	mtr_commit(&mtr);
2381
}
2382
2383
/****************************************************************
2384
Prints recursively index tree pages. */
2385
static
2386
void
2387
btr_print_recursive(
2388
/*================*/
2389
	dict_index_t*	index,	/* in: index tree */
2390
	page_t*		page,	/* in: index page */
2391
	ulint		width,	/* in: print this many entries from start
2392
				and end */
2393
	mem_heap_t**	heap,	/* in/out: heap for rec_get_offsets() */
2394
	ulint**		offsets,/* in/out: buffer for rec_get_offsets() */
2395
	mtr_t*		mtr)	/* in: mtr */
2396
{
2397
	page_cur_t	cursor;
2398
	ulint		n_recs;
2399
	ulint		i	= 0;
2400
	mtr_t		mtr2;
2401
	rec_t*		node_ptr;
2402
	page_t*		child;
2403
2404
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2405
				MTR_MEMO_PAGE_X_FIX));
2406
	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
2407
		(ulong) btr_page_get_level(page, mtr),
2408
		(ulong) buf_frame_get_page_no(page));
2409
2410
	page_print(page, index, width, width);
2411
2412
	n_recs = page_get_n_recs(page);
2413
2414
	page_cur_set_before_first(page, &cursor);
2415
	page_cur_move_to_next(&cursor);
2416
2417
	while (!page_cur_is_after_last(&cursor)) {
2418
2419
		if (0 == btr_page_get_level(page, mtr)) {
2420
2421
			/* If this is the leaf level, do nothing */
2422
2423
		} else if ((i <= width) || (i >= n_recs - width)) {
2424
2425
			mtr_start(&mtr2);
2426
2427
			node_ptr = page_cur_get_rec(&cursor);
2428
2429
			*offsets = rec_get_offsets(node_ptr, index, *offsets,
2430
						   ULINT_UNDEFINED, heap);
2431
			child = btr_node_ptr_get_child(node_ptr,
2432
						       *offsets, &mtr2);
2433
			btr_print_recursive(index, child, width,
2434
					    heap, offsets, &mtr2);
2435
			mtr_commit(&mtr2);
2436
		}
2437
2438
		page_cur_move_to_next(&cursor);
2439
		i++;
2440
	}
2441
}
2442
2443
/******************************************************************
2444
Prints directories and other info of all nodes in the tree. */
2445
2446
void
2447
btr_print_index(
2448
/*============*/
2449
	dict_index_t*	index,	/* in: index */
2450
	ulint		width)	/* in: print this many entries from start
2451
				and end */
2452
{
2453
	mtr_t		mtr;
2454
	page_t*		root;
2455
	mem_heap_t*	heap	= NULL;
2456
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2457
	ulint*		offsets	= offsets_;
2458
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2459
2460
	fputs("--------------------------\n"
2461
	      "INDEX TREE PRINT\n", stderr);
2462
2463
	mtr_start(&mtr);
2464
2465
	root = btr_root_get(index, &mtr);
2466
2467
	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
2468
	if (UNIV_LIKELY_NULL(heap)) {
2469
		mem_heap_free(heap);
2470
	}
2471
2472
	mtr_commit(&mtr);
2473
2474
	btr_validate_index(index, NULL);
2475
}
2476
#endif /* UNIV_BTR_PRINT */
2477
2478
#ifdef UNIV_DEBUG
2479
/****************************************************************
2480
Checks that the node pointer to a page is appropriate. */
2481
2482
ibool
2483
btr_check_node_ptr(
2484
/*===============*/
2485
				/* out: TRUE */
2486
	dict_index_t*	index,	/* in: index tree */
2487
	page_t*		page,	/* in: index page */
2488
	mtr_t*		mtr)	/* in: mtr */
2489
{
2490
	mem_heap_t*	heap;
2491
	rec_t*		node_ptr;
2492
	dtuple_t*	node_ptr_tuple;
2493
2494
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
2495
				MTR_MEMO_PAGE_X_FIX));
2496
	if (dict_index_get_page(index) == buf_frame_get_page_no(page)) {
2497
2498
		return(TRUE);
2499
	}
2500
2501
	node_ptr = btr_page_get_father_node_ptr(index, page, mtr);
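	/* Only non-leaf pages are compared below against a freshly built
	node pointer tuple; for a leaf page it is enough that a father
	node pointer was found above. */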
2502
2503
	if (btr_page_get_level(page, mtr) == 0) {
2504
2505
		return(TRUE);
2506
	}
2507
2508
	heap = mem_heap_create(256);
2509
2510
	node_ptr_tuple = dict_index_build_node_ptr(
2511
		index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
2512
		btr_page_get_level(page, mtr));
2513
2514
	ut_a(!cmp_dtuple_rec(node_ptr_tuple, node_ptr,
2515
			     rec_get_offsets(node_ptr, index,
2516
					     NULL, ULINT_UNDEFINED, &heap)));
2517
2518
	mem_heap_free(heap);
2519
2520
	return(TRUE);
2521
}
2522
#endif /* UNIV_DEBUG */
2523
2524
/****************************************************************
2525
Display identification information for a record. */
2526
static
2527
void
2528
btr_index_rec_validate_report(
2529
/*==========================*/
2530
	page_t*		page,	/* in: index page */
2531
	rec_t*		rec,	/* in: index record */
2532
	dict_index_t*	index)	/* in: index */
2533
{
2534
	fputs("InnoDB: Record in ", stderr);
2535
	dict_index_name_print(stderr, NULL, index);
2536
	fprintf(stderr, ", page %lu, at offset %lu\n",
2537
		buf_frame_get_page_no(page), (ulint)(rec - page));
2538
}
2539
2540
/****************************************************************
2541
Checks the size and number of fields in a record based on the definition of
2542
the index. */
2543
2544
ibool
2545
btr_index_rec_validate(
2546
/*===================*/
2547
					/* out: TRUE if ok */
2548
	rec_t*		rec,		/* in: index record */
2549
	dict_index_t*	index,		/* in: index */
2550
	ibool		dump_on_error)	/* in: TRUE if the function
2551
					should print hex dump of record
2552
					and page on error */
2553
{
2554
	ulint		len;
2555
	ulint		n;
2556
	ulint		i;
2557
	page_t*		page;
2558
	mem_heap_t*	heap	= NULL;
2559
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2560
	ulint*		offsets	= offsets_;
2561
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
2562
2563
	page = buf_frame_align(rec);
2564
2565
	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
2566
		/* The insert buffer index tree can contain records from any
2567
		other index: we cannot check the number of fields or
2568
		their length */
2569
2570
		return(TRUE);
2571
	}
2572
2573
	if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
2574
			  != dict_table_is_comp(index->table))) {
2575
		btr_index_rec_validate_report(page, rec, index);
2576
		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
2577
			(ulong) !!page_is_comp(page),
2578
			(ulong) dict_table_is_comp(index->table));
2579
2580
		return(FALSE);
2581
	}
2582
2583
	n = dict_index_get_n_fields(index);
2584
2585
	if (!page_is_comp(page)
2586
	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n)) {
2587
		btr_index_rec_validate_report(page, rec, index);
2588
		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
2589
			(ulong) rec_get_n_fields_old(rec), (ulong) n);
2590
2591
		if (dump_on_error) {
2592
			buf_page_print(page);
2593
2594
			fputs("InnoDB: corrupt record ", stderr);
2595
			rec_print_old(stderr, rec);
2596
			putc('\n', stderr);
2597
		}
2598
		return(FALSE);
2599
	}
2600
2601
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
2602
2603
	for (i = 0; i < n; i++) {
2604
		ulint	fixed_size = dict_col_get_fixed_size(
2605
			dict_index_get_nth_col(index, i));
2606
2607
		rec_get_nth_field(rec, offsets, i, &len);
2608
2609
		/* Note that if fixed_size != 0, it equals the
2610
		length of a fixed-size column in the clustered index.
2611
		A prefix index of the column is of fixed, but different
2612
		length.  When fixed_size == 0, prefix_len is the maximum
2613
		length of the prefix index column. */
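		/* Two error cases are checked below: a full-column
		fixed-size field whose stored length differs from that
		fixed size, and a column prefix field whose stored length
		exceeds the defined prefix length. */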
2614
2615
		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
2616
		     && len != UNIV_SQL_NULL && fixed_size
2617
		     && len != fixed_size)
2618
		    || (dict_index_get_nth_field(index, i)->prefix_len > 0
2619
			&& len != UNIV_SQL_NULL
2620
			&& len
2621
			> dict_index_get_nth_field(index, i)->prefix_len)) {
2622
2623
			btr_index_rec_validate_report(page, rec, index);
2624
			fprintf(stderr,
2625
				"InnoDB: field %lu len is %lu,"
2626
				" should be %lu\n",
2627
				(ulong) i, (ulong) len, (ulong) fixed_size);
2628
2629
			if (dump_on_error) {
2630
				buf_page_print(page);
2631
2632
				fputs("InnoDB: corrupt record ", stderr);
2633
				rec_print_new(stderr, rec, offsets);
2634
				putc('\n', stderr);
2635
			}
2636
			if (UNIV_LIKELY_NULL(heap)) {
2637
				mem_heap_free(heap);
2638
			}
2639
			return(FALSE);
2640
		}
2641
	}
2642
2643
	if (UNIV_LIKELY_NULL(heap)) {
2644
		mem_heap_free(heap);
2645
	}
2646
	return(TRUE);
2647
}
2648
2649
/****************************************************************
2650
Checks the size and number of fields in records based on the definition of
2651
the index. */
2652
static
2653
ibool
2654
btr_index_page_validate(
2655
/*====================*/
2656
				/* out: TRUE if ok */
2657
	page_t*		page,	/* in: index page */
2658
	dict_index_t*	index)	/* in: index */
2659
{
2660
	page_cur_t	cur;
2661
	ibool		ret	= TRUE;
2662
2663
	page_cur_set_before_first(page, &cur);
2664
	page_cur_move_to_next(&cur);
2665
2666
	for (;;) {
2667
		if (page_cur_is_after_last(&cur)) {
2668
2669
			break;
2670
		}
2671
2672
		if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
2673
2674
			return(FALSE);
2675
		}
2676
2677
		page_cur_move_to_next(&cur);
2678
	}
2679
2680
	return(ret);
2681
}
2682
2683
/****************************************************************
2684
Report an error on one page of an index tree. */
2685
static
2686
void
2687
btr_validate_report1(
2688
/*=================*/
2689
2690
	dict_index_t*	index,	/* in: index */
2691
	ulint		level,	/* in: B-tree level */
2692
	page_t*		page)	/* in: index page */
2693
{
2694
	fprintf(stderr, "InnoDB: Error in page %lu of ",
2695
		buf_frame_get_page_no(page));
2696
	dict_index_name_print(stderr, NULL, index);
2697
	if (level) {
2698
		fprintf(stderr, ", index tree level %lu", level);
2699
	}
2700
	putc('\n', stderr);
2701
}
2702
2703
/****************************************************************
2704
Report an error on two pages of an index tree. */
2705
static
2706
void
2707
btr_validate_report2(
2708
/*=================*/
2709
2710
	dict_index_t*	index,	/* in: index */
2711
	ulint		level,	/* in: B-tree level */
2712
	page_t*		page1,	/* in: first index page */
2713
	page_t*		page2)	/* in: second index page */
2714
{
2715
	fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
2716
		buf_frame_get_page_no(page1),
2717
		buf_frame_get_page_no(page2));
2718
	dict_index_name_print(stderr, NULL, index);
2719
	if (level) {
2720
		fprintf(stderr, ", index tree level %lu", level);
2721
	}
2722
	putc('\n', stderr);
2723
}
2724
2725
/****************************************************************
2726
Validates index tree level. */
2727
static
2728
ibool
2729
btr_validate_level(
2730
/*===============*/
2731
				/* out: TRUE if ok */
2732
	dict_index_t*	index,	/* in: index tree */
2733
	trx_t*		trx,	/* in: transaction or NULL */
2734
	ulint		level)	/* in: level number */
2735
{
2736
	ulint		space;
2737
	page_t*		page;
2738
	page_t*		right_page = 0; /* remove warning */
2739
	page_t*		father_page;
2740
	page_t*		right_father_page;
2741
	rec_t*		node_ptr;
2742
	rec_t*		right_node_ptr;
2743
	rec_t*		rec;
2744
	ulint		right_page_no;
2745
	ulint		left_page_no;
2746
	page_cur_t	cursor;
2747
	dtuple_t*	node_ptr_tuple;
2748
	ibool		ret	= TRUE;
2749
	mtr_t		mtr;
2750
	mem_heap_t*	heap	= mem_heap_create(256);
2751
	ulint*		offsets	= NULL;
2752
	ulint*		offsets2= NULL;
2753
2754
	mtr_start(&mtr);
2755
2756
	mtr_x_lock(dict_index_get_lock(index), &mtr);
2757
2758
	page = btr_root_get(index, &mtr);
2759
2760
	space = buf_frame_get_space_id(page);
2761
2762
	while (level != btr_page_get_level(page, &mtr)) {
2763
2764
		ut_a(btr_page_get_level(page, &mtr) > 0);
2765
2766
		page_cur_set_before_first(page, &cursor);
2767
		page_cur_move_to_next(&cursor);
2768
2769
		node_ptr = page_cur_get_rec(&cursor);
2770
		offsets = rec_get_offsets(node_ptr, index, offsets,
2771
					  ULINT_UNDEFINED, &heap);
2772
		page = btr_node_ptr_get_child(node_ptr, offsets, &mtr);
2773
	}
2774
2775
	/* Now we are on the desired level. Loop through the pages on that
2776
	level. */
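	/* Each loop iteration validates one page: its own structure
	(page_validate), the ordering of records across the boundary to
	the right sibling, and, unless the page is the root, the
	consistency of the father node pointers; the mini-transaction is
	then committed and the loop follows the FIL_PAGE_NEXT link to the
	right sibling. */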
2777
loop:
2778
	if (trx_is_interrupted(trx)) {
2779
		mtr_commit(&mtr);
2780
		mem_heap_free(heap);
2781
		return(ret);
2782
	}
2783
	mem_heap_empty(heap);
2784
	offsets = offsets2 = NULL;
2785
	mtr_x_lock(dict_index_get_lock(index), &mtr);
2786
2787
	/* Check ordering etc. of records */
2788
2789
	if (!page_validate(page, index)) {
2790
		btr_validate_report1(index, level, page);
2791
2792
		ret = FALSE;
2793
	} else if (level == 0) {
2794
		/* We are on level 0. Check that the records have the right
2795
	number of fields and that the field lengths are correct. */
2796
2797
		if (!btr_index_page_validate(page, index)) {
2798
2799
			ret = FALSE;
2800
		}
2801
	}
2802
2803
	ut_a(btr_page_get_level(page, &mtr) == level);
2804
2805
	right_page_no = btr_page_get_next(page, &mtr);
2806
	left_page_no = btr_page_get_prev(page, &mtr);
2807
2808
	ut_a((page_get_n_recs(page) > 0)
2809
	     || ((level == 0)
2810
		 && (buf_frame_get_page_no(page)
2811
		     == dict_index_get_page(index))));
2812
2813
	if (right_page_no != FIL_NULL) {
2814
		rec_t*	right_rec;
2815
		right_page = btr_page_get(space, right_page_no, RW_X_LATCH,
2816
					  &mtr);
2817
		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
2818
				  != buf_frame_get_page_no(page))) {
2819
			btr_validate_report2(index, level, page, right_page);
2820
			fputs("InnoDB: broken FIL_PAGE_NEXT"
2821
			      " or FIL_PAGE_PREV links\n", stderr);
2822
			buf_page_print(page);
2823
			buf_page_print(right_page);
2824
2825
			ret = FALSE;
2826
		}
2827
2828
		if (UNIV_UNLIKELY(page_is_comp(right_page)
2829
				  != page_is_comp(page))) {
2830
			btr_validate_report2(index, level, page, right_page);
2831
			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
2832
			buf_page_print(page);
2833
			buf_page_print(right_page);
2834
2835
			ret = FALSE;
2836
2837
			goto node_ptr_fails;
2838
		}
2839
2840
		rec = page_rec_get_prev(page_get_supremum_rec(page));
2841
		right_rec = page_rec_get_next(page_get_infimum_rec(
2842
						      right_page));
2843
		offsets = rec_get_offsets(rec, index,
2844
					  offsets, ULINT_UNDEFINED, &heap);
2845
		offsets2 = rec_get_offsets(right_rec, index,
2846
					   offsets2, ULINT_UNDEFINED, &heap);
2847
		if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
2848
					      offsets, offsets2,
2849
					      index) >= 0)) {
2850
2851
			btr_validate_report2(index, level, page, right_page);
2852
2853
			fputs("InnoDB: records in wrong order"
2854
			      " on adjacent pages\n", stderr);
2855
2856
			buf_page_print(page);
2857
			buf_page_print(right_page);
2858
2859
			fputs("InnoDB: record ", stderr);
2860
			rec = page_rec_get_prev(page_get_supremum_rec(page));
2861
			rec_print(stderr, rec, index);
2862
			putc('\n', stderr);
2863
			fputs("InnoDB: record ", stderr);
2864
			rec = page_rec_get_next(
2865
				page_get_infimum_rec(right_page));
2866
			rec_print(stderr, rec, index);
2867
			putc('\n', stderr);
2868
2869
			ret = FALSE;
2870
		}
2871
	}
2872
2873
	if (level > 0 && left_page_no == FIL_NULL) {
2874
		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
2875
			     page_rec_get_next(page_get_infimum_rec(page)),
2876
			     page_is_comp(page)));
2877
	}
2878
2879
	if (buf_frame_get_page_no(page) != dict_index_get_page(index)) {
2880
2881
		/* Check father node pointers */
2882
2883
		node_ptr = btr_page_get_father_node_ptr(index, page, &mtr);
2884
		father_page = buf_frame_align(node_ptr);
2885
		offsets	= rec_get_offsets(node_ptr, index,
2886
					  offsets, ULINT_UNDEFINED, &heap);
2887
2888
		if (btr_node_ptr_get_child_page_no(node_ptr, offsets)
2889
		    != buf_frame_get_page_no(page)
2890
		    || node_ptr != btr_page_get_father_for_rec(
2891
			    index, page,
2892
			    page_rec_get_prev(page_get_supremum_rec(page)),
2893
			    &mtr)) {
2894
			btr_validate_report1(index, level, page);
2895
2896
			fputs("InnoDB: node pointer to the page is wrong\n",
2897
			      stderr);
2898
2899
			buf_page_print(father_page);
2900
			buf_page_print(page);
2901
2902
			fputs("InnoDB: node ptr ", stderr);
2903
			rec_print_new(stderr, node_ptr, offsets);
2904
2905
			fprintf(stderr, "\n"
2906
				"InnoDB: node ptr child page n:o %lu\n",
2907
				(unsigned long) btr_node_ptr_get_child_page_no
2908
				(node_ptr, offsets));
2909
2910
			fputs("InnoDB: record on page ", stderr);
2911
			rec = btr_page_get_father_for_rec(
2912
				index, page,
2913
				page_rec_get_prev(page_get_supremum_rec(page)),
2914
				&mtr);
2915
			rec_print(stderr, rec, index);
2916
			putc('\n', stderr);
2917
			ret = FALSE;
2918
2919
			goto node_ptr_fails;
2920
		}
2921
2922
		if (btr_page_get_level(page, &mtr) > 0) {
2923
			offsets	= rec_get_offsets(node_ptr, index,
2924
						  offsets, ULINT_UNDEFINED,
2925
						  &heap);
2926
2927
			node_ptr_tuple = dict_index_build_node_ptr(
2928
				index,
2929
				page_rec_get_next(page_get_infimum_rec(page)),
2930
				0, heap, btr_page_get_level(page, &mtr));
2931
2932
			if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
2933
					   offsets)) {
2934
				rec_t*	first_rec	= page_rec_get_next(
2935
					page_get_infimum_rec(page));
2936
2937
				btr_validate_report1(index, level, page);
2938
2939
				buf_page_print(father_page);
2940
				buf_page_print(page);
2941
2942
				fputs("InnoDB: Error: node ptrs differ"
2943
				      " on levels > 0\n"
2944
				      "InnoDB: node ptr ", stderr);
2945
				rec_print_new(stderr, node_ptr, offsets);
2946
				fputs("InnoDB: first rec ", stderr);
2947
				rec_print(stderr, first_rec, index);
2948
				putc('\n', stderr);
2949
				ret = FALSE;
2950
2951
				goto node_ptr_fails;
2952
			}
2953
		}
2954
2955
		if (left_page_no == FIL_NULL) {
2956
			ut_a(node_ptr == page_rec_get_next(
2957
				     page_get_infimum_rec(father_page)));
2958
			ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
2959
		}
2960
2961
		if (right_page_no == FIL_NULL) {
2962
			ut_a(node_ptr == page_rec_get_prev(
2963
				     page_get_supremum_rec(father_page)));
2964
			ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
2965
		} else {
2966
			right_node_ptr = btr_page_get_father_node_ptr(
2967
				index, right_page, &mtr);
2968
			if (page_rec_get_next(node_ptr)
2969
			    != page_get_supremum_rec(father_page)) {
2970
2971
				if (right_node_ptr
2972
				    != page_rec_get_next(node_ptr)) {
2973
					ret = FALSE;
2974
					fputs("InnoDB: node pointer to"
2975
					      " the right page is wrong\n",
2976
					      stderr);
2977
2978
					btr_validate_report1(index, level,
2979
							     page);
2980
2981
					buf_page_print(father_page);
2982
					buf_page_print(page);
2983
					buf_page_print(right_page);
2984
				}
2985
			} else {
2986
				right_father_page = buf_frame_align(
2987
					right_node_ptr);
2988
2989
				if (right_node_ptr != page_rec_get_next(
2990
					    page_get_infimum_rec(
2991
						    right_father_page))) {
2992
					ret = FALSE;
2993
					fputs("InnoDB: node pointer 2 to"
2994
					      " the right page is wrong\n",
2995
					      stderr);
2996
2997
					btr_validate_report1(index, level,
2998
							     page);
2999
3000
					buf_page_print(father_page);
3001
					buf_page_print(right_father_page);
3002
					buf_page_print(page);
3003
					buf_page_print(right_page);
3004
				}
3005
3006
				if (buf_frame_get_page_no(right_father_page)
3007
				    != btr_page_get_next(father_page, &mtr)) {
3008
3009
					ret = FALSE;
3010
					fputs("InnoDB: node pointer 3 to"
3011
					      " the right page is wrong\n",
3012
					      stderr);
3013
3014
					btr_validate_report1(index, level,
3015
							     page);
3016
3017
					buf_page_print(father_page);
3018
					buf_page_print(right_father_page);
3019
					buf_page_print(page);
3020
					buf_page_print(right_page);
3021
				}
3022
			}
3023
		}
3024
	}
3025
3026
node_ptr_fails:
3027
	/* Commit the mini-transaction to release the latch on 'page'.
3028
	Re-acquire the latch on right_page, which will become 'page'
3029
	on the next loop.  The page has already been checked. */
3030
	mtr_commit(&mtr);
3031
3032
	if (right_page_no != FIL_NULL) {
3033
		mtr_start(&mtr);
3034
3035
		page = btr_page_get(space, right_page_no, RW_X_LATCH, &mtr);
3036
3037
		goto loop;
3038
	}
3039
3040
	mem_heap_free(heap);
3041
	return(ret);
3042
}
3043
3044
/******************************************************************
3045
Checks the consistency of an index tree. */
3046
3047
ibool
3048
btr_validate_index(
3049
/*===============*/
3050
				/* out: TRUE if ok */
3051
	dict_index_t*	index,	/* in: index */
3052
	trx_t*		trx)	/* in: transaction or NULL */
3053
{
3054
	mtr_t	mtr;
3055
	page_t*	root;
3056
	ulint	i;
3057
	ulint	n;
3058
3059
	mtr_start(&mtr);
3060
	mtr_x_lock(dict_index_get_lock(index), &mtr);
3061
3062
	root = btr_root_get(index, &mtr);
3063
	n = btr_page_get_level(root, &mtr);
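	/* The root is on level n: validate every level from the root
	down to the leaf level 0, unless the caller's transaction is
	interrupted. */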
3064
3065
	for (i = 0; i <= n && !trx_is_interrupted(trx); i++) {
3066
		if (!btr_validate_level(index, trx, n - i)) {
3067
3068
			mtr_commit(&mtr);
3069
3070
			return(FALSE);
3071
		}
3072
	}
3073
3074
	mtr_commit(&mtr);
3075
3076
	return(TRUE);
3077
}