1
/******************************************************
4
All changes that row operations make to a B-tree or the records
5
there must go through this module! Undo log records are written here
6
of every modify or insert of a clustered index record.
9
To make sure we do not run out of disk space during a pessimistic
10
insert or update, we have to reserve 2 x the height of the index tree
11
many pages in the tablespace before we start the operation, because
12
if leaf splitting has been started, it is difficult to undo, except
13
by crashing the database and doing a roll-forward.
15
(c) 1994-2001 Innobase Oy
17
Created 10/16/1994 Heikki Tuuri
18
*******************************************************/
26
#include "page0page.h"
35
#include "trx0roll.h" /* trx_is_recv() */
39
#include "ibuf0ibuf.h"
40
#include "lock0lock.h"
44
/* If the following is set to TRUE, this module prints a lot of
45
trace information of individual record operations */
46
UNIV_INTERN ibool btr_cur_print_record_ops = FALSE;
47
#endif /* UNIV_DEBUG */
49
UNIV_INTERN ulint btr_cur_n_non_sea = 0;
50
UNIV_INTERN ulint btr_cur_n_sea = 0;
51
UNIV_INTERN ulint btr_cur_n_non_sea_old = 0;
52
UNIV_INTERN ulint btr_cur_n_sea_old = 0;
54
/* In the optimistic insert, if the insert does not fit, but this much space
55
can be released by page reorganize, then it is reorganized */
57
#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
59
/* The structure of a BLOB part header */
60
/*--------------------------------------*/
61
#define BTR_BLOB_HDR_PART_LEN 0 /* BLOB part len on this
63
#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /* next BLOB part page no,
65
/*--------------------------------------*/
66
#define BTR_BLOB_HDR_SIZE 8
68
/* A BLOB field reference full of zero, for use in assertions and tests.
69
Initially, BLOB field references are set to zero, in
70
dtuple_convert_big_rec(). */
71
UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
73
/***********************************************************************
74
Marks all extern fields in a record as owned by the record. This function
75
should be called if the delete mark of a record is removed: a not delete
76
marked record always owns all its extern fields. */
79
btr_cur_unmark_extern_fields(
80
/*=========================*/
81
page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
82
part will be updated, or NULL */
83
rec_t* rec, /* in/out: record in a clustered index */
84
dict_index_t* index, /* in: index of the page */
85
const ulint* offsets,/* in: array returned by rec_get_offsets() */
86
mtr_t* mtr); /* in: mtr, or NULL if not logged */
87
/***********************************************************************
88
Adds path information to the cursor for the current page, for which
89
the binary search has been performed. */
92
btr_cur_add_path_info(
93
/*==================*/
94
btr_cur_t* cursor, /* in: cursor positioned on a page */
95
ulint height, /* in: height of the page in tree;
97
ulint root_height); /* in: root node height in tree */
98
/***************************************************************
99
Frees the externally stored fields for a record, if the field is mentioned
100
in the update vector. */
103
btr_rec_free_updated_extern_fields(
104
/*===============================*/
105
dict_index_t* index, /* in: index of rec; the index tree MUST be
107
rec_t* rec, /* in: record */
108
page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
109
part will be updated, or NULL */
110
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
111
const upd_t* update, /* in: update vector */
112
enum trx_rb_ctx rb_ctx, /* in: rollback context */
113
mtr_t* mtr); /* in: mini-transaction handle which contains
114
an X-latch to record page and to the tree */
115
/***************************************************************
116
Frees the externally stored fields for a record. */
119
btr_rec_free_externally_stored_fields(
120
/*==================================*/
121
dict_index_t* index, /* in: index of the data, the index
122
tree MUST be X-latched */
123
rec_t* rec, /* in: record */
124
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
125
page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
126
part will be updated, or NULL */
127
enum trx_rb_ctx rb_ctx, /* in: rollback context */
128
mtr_t* mtr); /* in: mini-transaction handle which contains
129
an X-latch to record page and to the index
131
/***************************************************************
132
Gets the externally stored size of a record, in units of a database page. */
135
btr_rec_get_externally_stored_len(
136
/*==============================*/
137
/* out: externally stored part,
138
in units of a database page */
139
rec_t* rec, /* in: record */
140
const ulint* offsets);/* in: array returned by rec_get_offsets() */
142
/**********************************************************
143
The following function is used to set the deleted bit of a record. */
146
btr_rec_set_deleted_flag(
147
/*=====================*/
148
/* out: TRUE on success;
149
FALSE on page_zip overflow */
150
rec_t* rec, /* in/out: physical record */
151
page_zip_des_t* page_zip,/* in/out: compressed page (or NULL) */
152
ulint flag) /* in: nonzero if delete marked */
154
if (page_rec_is_comp(rec)) {
155
rec_set_deleted_flag_new(rec, page_zip, flag);
158
rec_set_deleted_flag_old(rec, flag);
162
/*==================== B-TREE SEARCH =========================*/
164
/************************************************************************
165
Latches the leaf page or pages requested. */
168
btr_cur_latch_leaves(
169
/*=================*/
170
page_t* page, /* in: leaf page where the search
172
ulint space, /* in: space id */
173
ulint zip_size, /* in: compressed page size in bytes
174
or 0 for uncompressed pages */
175
ulint page_no, /* in: page number of the leaf */
176
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
177
btr_cur_t* cursor, /* in: cursor */
178
mtr_t* mtr) /* in: mtr */
183
buf_block_t* get_block;
187
switch (latch_mode) {
188
case BTR_SEARCH_LEAF:
189
case BTR_MODIFY_LEAF:
190
mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
191
get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
192
#ifdef UNIV_BTR_DEBUG
193
ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
194
#endif /* UNIV_BTR_DEBUG */
195
get_block->check_index_page_at_flush = TRUE;
197
case BTR_MODIFY_TREE:
198
/* x-latch also brothers from left to right */
199
left_page_no = btr_page_get_prev(page, mtr);
201
if (left_page_no != FIL_NULL) {
202
get_block = btr_block_get(space, zip_size,
205
#ifdef UNIV_BTR_DEBUG
206
ut_a(page_is_comp(get_block->frame)
207
== page_is_comp(page));
208
ut_a(btr_page_get_next(get_block->frame, mtr)
209
== page_get_page_no(page));
210
#endif /* UNIV_BTR_DEBUG */
211
get_block->check_index_page_at_flush = TRUE;
214
get_block = btr_block_get(space, zip_size, page_no,
216
#ifdef UNIV_BTR_DEBUG
217
ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
218
#endif /* UNIV_BTR_DEBUG */
219
get_block->check_index_page_at_flush = TRUE;
221
right_page_no = btr_page_get_next(page, mtr);
223
if (right_page_no != FIL_NULL) {
224
get_block = btr_block_get(space, zip_size,
227
#ifdef UNIV_BTR_DEBUG
228
ut_a(page_is_comp(get_block->frame)
229
== page_is_comp(page));
230
ut_a(btr_page_get_prev(get_block->frame, mtr)
231
== page_get_page_no(page));
232
#endif /* UNIV_BTR_DEBUG */
233
get_block->check_index_page_at_flush = TRUE;
238
case BTR_SEARCH_PREV:
239
case BTR_MODIFY_PREV:
240
mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
241
/* latch also left brother */
242
left_page_no = btr_page_get_prev(page, mtr);
244
if (left_page_no != FIL_NULL) {
245
get_block = btr_block_get(space, zip_size,
246
left_page_no, mode, mtr);
247
cursor->left_block = get_block;
248
#ifdef UNIV_BTR_DEBUG
249
ut_a(page_is_comp(get_block->frame)
250
== page_is_comp(page));
251
ut_a(btr_page_get_next(get_block->frame, mtr)
252
== page_get_page_no(page));
253
#endif /* UNIV_BTR_DEBUG */
254
get_block->check_index_page_at_flush = TRUE;
257
get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
258
#ifdef UNIV_BTR_DEBUG
259
ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
260
#endif /* UNIV_BTR_DEBUG */
261
get_block->check_index_page_at_flush = TRUE;
268
/************************************************************************
269
Searches an index tree and positions a tree cursor on a given level.
270
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
271
to node pointer page number fields on the upper levels of the tree!
272
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
273
cursor->up_match and cursor->low_match both will have sensible values.
274
If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
276
If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
277
search tuple should be performed in the B-tree. InnoDB does an insert
278
immediately after the cursor. Thus, the cursor may end up on a user record,
279
or on a page infimum record. */
282
btr_cur_search_to_nth_level(
283
/*========================*/
284
dict_index_t* index, /* in: index */
285
ulint level, /* in: the tree level of search */
286
const dtuple_t* tuple, /* in: data tuple; NOTE: n_fields_cmp in
287
tuple must be set so that it cannot get
288
compared to the node ptr page number field! */
289
ulint mode, /* in: PAGE_CUR_L, ...;
290
Inserts should always be made using
291
PAGE_CUR_LE to search the position! */
292
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with
293
BTR_INSERT and BTR_ESTIMATE;
294
cursor->left_block is used to store a pointer
295
to the left neighbor page, in the cases
296
BTR_SEARCH_PREV and BTR_MODIFY_PREV;
297
NOTE that if has_search_latch
298
is != 0, we maybe do not have a latch set
299
on the cursor page, we assume
300
the caller uses his search latch
301
to protect the record! */
302
btr_cur_t* cursor, /* in/out: tree cursor; the cursor page is
303
s- or x-latched, but see also above! */
304
ulint has_search_latch,/* in: info on the latch mode the
305
caller currently has on btr_search_latch:
307
mtr_t* mtr) /* in: mtr */
309
page_cur_t* page_cursor;
323
ulint insert_planned;
326
ulint ignore_sec_unique;
327
ulint root_height = 0; /* remove warning */
331
mem_heap_t* heap = NULL;
332
ulint offsets_[REC_OFFS_NORMAL_SIZE];
333
ulint* offsets = offsets_;
334
rec_offs_init(offsets_);
335
/* Currently, PAGE_CUR_LE is the only search mode used for searches
336
ending to upper levels */
338
ut_ad(level == 0 || mode == PAGE_CUR_LE);
339
ut_ad(dict_index_check_search_tuple(index, tuple));
340
ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
341
ut_ad(dtuple_check_typed(tuple));
344
cursor->up_match = ULINT_UNDEFINED;
345
cursor->low_match = ULINT_UNDEFINED;
347
insert_planned = latch_mode & BTR_INSERT;
348
estimate = latch_mode & BTR_ESTIMATE;
349
ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
350
latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
351
| BTR_IGNORE_SEC_UNIQUE);
353
ut_ad(!insert_planned || (mode == PAGE_CUR_LE));
355
cursor->flag = BTR_CUR_BINARY;
356
cursor->index = index;
358
#ifndef BTR_CUR_ADAPT
361
info = btr_search_get_info(index);
363
guess = info->root_guess;
365
#ifdef BTR_CUR_HASH_ADAPT
367
#ifdef UNIV_SEARCH_PERF_STAT
370
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
371
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
373
#ifdef PAGE_CUR_LE_OR_EXTENDS
374
&& mode != PAGE_CUR_LE_OR_EXTENDS
375
#endif /* PAGE_CUR_LE_OR_EXTENDS */
376
&& !UNIV_UNLIKELY(btr_search_disabled)
377
&& btr_search_guess_on_hash(index, info, tuple, mode,
379
has_search_latch, mtr)) {
381
/* Search using the hash index succeeded */
383
ut_ad(cursor->up_match != ULINT_UNDEFINED
384
|| mode != PAGE_CUR_GE);
385
ut_ad(cursor->up_match != ULINT_UNDEFINED
386
|| mode != PAGE_CUR_LE);
387
ut_ad(cursor->low_match != ULINT_UNDEFINED
388
|| mode != PAGE_CUR_LE);
393
#endif /* BTR_CUR_HASH_ADAPT */
394
#endif /* BTR_CUR_ADAPT */
397
/* If the hash search did not succeed, do binary search down the
400
if (has_search_latch) {
401
/* Release possible search latch to obey latching order */
402
rw_lock_s_unlock(&btr_search_latch);
405
/* Store the position of the tree latch we push to mtr so that we
406
know how to release it when we have latched leaf node(s) */
408
savepoint = mtr_set_savepoint(mtr);
410
if (latch_mode == BTR_MODIFY_TREE) {
411
mtr_x_lock(dict_index_get_lock(index), mtr);
413
} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
415
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
418
mtr_s_lock(dict_index_get_lock(index), mtr);
421
page_cursor = btr_cur_get_page_cur(cursor);
423
space = dict_index_get_space(index);
424
page_no = dict_index_get_page(index);
431
height = ULINT_UNDEFINED;
432
rw_latch = RW_NO_LATCH;
435
/* We use these modified search modes on non-leaf levels of the
436
B-tree. These let us end up in the right B-tree leaf. In that leaf
437
we use the original search mode. */
441
page_mode = PAGE_CUR_L;
444
page_mode = PAGE_CUR_LE;
447
#ifdef PAGE_CUR_LE_OR_EXTENDS
448
ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
449
|| mode == PAGE_CUR_LE_OR_EXTENDS);
450
#else /* PAGE_CUR_LE_OR_EXTENDS */
451
ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
452
#endif /* PAGE_CUR_LE_OR_EXTENDS */
457
/* Loop and search until we arrive at the desired level */
463
zip_size = dict_table_zip_size(index->table);
465
block = buf_page_get_gen(space, zip_size, page_no,
466
rw_latch, guess, buf_mode,
467
__FILE__, __LINE__, mtr);
469
/* This must be a search to perform an insert;
470
try insert to the insert buffer */
472
ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
473
ut_ad(insert_planned);
476
if (ibuf_should_try(index, ignore_sec_unique)
477
&& ibuf_insert(tuple, index, space, zip_size,
478
page_no, cursor->thr)) {
479
/* Insertion to the insert buffer succeeded */
480
cursor->flag = BTR_CUR_INSERT_TO_IBUF;
481
if (UNIV_LIKELY_NULL(heap)) {
487
/* Insert to the insert buffer did not succeed:
495
page = buf_block_get_frame(block);
496
#ifdef UNIV_ZIP_DEBUG
497
if (rw_latch != RW_NO_LATCH) {
498
const page_zip_des_t* page_zip
499
= buf_block_get_page_zip(block);
500
ut_a(!page_zip || page_zip_validate(page_zip, page));
502
#endif /* UNIV_ZIP_DEBUG */
504
block->check_index_page_at_flush = TRUE;
506
if (rw_latch != RW_NO_LATCH) {
507
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
510
ut_ad(0 == ut_dulint_cmp(index->id,
511
btr_page_get_index_id(page)));
513
if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
514
/* We are in the root node */
516
height = btr_page_get_level(page, mtr);
517
root_height = height;
518
cursor->tree_height = root_height + 1;
520
if (block != guess) {
521
info->root_guess = block;
527
if (rw_latch == RW_NO_LATCH) {
529
btr_cur_latch_leaves(page, space, zip_size,
534
if ((latch_mode != BTR_MODIFY_TREE)
535
&& (latch_mode != BTR_CONT_MODIFY_TREE)) {
537
/* Release the tree s-latch */
539
mtr_release_s_latch_at_savepoint(
541
dict_index_get_lock(index));
547
page_cur_search_with_match(block, index, tuple, page_mode,
548
&up_match, &up_bytes,
549
&low_match, &low_bytes,
553
btr_cur_add_path_info(cursor, height, root_height);
556
/* If this is the desired level, leave the loop */
558
ut_ad(height == btr_page_get_level(
559
page_cur_get_page(page_cursor), mtr));
561
if (level == height) {
564
/* x-latch the page */
565
page = btr_page_get(space, zip_size,
566
page_no, RW_X_LATCH, mtr);
567
ut_a((ibool)!!page_is_comp(page)
568
== dict_table_is_comp(index->table));
578
if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) {
580
rw_latch = latch_mode;
583
&& ibuf_should_try(index, ignore_sec_unique)) {
585
/* Try insert to the insert buffer if the
586
page is not in the buffer pool */
588
buf_mode = BUF_GET_IF_IN_POOL;
594
node_ptr = page_cur_get_rec(page_cursor);
595
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
596
ULINT_UNDEFINED, &heap);
597
/* Go to the child node */
598
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
601
if (UNIV_LIKELY_NULL(heap)) {
606
cursor->low_match = low_match;
607
cursor->low_bytes = low_bytes;
608
cursor->up_match = up_match;
609
cursor->up_bytes = up_bytes;
612
if (!UNIV_UNLIKELY(btr_search_disabled)) {
614
btr_search_info_update(index, cursor);
617
ut_ad(cursor->up_match != ULINT_UNDEFINED
618
|| mode != PAGE_CUR_GE);
619
ut_ad(cursor->up_match != ULINT_UNDEFINED
620
|| mode != PAGE_CUR_LE);
621
ut_ad(cursor->low_match != ULINT_UNDEFINED
622
|| mode != PAGE_CUR_LE);
626
if (has_search_latch) {
628
rw_lock_s_lock(&btr_search_latch);
632
/*********************************************************************
633
Opens a cursor at either end of an index. */
636
btr_cur_open_at_index_side(
637
/*=======================*/
638
ibool from_left, /* in: TRUE if open to the low end,
639
FALSE if to the high end */
640
dict_index_t* index, /* in: index */
641
ulint latch_mode, /* in: latch mode */
642
btr_cur_t* cursor, /* in: cursor */
643
mtr_t* mtr) /* in: mtr */
645
page_cur_t* page_cursor;
650
ulint root_height = 0; /* remove warning */
654
mem_heap_t* heap = NULL;
655
ulint offsets_[REC_OFFS_NORMAL_SIZE];
656
ulint* offsets = offsets_;
657
rec_offs_init(offsets_);
659
estimate = latch_mode & BTR_ESTIMATE;
660
latch_mode = latch_mode & ~BTR_ESTIMATE;
662
/* Store the position of the tree latch we push to mtr so that we
663
know how to release it when we have latched the leaf node */
665
savepoint = mtr_set_savepoint(mtr);
667
if (latch_mode == BTR_MODIFY_TREE) {
668
mtr_x_lock(dict_index_get_lock(index), mtr);
670
mtr_s_lock(dict_index_get_lock(index), mtr);
673
page_cursor = btr_cur_get_page_cur(cursor);
674
cursor->index = index;
676
space = dict_index_get_space(index);
677
zip_size = dict_table_zip_size(index->table);
678
page_no = dict_index_get_page(index);
680
height = ULINT_UNDEFINED;
685
block = buf_page_get_gen(space, zip_size, page_no,
686
RW_NO_LATCH, NULL, BUF_GET,
687
__FILE__, __LINE__, mtr);
688
page = buf_block_get_frame(block);
689
ut_ad(0 == ut_dulint_cmp(index->id,
690
btr_page_get_index_id(page)));
692
block->check_index_page_at_flush = TRUE;
694
if (height == ULINT_UNDEFINED) {
695
/* We are in the root node */
697
height = btr_page_get_level(page, mtr);
698
root_height = height;
702
btr_cur_latch_leaves(page, space, zip_size, page_no,
703
latch_mode, cursor, mtr);
705
/* In versions <= 3.23.52 we had forgotten to
706
release the tree latch here. If in an index scan
707
we had to scan far to find a record visible to the
708
current transaction, that could starve others
709
waiting for the tree latch. */
711
if ((latch_mode != BTR_MODIFY_TREE)
712
&& (latch_mode != BTR_CONT_MODIFY_TREE)) {
714
/* Release the tree s-latch */
716
mtr_release_s_latch_at_savepoint(
718
dict_index_get_lock(index));
723
page_cur_set_before_first(block, page_cursor);
725
page_cur_set_after_last(block, page_cursor);
730
btr_cur_add_path_info(cursor, height,
740
page_cur_move_to_next(page_cursor);
742
page_cur_move_to_prev(page_cursor);
746
btr_cur_add_path_info(cursor, height, root_height);
751
node_ptr = page_cur_get_rec(page_cursor);
752
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
753
ULINT_UNDEFINED, &heap);
754
/* Go to the child node */
755
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
758
if (UNIV_LIKELY_NULL(heap)) {
763
/**************************************************************************
764
Positions a cursor at a randomly chosen position within a B-tree. */
767
btr_cur_open_at_rnd_pos(
768
/*====================*/
769
dict_index_t* index, /* in: index */
770
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
771
btr_cur_t* cursor, /* in/out: B-tree cursor */
772
mtr_t* mtr) /* in: mtr */
774
page_cur_t* page_cursor;
780
mem_heap_t* heap = NULL;
781
ulint offsets_[REC_OFFS_NORMAL_SIZE];
782
ulint* offsets = offsets_;
783
rec_offs_init(offsets_);
785
if (latch_mode == BTR_MODIFY_TREE) {
786
mtr_x_lock(dict_index_get_lock(index), mtr);
788
mtr_s_lock(dict_index_get_lock(index), mtr);
791
page_cursor = btr_cur_get_page_cur(cursor);
792
cursor->index = index;
794
space = dict_index_get_space(index);
795
zip_size = dict_table_zip_size(index->table);
796
page_no = dict_index_get_page(index);
798
height = ULINT_UNDEFINED;
804
block = buf_page_get_gen(space, zip_size, page_no,
805
RW_NO_LATCH, NULL, BUF_GET,
806
__FILE__, __LINE__, mtr);
807
page = buf_block_get_frame(block);
808
ut_ad(0 == ut_dulint_cmp(index->id,
809
btr_page_get_index_id(page)));
811
if (height == ULINT_UNDEFINED) {
812
/* We are in the root node */
814
height = btr_page_get_level(page, mtr);
818
btr_cur_latch_leaves(page, space, zip_size, page_no,
819
latch_mode, cursor, mtr);
822
page_cur_open_on_rnd_user_rec(block, page_cursor);
833
node_ptr = page_cur_get_rec(page_cursor);
834
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
835
ULINT_UNDEFINED, &heap);
836
/* Go to the child node */
837
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
840
if (UNIV_LIKELY_NULL(heap)) {
845
/*==================== B-TREE INSERT =========================*/
847
/*****************************************************************
848
Inserts a record if there is enough space, or if enough space can
849
be freed by reorganizing. Differs from btr_cur_optimistic_insert because
850
no heuristics is applied to whether it pays to use CPU time for
851
reorganizing the page or not. */
854
btr_cur_insert_if_possible(
855
/*=======================*/
856
/* out: pointer to inserted record if succeed,
858
btr_cur_t* cursor, /* in: cursor on page after which to insert;
859
cursor stays valid */
860
const dtuple_t* tuple, /* in: tuple to insert; the size info need not
861
have been stored to tuple */
862
ulint n_ext, /* in: number of externally stored columns */
863
mtr_t* mtr) /* in: mtr */
865
page_cur_t* page_cursor;
869
ut_ad(dtuple_check_typed(tuple));
871
block = btr_cur_get_block(cursor);
873
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
874
page_cursor = btr_cur_get_page_cur(cursor);
876
/* Now, try the insert */
877
rec = page_cur_tuple_insert(page_cursor, tuple,
878
cursor->index, n_ext, mtr);
880
if (UNIV_UNLIKELY(!rec)) {
881
/* If record did not fit, reorganize */
883
if (btr_page_reorganize(block, cursor->index, mtr)) {
885
page_cur_search(block, cursor->index, tuple,
886
PAGE_CUR_LE, page_cursor);
888
rec = page_cur_tuple_insert(page_cursor, tuple,
889
cursor->index, n_ext, mtr);
896
/*****************************************************************
897
For an insert, checks the locks and does the undo logging if desired. */
900
btr_cur_ins_lock_and_undo(
901
/*======================*/
902
/* out: DB_SUCCESS, DB_WAIT_LOCK,
903
DB_FAIL, or error number */
904
ulint flags, /* in: undo logging and locking flags: if
905
not zero, the parameters index and thr
906
should be specified */
907
btr_cur_t* cursor, /* in: cursor on page after which to insert */
908
const dtuple_t* entry, /* in: entry to insert */
909
que_thr_t* thr, /* in: query thread or NULL */
910
ibool* inherit)/* out: TRUE if the inserted new record maybe
911
should inherit LOCK_GAP type locks from the
919
/* Check if we have to wait for a lock: enqueue an explicit lock
922
rec = btr_cur_get_rec(cursor);
923
index = cursor->index;
925
err = lock_rec_insert_check_and_lock(flags, rec,
926
btr_cur_get_block(cursor),
927
index, thr, inherit);
929
if (err != DB_SUCCESS) {
934
if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
936
err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
940
if (err != DB_SUCCESS) {
945
/* Now we can fill in the roll ptr field in entry */
947
if (!(flags & BTR_KEEP_SYS_FLAG)) {
949
row_upd_index_entry_sys_field(entry, index,
950
DATA_ROLL_PTR, roll_ptr);
958
/*****************************************************************
959
Report information about a transaction. */
964
trx_t* trx, /* in: transaction */
965
const dict_index_t* index, /* in: index */
966
const char* op) /* in: operation */
968
fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
969
TRX_ID_PREP_PRINTF(trx->id));
971
dict_index_name_print(stderr, trx, index);
974
#endif /* UNIV_DEBUG */
976
/*****************************************************************
977
Tries to perform an insert to a page in an index tree, next to cursor.
978
It is assumed that mtr holds an x-latch on the page. The operation does
979
not succeed if there is too little space on the page. If there is just
980
one record on the page, the insert will always succeed; this is to
981
prevent trying to split a page with just one record. */
984
btr_cur_optimistic_insert(
985
/*======================*/
986
/* out: DB_SUCCESS, DB_WAIT_LOCK,
987
DB_FAIL, or error number */
988
ulint flags, /* in: undo logging and locking flags: if not
989
zero, the parameters index and thr should be
991
btr_cur_t* cursor, /* in: cursor on page after which to insert;
992
cursor stays valid */
993
dtuple_t* entry, /* in/out: entry to insert */
994
rec_t** rec, /* out: pointer to inserted record if
996
big_rec_t** big_rec,/* out: big rec vector whose fields have to
997
be stored externally by the caller, or
999
ulint n_ext, /* in: number of externally stored columns */
1000
que_thr_t* thr, /* in: query thread or NULL */
1001
mtr_t* mtr) /* in: mtr; if this function returns
1002
DB_SUCCESS on a leaf page of a secondary
1003
index in a compressed tablespace, the
1004
mtr must be committed before latching
1005
any further pages */
1007
big_rec_t* big_rec_vec = NULL;
1008
dict_index_t* index;
1009
page_cur_t* page_cursor;
1019
mem_heap_t* heap = NULL;
1024
block = btr_cur_get_block(cursor);
1025
page = buf_block_get_frame(block);
1026
index = cursor->index;
1027
zip_size = buf_block_get_zip_size(block);
1028
#ifdef UNIV_DEBUG_VALGRIND
1030
UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
1031
UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
1033
#endif /* UNIV_DEBUG_VALGRIND */
1035
if (!dtuple_check_typed_no_assert(entry)) {
1036
fputs("InnoDB: Error in a tuple to insert into ", stderr);
1037
dict_index_name_print(stderr, thr_get_trx(thr), index);
1040
if (btr_cur_print_record_ops && thr) {
1041
btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
1042
dtuple_print(stderr, entry);
1044
#endif /* UNIV_DEBUG */
1046
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1047
max_size = page_get_max_insert_size_after_reorganize(page, 1);
1048
leaf = page_is_leaf(page);
1050
/* Calculate the record size when entry is converted to a record */
1051
rec_size = rec_get_converted_size(index, entry, n_ext);
1053
if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
1054
dtuple_get_n_fields(entry), zip_size)) {
1056
/* The record is so big that we have to store some fields
1057
externally on separate database pages */
1058
big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1060
if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
1062
return(DB_TOO_BIG_RECORD);
1065
rec_size = rec_get_converted_size(index, entry, n_ext);
1068
if (UNIV_UNLIKELY(zip_size)) {
1069
/* Estimate the free space of an empty compressed page.
1070
Subtract one byte for the encoded heap_no in the
1071
modification log. */
1072
ulint free_space_zip = page_zip_empty_size(
1073
cursor->index->n_fields, zip_size) - 1;
1074
ulint n_uniq = dict_index_get_n_unique_in_tree(index);
1076
ut_ad(dict_table_is_comp(index->table));
1078
/* There should be enough room for two node pointer
1079
records on an empty non-leaf page. This prevents
1080
infinite page splits. */
1082
if (UNIV_LIKELY(entry->n_fields >= n_uniq)
1083
&& UNIV_UNLIKELY(REC_NODE_PTR_SIZE
1084
+ rec_get_converted_size_comp_prefix(
1085
index, entry->fields, n_uniq,
1087
/* On a compressed page, there is
1088
a two-byte entry in the dense
1089
page directory for every record.
1090
But there is no record header. */
1091
- (REC_N_NEW_EXTRA_BYTES - 2)
1092
> free_space_zip / 2)) {
1095
dtuple_convert_back_big_rec(
1096
index, entry, big_rec_vec);
1100
mem_heap_free(heap);
1103
return(DB_TOO_BIG_RECORD);
1107
/* If there have been many consecutive inserts, and we are on the leaf
1108
level, check if we have to split the page to reserve enough free space
1109
for future updates of records. */
1111
if (dict_index_is_clust(index)
1112
&& (page_get_n_recs(page) >= 2)
1113
&& UNIV_LIKELY(leaf)
1114
&& (dict_index_get_space_reserve() + rec_size > max_size)
1115
&& (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
1116
|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
1122
dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1125
if (UNIV_LIKELY_NULL(heap)) {
1126
mem_heap_free(heap);
1132
if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
1133
|| max_size < rec_size)
1134
&& UNIV_LIKELY(page_get_n_recs(page) > 1)
1135
&& page_get_max_insert_size(page, 1) < rec_size) {
1140
/* Check locks and write to the undo log, if specified */
1141
err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &inherit);
1143
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1148
page_cursor = btr_cur_get_page_cur(cursor);
1150
/* Now, try the insert */
1153
const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
1154
*rec = page_cur_tuple_insert(page_cursor, entry, index,
1156
reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
1158
if (UNIV_UNLIKELY(reorg)) {
1164
if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
1165
/* If the record did not fit, reorganize */
1166
if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
1173
|| page_get_max_insert_size(page, 1) == max_size);
1177
page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);
1179
*rec = page_cur_tuple_insert(page_cursor, entry, index,
1182
if (UNIV_UNLIKELY(!*rec)) {
1183
if (UNIV_LIKELY(zip_size != 0)) {
1188
fputs("InnoDB: Error: cannot insert tuple ", stderr);
1189
dtuple_print(stderr, entry);
1190
fputs(" into ", stderr);
1191
dict_index_name_print(stderr, thr_get_trx(thr), index);
1192
fprintf(stderr, "\nInnoDB: max insert size %lu\n",
1198
if (UNIV_LIKELY_NULL(heap)) {
1199
mem_heap_free(heap);
1202
#ifdef BTR_CUR_HASH_ADAPT
1203
if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
1204
btr_search_update_hash_node_on_insert(cursor);
1206
btr_search_update_hash_on_insert(cursor);
1210
if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
1212
lock_update_insert(block, *rec);
1216
fprintf(stderr, "Insert into page %lu, max ins size %lu,"
1217
" rec %lu ind type %lu\n",
1218
buf_block_get_page_no(block), max_size,
1219
rec_size + PAGE_DIR_SLOT_SIZE, index->type);
1221
if (!dict_index_is_clust(index) && leaf) {
1222
/* Update the free bits of the B-tree page in the
1223
insert buffer bitmap. */
1225
/* The free bits in the insert buffer bitmap must
1226
never exceed the free space on a page. It is safe to
1227
decrement or reset the bits in the bitmap in a
1228
mini-transaction that is committed before the
1229
mini-transaction that affects the free space. */
1231
/* It is unsafe to increment the bits in a separately
1232
committed mini-transaction, because in crash recovery,
1233
the free bits could momentarily be set too high. */
1236
/* Update the bits in the same mini-transaction. */
1237
ibuf_update_free_bits_zip(block, mtr);
1239
/* Decrement the bits in a separate
1240
mini-transaction. */
1241
ibuf_update_free_bits_if_full(
1243
rec_size + PAGE_DIR_SLOT_SIZE);
1247
*big_rec = big_rec_vec;
1252
/*****************************************************************
1253
Performs an insert on a page of an index tree. It is assumed that mtr
1254
holds an x-latch on the tree and on the cursor page. If the insert is
1255
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
1256
to brothers of page, if those brothers exist. */
1259
btr_cur_pessimistic_insert(
1260
/*=======================*/
1261
/* out: DB_SUCCESS or error number */
1262
ulint flags, /* in: undo logging and locking flags: if not
1263
zero, the parameter thr should be
1264
specified; if no undo logging is specified,
1265
then the caller must have reserved enough
1266
free extents in the file space so that the
1267
insertion will certainly succeed */
1268
btr_cur_t* cursor, /* in: cursor after which to insert;
1269
cursor stays valid */
1270
dtuple_t* entry, /* in/out: entry to insert */
1271
rec_t** rec, /* out: pointer to inserted record if
1273
big_rec_t** big_rec,/* out: big rec vector whose fields have to
1274
be stored externally by the caller, or
1276
ulint n_ext, /* in: number of externally stored columns */
1277
que_thr_t* thr, /* in: query thread or NULL */
1278
mtr_t* mtr) /* in: mtr */
1280
dict_index_t* index = cursor->index;
1281
ulint zip_size = dict_table_zip_size(index->table);
1282
big_rec_t* big_rec_vec = NULL;
1283
mem_heap_t* heap = NULL;
1287
ulint n_extents = 0;
1290
ut_ad(dtuple_check_typed(entry));
1294
ut_ad(mtr_memo_contains(mtr,
1295
dict_index_get_lock(btr_cur_get_index(cursor)),
1297
ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
1298
MTR_MEMO_PAGE_X_FIX));
1300
/* Try first an optimistic insert; reset the cursor flag: we do not
1301
assume anything of how it was positioned */
1303
cursor->flag = BTR_CUR_BINARY;
1305
err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
1306
big_rec, n_ext, thr, mtr);
1307
if (err != DB_FAIL) {
1312
/* Retry with a pessimistic insert. Check locks and write to undo log,
1315
err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &dummy_inh);
1317
if (err != DB_SUCCESS) {
1322
if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
1323
/* First reserve enough free space for the file segments
1324
of the index tree, so that the insert will not fail because
1327
n_extents = cursor->tree_height / 16 + 3;
1329
success = fsp_reserve_free_extents(&n_reserved, index->space,
1330
n_extents, FSP_NORMAL, mtr);
1332
return(DB_OUT_OF_FILE_SPACE);
1336
if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
1337
dict_table_is_comp(index->table),
1338
dict_index_get_n_fields(index),
1340
/* The record is so big that we have to store some fields
1341
externally on separate database pages */
1343
if (UNIV_LIKELY_NULL(big_rec_vec)) {
1344
/* This should never happen, but we handle
1345
the situation in a robust manner. */
1347
dtuple_convert_back_big_rec(index, entry, big_rec_vec);
1350
big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
1352
if (big_rec_vec == NULL) {
1354
if (n_extents > 0) {
1355
fil_space_release_free_extents(index->space,
1358
return(DB_TOO_BIG_RECORD);
1362
if (dict_index_get_page(index)
1363
== buf_block_get_page_no(btr_cur_get_block(cursor))) {
1365
/* The page is the root page */
1366
*rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
1368
*rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
1371
if (UNIV_LIKELY_NULL(heap)) {
1372
mem_heap_free(heap);
1375
ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
1377
#ifdef BTR_CUR_ADAPT
1378
btr_search_update_hash_on_insert(cursor);
1380
if (!(flags & BTR_NO_LOCKING_FLAG)) {
1382
lock_update_insert(btr_cur_get_block(cursor), *rec);
1385
if (n_extents > 0) {
1386
fil_space_release_free_extents(index->space, n_reserved);
1389
*big_rec = big_rec_vec;
1394
/*==================== B-TREE UPDATE =========================*/
1396
/*****************************************************************
1397
For an update, checks the locks and does the undo logging. */
1400
btr_cur_upd_lock_and_undo(
1401
/*======================*/
1402
/* out: DB_SUCCESS, DB_WAIT_LOCK, or error
1404
ulint flags, /* in: undo logging and locking flags */
1405
btr_cur_t* cursor, /* in: cursor on record to update */
1406
const upd_t* update, /* in: update vector */
1407
ulint cmpl_info,/* in: compiler info on secondary index
1409
que_thr_t* thr, /* in: query thread */
1410
dulint* roll_ptr)/* out: roll pointer */
1412
dict_index_t* index;
1416
ut_ad(cursor && update && thr && roll_ptr);
1418
rec = btr_cur_get_rec(cursor);
1419
index = cursor->index;
1421
if (!dict_index_is_clust(index)) {
1422
/* We do undo logging only when we update a clustered index
1424
return(lock_sec_rec_modify_check_and_lock(
1425
flags, btr_cur_get_block(cursor), rec,
1429
/* Check if we have to wait for a lock: enqueue an explicit lock
1434
if (!(flags & BTR_NO_LOCKING_FLAG)) {
1435
mem_heap_t* heap = NULL;
1436
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1437
rec_offs_init(offsets_);
1439
err = lock_clust_rec_modify_check_and_lock(
1440
flags, btr_cur_get_block(cursor), rec, index,
1441
rec_get_offsets(rec, index, offsets_,
1442
ULINT_UNDEFINED, &heap), thr);
1443
if (UNIV_LIKELY_NULL(heap)) {
1444
mem_heap_free(heap);
1446
if (err != DB_SUCCESS) {
1452
/* Append the info about the update in the undo log */
1454
err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
1455
index, NULL, update,
1456
cmpl_info, rec, roll_ptr);
1460
/***************************************************************
1461
Writes a redo log record of updating a record in-place. */
1464
btr_cur_update_in_place_log(
1465
/*========================*/
1466
ulint flags, /* in: flags */
1467
rec_t* rec, /* in: record */
1468
dict_index_t* index, /* in: index where cursor positioned */
1469
const upd_t* update, /* in: update vector */
1470
trx_t* trx, /* in: transaction */
1471
dulint roll_ptr, /* in: roll ptr */
1472
mtr_t* mtr) /* in: mtr */
1475
page_t* page = page_align(rec);
1477
ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
1479
log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
1480
? MLOG_COMP_REC_UPDATE_IN_PLACE
1481
: MLOG_REC_UPDATE_IN_PLACE,
1482
1 + DATA_ROLL_PTR_LEN + 14 + 2
1486
/* Logging in mtr is switched off during crash recovery */
1490
/* The code below assumes index is a clustered index: change index to
1491
the clustered index if we are updating a secondary index record (or we
1492
could as well skip writing the sys col values to the log in this case
1493
because they are not needed for a secondary index record update) */
1495
index = dict_table_get_first_index(index->table);
1497
mach_write_to_1(log_ptr, flags);
1500
log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
1502
mach_write_to_2(log_ptr, page_offset(rec));
1505
row_upd_index_write_log(update, log_ptr, mtr);
1508
/***************************************************************
1509
Parses a redo log record of updating a record in-place. */
1512
btr_cur_parse_update_in_place(
1513
/*==========================*/
1514
/* out: end of log record or NULL */
1515
byte* ptr, /* in: buffer */
1516
byte* end_ptr,/* in: buffer end */
1517
page_t* page, /* in/out: page or NULL */
1518
page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */
1519
dict_index_t* index) /* in: index corresponding to page */
1531
if (end_ptr < ptr + 1) {
1536
flags = mach_read_from_1(ptr);
1539
ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
1546
if (end_ptr < ptr + 2) {
1551
rec_offset = mach_read_from_2(ptr);
1554
ut_a(rec_offset <= UNIV_PAGE_SIZE);
1556
heap = mem_heap_create(256);
1558
ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
1560
if (!ptr || !page) {
1565
ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
1566
rec = page + rec_offset;
1568
/* We do not need to reserve btr_search_latch, as the page is only
1569
being recovered, and there cannot be a hash index to it. */
1571
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
1573
if (!(flags & BTR_KEEP_SYS_FLAG)) {
1574
row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
1575
pos, trx_id, roll_ptr);
1578
row_upd_rec_in_place(rec, index, offsets, update, page_zip);
1581
mem_heap_free(heap);
1586
/*****************************************************************
1587
See if there is enough place in the page modification log to log
1588
an update-in-place. */
1591
btr_cur_update_alloc_zip(
1592
/*=====================*/
1593
/* out: TRUE if enough place */
1594
page_zip_des_t* page_zip,/* in/out: compressed page */
1595
buf_block_t* block, /* in/out: buffer page */
1596
dict_index_t* index, /* in: the index corresponding to the block */
1597
ulint length, /* in: size needed */
1598
mtr_t* mtr) /* in: mini-transaction */
1600
ut_a(page_zip == buf_block_get_page_zip(block));
1603
if (page_zip_available(page_zip, dict_index_is_clust(index),
1608
if (!page_zip->m_nonempty) {
1609
/* The page has been freshly compressed, so
1610
recompressing it will not help. */
1614
if (!page_zip_compress(page_zip, buf_block_get_frame(block),
1616
/* Unable to compress the page */
1620
/* After recompressing a page, we must make sure that the free
1621
bits in the insert buffer bitmap will not exceed the free
1622
space on the page. Because this function will not attempt
1623
recompression unless page_zip_available() fails above, it is
1624
safe to reset the free bits if page_zip_available() fails
1625
again, below. The free bits can safely be reset in a separate
1626
mini-transaction. If page_zip_available() succeeds below, we
1627
can be sure that the page_zip_compress() above did not reduce
1628
the free space available on the page. */
1630
if (!page_zip_available(page_zip, dict_index_is_clust(index),
1632
/* Out of space: reset the free bits. */
1633
if (!dict_index_is_clust(index)
1634
&& page_is_leaf(buf_block_get_frame(block))) {
1635
ibuf_reset_free_bits(block);
1643
/*****************************************************************
1644
Updates a record when the update causes no size changes in its fields.
1645
We assume here that the ordering fields of the record do not change. */
1648
btr_cur_update_in_place(
1649
/*====================*/
1650
/* out: DB_SUCCESS or error number */
1651
ulint flags, /* in: undo logging and locking flags */
1652
btr_cur_t* cursor, /* in: cursor on the record to update;
1653
cursor stays valid and positioned on the
1655
const upd_t* update, /* in: update vector */
1656
ulint cmpl_info,/* in: compiler info on secondary index
1658
que_thr_t* thr, /* in: query thread */
1659
mtr_t* mtr) /* in: mtr; must be committed before
1660
latching any further pages */
1662
dict_index_t* index;
1664
page_zip_des_t* page_zip;
1667
dulint roll_ptr = ut_dulint_zero;
1669
ulint was_delete_marked;
1670
mem_heap_t* heap = NULL;
1671
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1672
ulint* offsets = offsets_;
1673
rec_offs_init(offsets_);
1675
rec = btr_cur_get_rec(cursor);
1676
index = cursor->index;
1677
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
1678
trx = thr_get_trx(thr);
1679
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1681
if (btr_cur_print_record_ops && thr) {
1682
btr_cur_trx_report(trx, index, "update ");
1683
rec_print_new(stderr, rec, offsets);
1685
#endif /* UNIV_DEBUG */
1687
block = btr_cur_get_block(cursor);
1688
page_zip = buf_block_get_page_zip(block);
1690
/* Check that enough space is available on the compressed page. */
1691
if (UNIV_LIKELY_NULL(page_zip)
1692
&& !btr_cur_update_alloc_zip(page_zip, block, index,
1693
rec_offs_size(offsets), mtr)) {
1694
return(DB_ZIP_OVERFLOW);
1697
/* Do lock checking and undo logging */
1698
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
1700
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
1702
if (UNIV_LIKELY_NULL(heap)) {
1703
mem_heap_free(heap);
1708
if (block->is_hashed) {
1709
/* The function row_upd_changes_ord_field_binary works only
1710
if the update vector was built for a clustered index, we must
1711
NOT call it if index is secondary */
1713
if (!dict_index_is_clust(index)
1714
|| row_upd_changes_ord_field_binary(NULL, index, update)) {
1716
/* Remove possible hash index pointer to this record */
1717
btr_search_update_hash_on_delete(cursor);
1720
rw_lock_x_lock(&btr_search_latch);
1723
if (!(flags & BTR_KEEP_SYS_FLAG)) {
1724
row_upd_rec_sys_fields(rec, NULL,
1725
index, offsets, trx, roll_ptr);
1728
was_delete_marked = rec_get_deleted_flag(
1729
rec, page_is_comp(buf_block_get_frame(block)));
1731
row_upd_rec_in_place(rec, index, offsets, update, page_zip);
1733
if (block->is_hashed) {
1734
rw_lock_x_unlock(&btr_search_latch);
1737
if (page_zip && !dict_index_is_clust(index)
1738
&& page_is_leaf(buf_block_get_frame(block))) {
1739
/* Update the free bits in the insert buffer. */
1740
ibuf_update_free_bits_zip(block, mtr);
1743
btr_cur_update_in_place_log(flags, rec, index, update,
1744
trx, roll_ptr, mtr);
1746
if (was_delete_marked
1747
&& !rec_get_deleted_flag(rec, page_is_comp(
1748
buf_block_get_frame(block)))) {
1749
/* The new updated record owns its possible externally
1752
btr_cur_unmark_extern_fields(page_zip,
1753
rec, index, offsets, mtr);
1756
if (UNIV_LIKELY_NULL(heap)) {
1757
mem_heap_free(heap);
1762
/*****************************************************************
1763
Tries to update a record on a page in an index tree. It is assumed that mtr
1764
holds an x-latch on the page. The operation does not succeed if there is too
1765
little space on the page or if the update would result in too empty a page,
1766
so that tree compression is recommended. We assume here that the ordering
1767
fields of the record do not change. */
1770
btr_cur_optimistic_update(
1771
/*======================*/
1772
/* out: DB_SUCCESS, or DB_OVERFLOW if the
1773
updated record does not fit, DB_UNDERFLOW
1774
if the page would become too empty, or
1775
DB_ZIP_OVERFLOW if there is not enough
1776
space left on the compressed page */
1777
ulint flags, /* in: undo logging and locking flags */
1778
btr_cur_t* cursor, /* in: cursor on the record to update;
1779
cursor stays valid and positioned on the
1781
const upd_t* update, /* in: update vector; this must also
1782
contain trx id and roll ptr fields */
1783
ulint cmpl_info,/* in: compiler info on secondary index
1785
que_thr_t* thr, /* in: query thread */
1786
mtr_t* mtr) /* in: mtr; must be committed before
1787
latching any further pages */
1789
dict_index_t* index;
1790
page_cur_t* page_cursor;
1794
page_zip_des_t* page_zip;
1800
dtuple_t* new_entry;
1808
block = btr_cur_get_block(cursor);
1809
page = buf_block_get_frame(block);
1810
orig_rec = rec = btr_cur_get_rec(cursor);
1811
index = cursor->index;
1812
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
1813
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1815
heap = mem_heap_create(1024);
1816
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
1819
if (btr_cur_print_record_ops && thr) {
1820
btr_cur_trx_report(thr_get_trx(thr), index, "update ");
1821
rec_print_new(stderr, rec, offsets);
1823
#endif /* UNIV_DEBUG */
1825
if (!row_upd_changes_field_size_or_external(index, offsets, update)) {
1827
/* The simplest and the most common case: the update does not
1828
change the size of any field and none of the updated fields is
1829
externally stored in rec or update, and there is enough space
1830
on the compressed page to log the update. */
1832
mem_heap_free(heap);
1833
return(btr_cur_update_in_place(flags, cursor, update,
1834
cmpl_info, thr, mtr));
1837
if (rec_offs_any_extern(offsets)) {
1839
/* Externally stored fields are treated in pessimistic
1842
mem_heap_free(heap);
1843
return(DB_OVERFLOW);
1846
for (i = 0; i < upd_get_n_fields(update); i++) {
1847
if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
1853
page_cursor = btr_cur_get_page_cur(cursor);
1855
new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
1857
/* We checked above that there are no externally stored fields. */
1860
/* The page containing the clustered index record
1861
corresponding to new_entry is latched in mtr.
1862
Thus the following call is safe. */
1863
row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
1865
old_rec_size = rec_offs_size(offsets);
1866
new_rec_size = rec_get_converted_size(index, new_entry, 0);
1868
page_zip = buf_block_get_page_zip(block);
1869
#ifdef UNIV_ZIP_DEBUG
1870
ut_a(!page_zip || page_zip_validate(page_zip, page));
1871
#endif /* UNIV_ZIP_DEBUG */
1873
if (UNIV_LIKELY_NULL(page_zip)
1874
&& !btr_cur_update_alloc_zip(page_zip, block, index,
1875
new_rec_size, mtr)) {
1876
err = DB_ZIP_OVERFLOW;
1880
if (UNIV_UNLIKELY(new_rec_size
1881
>= (page_get_free_space_of_empty(page_is_comp(page))
1888
if (UNIV_UNLIKELY(page_get_data_size(page)
1889
- old_rec_size + new_rec_size
1890
< BTR_CUR_PAGE_COMPRESS_LIMIT)) {
1892
/* The page would become too empty */
1898
max_size = old_rec_size
1899
+ page_get_max_insert_size_after_reorganize(page, 1);
1901
if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
1902
&& (max_size >= new_rec_size))
1903
|| (page_get_n_recs(page) <= 1))) {
1905
/* There was not enough space, or it did not pay to
1906
reorganize: for simplicity, we decide what to do assuming a
1907
reorganization is needed, though it might not be necessary */
1913
/* Do lock checking and undo logging */
1914
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr,
1916
if (err != DB_SUCCESS) {
1918
mem_heap_free(heap);
1922
/* Ok, we may do the replacement. Store on the page infimum the
1923
explicit locks on rec, before deleting rec (see the comment in
1924
btr_cur_pessimistic_update). */
1926
lock_rec_store_on_page_infimum(block, rec);
1928
btr_search_update_hash_on_delete(cursor);
1930
/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
1931
invokes rec_offs_make_valid() to point to the copied record that
1932
the fields of new_entry point to. We have to undo it here. */
1933
ut_ad(rec_offs_validate(NULL, index, offsets));
1934
rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);
1936
page_cur_delete_rec(page_cursor, index, offsets, mtr);
1938
page_cur_move_to_prev(page_cursor);
1940
trx = thr_get_trx(thr);
1942
if (!(flags & BTR_KEEP_SYS_FLAG)) {
1943
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
1945
row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
1949
/* There are no externally stored columns in new_entry */
1950
rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
1951
ut_a(rec); /* <- We calculated above the insert would fit */
1953
if (page_zip && !dict_index_is_clust(index)
1954
&& page_is_leaf(page)) {
1955
/* Update the free bits in the insert buffer. */
1956
ibuf_update_free_bits_zip(block, mtr);
1959
/* Restore the old explicit lock state on the record */
1961
lock_rec_restore_from_page_infimum(block, rec, block);
1963
page_cur_move_to_next(page_cursor);
1965
mem_heap_free(heap);
1970
/*****************************************************************
1971
If, in a split, a new supremum record was created as the predecessor of the
1972
updated record, the supremum record must inherit exactly the locks on the
1973
updated record. In the split it may have inherited locks from the successor
1974
of the updated record, which is not correct. This function restores the
1975
right locks for the new supremum. */
1978
btr_cur_pess_upd_restore_supremum(
1979
/*==============================*/
1980
buf_block_t* block, /* in: buffer block of rec */
1981
const rec_t* rec, /* in: updated record */
1982
mtr_t* mtr) /* in: mtr */
1985
buf_block_t* prev_block;
1990
page = buf_block_get_frame(block);
1992
if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
1993
/* Updated record is not the first user record on its page */
1998
space = buf_block_get_space(block);
1999
zip_size = buf_block_get_zip_size(block);
2000
prev_page_no = btr_page_get_prev(page, mtr);
2002
ut_ad(prev_page_no != FIL_NULL);
2003
prev_block = buf_page_get_with_no_latch(space, zip_size,
2005
#ifdef UNIV_BTR_DEBUG
2006
ut_a(btr_page_get_next(prev_block->frame, mtr)
2007
== page_get_page_no(page));
2008
#endif /* UNIV_BTR_DEBUG */
2010
/* We must already have an x-latch on prev_block! */
2011
ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
2013
lock_rec_reset_and_inherit_gap_locks(prev_block, block,
2014
PAGE_HEAP_NO_SUPREMUM,
2015
page_rec_get_heap_no(rec));
2018
/*****************************************************************
2019
Performs an update of a record on a page of a tree. It is assumed
2020
that mtr holds an x-latch on the tree and on the cursor page. If the
2021
update is made on the leaf level, to avoid deadlocks, mtr must also
2022
own x-latches to brothers of page, if those brothers exist. We assume
2023
here that the ordering fields of the record do not change. */
2026
btr_cur_pessimistic_update(
2027
/*=======================*/
2028
/* out: DB_SUCCESS or error code */
2029
ulint flags, /* in: undo logging, locking, and rollback
2031
btr_cur_t* cursor, /* in: cursor on the record to update */
2032
mem_heap_t** heap, /* in/out: pointer to memory heap, or NULL */
2033
big_rec_t** big_rec,/* out: big rec vector whose fields have to
2034
be stored externally by the caller, or NULL */
2035
const upd_t* update, /* in: update vector; this is allowed also
2036
contain trx id and roll ptr fields, but
2037
the values in update vector have no effect */
2038
ulint cmpl_info,/* in: compiler info on secondary index
2040
que_thr_t* thr, /* in: query thread */
2041
mtr_t* mtr) /* in: mtr; must be committed before
2042
latching any further pages */
2044
big_rec_t* big_rec_vec = NULL;
2045
big_rec_t* dummy_big_rec;
2046
dict_index_t* index;
2049
page_zip_des_t* page_zip;
2051
page_cur_t* page_cursor;
2052
dtuple_t* new_entry;
2058
ulint n_extents = 0;
2061
ulint* offsets = NULL;
2065
block = btr_cur_get_block(cursor);
2066
page = buf_block_get_frame(block);
2067
page_zip = buf_block_get_page_zip(block);
2068
rec = btr_cur_get_rec(cursor);
2069
index = cursor->index;
2071
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2073
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2074
#ifdef UNIV_ZIP_DEBUG
2075
ut_a(!page_zip || page_zip_validate(page_zip, page));
2076
#endif /* UNIV_ZIP_DEBUG */
2078
optim_err = btr_cur_optimistic_update(flags, cursor, update,
2079
cmpl_info, thr, mtr);
2081
switch (optim_err) {
2084
case DB_ZIP_OVERFLOW:
2090
/* Do lock checking and undo logging */
2091
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
2093
if (err != DB_SUCCESS) {
2098
if (optim_err == DB_OVERFLOW) {
2101
/* First reserve enough free space for the file segments
2102
of the index tree, so that the update will not fail because
2105
n_extents = cursor->tree_height / 16 + 3;
2107
if (flags & BTR_NO_UNDO_LOG_FLAG) {
2108
reserve_flag = FSP_CLEANING;
2110
reserve_flag = FSP_NORMAL;
2113
if (!fsp_reserve_free_extents(&n_reserved, index->space,
2114
n_extents, reserve_flag, mtr)) {
2115
return(DB_OUT_OF_FILE_SPACE);
2120
*heap = mem_heap_create(1024);
2122
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
2124
trx = thr_get_trx(thr);
2126
new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
2128
/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
2129
invokes rec_offs_make_valid() to point to the copied record that
2130
the fields of new_entry point to. We have to undo it here. */
2131
ut_ad(rec_offs_validate(NULL, index, offsets));
2132
rec_offs_make_valid(rec, index, offsets);
2134
/* The page containing the clustered index record
2135
corresponding to new_entry is latched in mtr. If the
2136
clustered index record is delete-marked, then its externally
2137
stored fields cannot have been purged yet, because then the
2138
purge would also have removed the clustered index record
2139
itself. Thus the following call is safe. */
2140
row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
2142
if (!(flags & BTR_KEEP_SYS_FLAG)) {
2143
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
2145
row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
2149
if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
2150
/* We are in a transaction rollback undoing a row
2151
update: we must free possible externally stored fields
2152
which got new values in the update, if they are not
2153
inherited values. They can be inherited if we have
2154
updated the primary key to another value, and then
2155
update it back again. */
2157
ut_ad(big_rec_vec == NULL);
2159
btr_rec_free_updated_extern_fields(
2160
index, rec, page_zip, offsets, update,
2161
trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
2164
/* We have to set appropriate extern storage bits in the new
2165
record to be inserted: we have to remember which fields were such */
2167
ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
2168
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
2169
n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
2171
if (UNIV_LIKELY_NULL(page_zip)) {
2172
ut_ad(page_is_comp(page));
2173
if (page_zip_rec_needs_ext(
2174
rec_get_converted_size(index, new_entry, n_ext),
2176
dict_index_get_n_fields(index),
2177
page_zip_get_size(page_zip))) {
2181
} else if (page_zip_rec_needs_ext(
2182
rec_get_converted_size(index, new_entry, n_ext),
2183
page_is_comp(page), 0, 0)) {
2185
big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
2186
if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
2188
err = DB_TOO_BIG_RECORD;
2189
goto return_after_reservations;
2193
/* Store state of explicit locks on rec on the page infimum record,
2194
before deleting rec. The page infimum acts as a dummy carrier of the
2195
locks, taking care also of lock releases, before we can move the locks
2196
back on the actual record. There is a special case: if we are
2197
inserting on the root page and the insert causes a call of
2198
btr_root_raise_and_insert. Therefore we cannot in the lock system
2199
delete the lock structs set on the root page even if the root
2200
page carries just node pointers. */
2202
lock_rec_store_on_page_infimum(block, rec);
2204
btr_search_update_hash_on_delete(cursor);
2206
#ifdef UNIV_ZIP_DEBUG
2207
ut_a(!page_zip || page_zip_validate(page_zip, page));
2208
#endif /* UNIV_ZIP_DEBUG */
2209
page_cursor = btr_cur_get_page_cur(cursor);
2211
page_cur_delete_rec(page_cursor, index, offsets, mtr);
2213
page_cur_move_to_prev(page_cursor);
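/* The page cursor now points to the record preceding the deleted
one, so btr_cur_insert_if_possible() below tries to place the
updated entry at the position that the old version occupied. */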
2215
rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
2218
lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2221
offsets = rec_get_offsets(rec, index, offsets,
2222
ULINT_UNDEFINED, heap);
2224
if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
2225
/* The new inserted record owns its possible externally
stored fields */
btr_cur_unmark_extern_fields(page_zip,
2228
rec, index, offsets, mtr);
2231
btr_cur_compress_if_useful(cursor, mtr);
2233
if (page_zip && !dict_index_is_clust(index)
2234
&& page_is_leaf(page)) {
2235
/* Update the free bits in the insert buffer. */
2236
ibuf_update_free_bits_zip(block, mtr);
2240
goto return_after_reservations;
2242
ut_a(optim_err != DB_UNDERFLOW);
2244
/* Out of space: reset the free bits. */
2245
if (!dict_index_is_clust(index)
2246
&& page_is_leaf(page)) {
2247
ibuf_reset_free_bits(block);
2251
/* Was the record to be updated positioned as the first user
2252
record on its page? */
2253
was_first = page_cur_is_before_first(page_cursor);
2255
/* The first parameter means that no lock checking and undo logging
2256
is made in the insert */
2258
err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
2259
| BTR_NO_LOCKING_FLAG
2260
| BTR_KEEP_SYS_FLAG,
2261
cursor, new_entry, &rec,
2262
&dummy_big_rec, n_ext, NULL, mtr);
2264
ut_a(err == DB_SUCCESS);
2265
ut_a(dummy_big_rec == NULL);
2267
if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
2268
/* The new inserted record owns its possible externally
stored fields */
buf_block_t* rec_block = btr_cur_get_block(cursor);
2272
#ifdef UNIV_ZIP_DEBUG
2273
ut_a(!page_zip || page_zip_validate(page_zip, page));
2274
page = buf_block_get_frame(rec_block);
2275
#endif /* UNIV_ZIP_DEBUG */
2276
page_zip = buf_block_get_page_zip(rec_block);
2278
offsets = rec_get_offsets(rec, index, offsets,
2279
ULINT_UNDEFINED, heap);
2280
btr_cur_unmark_extern_fields(page_zip,
2281
rec, index, offsets, mtr);
2284
lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
2287
/* If necessary, restore also the correct lock state for a new,
2288
preceding supremum record created in a page split. While the old
2289
record was nonexistent, the supremum might have inherited its locks
2290
from a wrong record. */
2293
btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
2297
return_after_reservations:
2298
#ifdef UNIV_ZIP_DEBUG
2299
ut_a(!page_zip || page_zip_validate(page_zip, page));
2300
#endif /* UNIV_ZIP_DEBUG */
2302
if (n_extents > 0) {
2303
fil_space_release_free_extents(index->space, n_reserved);
2306
*big_rec = big_rec_vec;
2311
/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
2313
/********************************************************************
2314
Writes the redo log record for delete marking or unmarking of an index
2318
btr_cur_del_mark_set_clust_rec_log(
2319
/*===============================*/
2320
ulint flags, /* in: flags */
2321
rec_t* rec, /* in: record */
2322
dict_index_t* index, /* in: index of the record */
2323
ibool val, /* in: value to set */
2324
trx_t* trx, /* in: deleting transaction */
2325
dulint roll_ptr,/* in: roll ptr to the undo log record */
2326
mtr_t* mtr) /* in: mtr */
2332
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2334
log_ptr = mlog_open_and_write_index(mtr, rec, index,
2335
page_rec_is_comp(rec)
2336
? MLOG_COMP_REC_CLUST_DELETE_MARK
2337
: MLOG_REC_CLUST_DELETE_MARK,
2338
1 + 1 + DATA_ROLL_PTR_LEN
2342
/* Logging in mtr is switched off during crash recovery */
2346
mach_write_to_1(log_ptr, flags);
2348
mach_write_to_1(log_ptr, val);
2351
log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
2353
mach_write_to_2(log_ptr, page_offset(rec));
2356
mlog_close(mtr, log_ptr);
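/* The log record body written above is thus: the flags byte, the
value byte, the system field values (field position, transaction id
and roll pointer) written by row_upd_write_sys_vals_to_log(), and
the 2-byte page offset of rec; btr_cur_parse_del_mark_set_clust_rec()
below reads them back in the same order. */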
2359
/********************************************************************
2360
Parses the redo log record for delete marking or unmarking of a clustered
2364
btr_cur_parse_del_mark_set_clust_rec(
2365
/*=================================*/
2366
/* out: end of log record or NULL */
2367
byte* ptr, /* in: buffer */
2368
byte* end_ptr,/* in: buffer end */
2369
page_t* page, /* in/out: page or NULL */
2370
page_zip_des_t* page_zip,/* in/out: compressed page, or NULL */
2371
dict_index_t* index) /* in: index corresponding to page */
2382
|| !!page_is_comp(page) == dict_table_is_comp(index->table));
2384
if (end_ptr < ptr + 2) {
2389
flags = mach_read_from_1(ptr);
2391
val = mach_read_from_1(ptr);
2394
ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
2401
if (end_ptr < ptr + 2) {
2406
offset = mach_read_from_2(ptr);
2409
ut_a(offset <= UNIV_PAGE_SIZE);
2412
rec = page + offset;
2414
/* We do not need to reserve btr_search_latch, as the page
is only being recovered, and there cannot be a hash index to
it. */

btr_rec_set_deleted_flag(rec, page_zip, val);
2420
if (!(flags & BTR_KEEP_SYS_FLAG)) {
2421
mem_heap_t* heap = NULL;
2422
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2423
rec_offs_init(offsets_);
2425
row_upd_rec_sys_fields_in_recovery(
2427
rec_get_offsets(rec, index, offsets_,
2428
ULINT_UNDEFINED, &heap),
2429
pos, trx_id, roll_ptr);
2430
if (UNIV_LIKELY_NULL(heap)) {
2431
mem_heap_free(heap);
2439
/***************************************************************
2440
Marks a clustered index record deleted. Writes an undo log record to
2441
undo log on this delete marking. Writes in the trx id field the id
2442
of the deleting transaction, and in the roll ptr field pointer to the
2443
undo log record created. */
2446
btr_cur_del_mark_set_clust_rec(
2447
/*===========================*/
2448
/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
number */
ulint flags, /* in: undo logging and locking flags */
2451
btr_cur_t* cursor, /* in: cursor */
2452
ibool val, /* in: value to set */
2453
que_thr_t* thr, /* in: query thread */
2454
mtr_t* mtr) /* in: mtr */
2456
dict_index_t* index;
2461
page_zip_des_t* page_zip;
2463
mem_heap_t* heap = NULL;
2464
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2465
ulint* offsets = offsets_;
2466
rec_offs_init(offsets_);
2468
rec = btr_cur_get_rec(cursor);
2469
index = cursor->index;
2470
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
2471
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
2474
if (btr_cur_print_record_ops && thr) {
2475
btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
2476
rec_print_new(stderr, rec, offsets);
2478
#endif /* UNIV_DEBUG */
2480
ut_ad(dict_index_is_clust(index));
2481
ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
2483
err = lock_clust_rec_modify_check_and_lock(flags,
2484
btr_cur_get_block(cursor),
2485
rec, index, offsets, thr);
2487
if (err != DB_SUCCESS) {
2492
err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
2493
index, NULL, NULL, 0, rec,
2495
if (err != DB_SUCCESS) {
2500
block = btr_cur_get_block(cursor);
2502
if (block->is_hashed) {
2503
rw_lock_x_lock(&btr_search_latch);
2506
page_zip = buf_block_get_page_zip(block);
2508
btr_rec_set_deleted_flag(rec, page_zip, val);
2510
trx = thr_get_trx(thr);
2512
if (!(flags & BTR_KEEP_SYS_FLAG)) {
2513
row_upd_rec_sys_fields(rec, page_zip,
2514
index, offsets, trx, roll_ptr);
2517
if (block->is_hashed) {
2518
rw_lock_x_unlock(&btr_search_latch);
2521
btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
2525
if (UNIV_LIKELY_NULL(heap)) {
2526
mem_heap_free(heap);
2531
/********************************************************************
2532
Writes the redo log record for a delete mark setting of a secondary
2536
btr_cur_del_mark_set_sec_rec_log(
2537
/*=============================*/
2538
rec_t* rec, /* in: record */
2539
ibool val, /* in: value to set */
2540
mtr_t* mtr) /* in: mtr */
2545
log_ptr = mlog_open(mtr, 11 + 1 + 2);
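/* 11 bytes is the maximum size of the initial log record header
written by mlog_write_initial_log_record_fast() (the type byte plus
the compressed space id and page number), 1 byte is for the
delete-mark value and 2 bytes for the page offset of rec. */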
2548
/* Logging in mtr is switched off during crash recovery:
2549
in that case mlog_open returns NULL */
2553
log_ptr = mlog_write_initial_log_record_fast(
2554
rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
2555
mach_write_to_1(log_ptr, val);
2558
mach_write_to_2(log_ptr, page_offset(rec));
2561
mlog_close(mtr, log_ptr);
2564
/********************************************************************
2565
Parses the redo log record for delete marking or unmarking of a secondary
2569
btr_cur_parse_del_mark_set_sec_rec(
2570
/*===============================*/
2571
/* out: end of log record or NULL */
2572
byte* ptr, /* in: buffer */
2573
byte* end_ptr,/* in: buffer end */
2574
page_t* page, /* in/out: page or NULL */
2575
page_zip_des_t* page_zip)/* in/out: compressed page, or NULL */
2581
if (end_ptr < ptr + 3) {
2586
val = mach_read_from_1(ptr);
2589
offset = mach_read_from_2(ptr);
2592
ut_a(offset <= UNIV_PAGE_SIZE);
2595
rec = page + offset;
2597
/* We do not need to reserve btr_search_latch, as the page
is only being recovered, and there cannot be a hash index to
it. */

btr_rec_set_deleted_flag(rec, page_zip, val);
2607
/***************************************************************
2608
Sets a secondary index record delete mark to TRUE or FALSE. */
2611
btr_cur_del_mark_set_sec_rec(
2612
/*=========================*/
2613
/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
number */
ulint flags, /* in: locking flag */
2616
btr_cur_t* cursor, /* in: cursor */
2617
ibool val, /* in: value to set */
2618
que_thr_t* thr, /* in: query thread */
2619
mtr_t* mtr) /* in: mtr */
2625
block = btr_cur_get_block(cursor);
2626
rec = btr_cur_get_rec(cursor);
2629
if (btr_cur_print_record_ops && thr) {
2630
btr_cur_trx_report(thr_get_trx(thr), cursor->index,
2632
rec_print(stderr, rec, cursor->index);
2634
#endif /* UNIV_DEBUG */
2636
err = lock_sec_rec_modify_check_and_lock(flags,
2637
btr_cur_get_block(cursor),
2638
rec, cursor->index, thr);
2639
if (err != DB_SUCCESS) {
2644
ut_ad(!!page_rec_is_comp(rec)
2645
== dict_table_is_comp(cursor->index->table));
2647
if (block->is_hashed) {
2648
rw_lock_x_lock(&btr_search_latch);
2651
btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
2653
if (block->is_hashed) {
2654
rw_lock_x_unlock(&btr_search_latch);
2657
btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
2662
/***************************************************************
2663
Clear a secondary index record's delete mark. This function is only
2664
used by the insert buffer insert merge mechanism. */
2667
btr_cur_del_unmark_for_ibuf(
2668
/*========================*/
2669
rec_t* rec, /* in/out: record to delete unmark */
2670
page_zip_des_t* page_zip, /* in/out: compressed page
2671
corresponding to rec, or NULL
2672
when the tablespace is
2674
mtr_t* mtr) /* in: mtr */
2676
/* We do not need to reserve btr_search_latch, as the page has just
2677
been read to the buffer pool and there cannot be a hash index to it. */
2679
btr_rec_set_deleted_flag(rec, page_zip, FALSE);
2681
btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
2684
/*==================== B-TREE RECORD REMOVE =========================*/
2686
/*****************************************************************
2687
Tries to compress a page of the tree if it seems useful. It is assumed
2688
that mtr holds an x-latch on the tree and on the cursor page. To avoid
2689
deadlocks, mtr must also own x-latches to brothers of page, if those
2690
brothers exist. NOTE: it is assumed that the caller has reserved enough
2691
free extents so that the compression will always succeed if done! */
2694
btr_cur_compress_if_useful(
2695
/*=======================*/
2696
/* out: TRUE if compression occurred */
2697
btr_cur_t* cursor, /* in: cursor on the page to compress;
2698
cursor does not stay valid if compression
2700
mtr_t* mtr) /* in: mtr */
2702
ut_ad(mtr_memo_contains(mtr,
2703
dict_index_get_lock(btr_cur_get_index(cursor)),
2705
ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
2706
MTR_MEMO_PAGE_X_FIX));
2708
return(btr_cur_compress_recommendation(cursor, mtr)
2709
&& btr_compress(cursor, mtr));
2712
/***********************************************************
2713
Removes the record on which the tree cursor is positioned on a leaf page.
2714
It is assumed that the mtr has an x-latch on the page where the cursor is
2715
positioned, but no latch on the whole tree. */
2718
btr_cur_optimistic_delete(
2719
/*======================*/
2720
/* out: TRUE if success, i.e., the page
2721
did not become too empty */
2722
btr_cur_t* cursor, /* in: cursor on leaf page, on the record to
2723
delete; cursor stays valid: if deletion
2724
succeeds, on function exit it points to the
2725
successor of the deleted record */
2726
mtr_t* mtr) /* in: mtr */
2730
mem_heap_t* heap = NULL;
2731
ulint offsets_[REC_OFFS_NORMAL_SIZE];
2732
ulint* offsets = offsets_;
2733
ibool no_compress_needed;
2734
rec_offs_init(offsets_);
2736
ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
2737
MTR_MEMO_PAGE_X_FIX));
2738
/* This is intended only for leaf page deletions */
2740
block = btr_cur_get_block(cursor);
2742
ut_ad(page_is_leaf(buf_block_get_frame(block)));
2744
rec = btr_cur_get_rec(cursor);
2745
offsets = rec_get_offsets(rec, cursor->index, offsets,
2746
ULINT_UNDEFINED, &heap);
2748
no_compress_needed = !rec_offs_any_extern(offsets)
2749
&& btr_cur_can_delete_without_compress(
2750
cursor, rec_offs_size(offsets), mtr);
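/* The record may be removed in place only when it has no externally
stored columns (those are freed by the pessimistic delete path) and
when removing rec_offs_size(offsets) bytes does not make the page so
empty that it should be merged with a neighbour. */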
2752
if (no_compress_needed) {
2754
page_t* page = buf_block_get_frame(block);
2755
page_zip_des_t* page_zip= buf_block_get_page_zip(block);
2758
lock_update_delete(block, rec);
2760
btr_search_update_hash_on_delete(cursor);
2763
max_ins = page_get_max_insert_size_after_reorganize(
2766
#ifdef UNIV_ZIP_DEBUG
2767
ut_a(!page_zip || page_zip_validate(page_zip, page));
2768
#endif /* UNIV_ZIP_DEBUG */
2769
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
2770
cursor->index, offsets, mtr);
2771
#ifdef UNIV_ZIP_DEBUG
2772
ut_a(!page_zip || page_zip_validate(page_zip, page));
2773
#endif /* UNIV_ZIP_DEBUG */
2775
if (dict_index_is_clust(cursor->index)
2776
|| !page_is_leaf(page)) {
2777
/* The insert buffer does not handle
2778
inserts to clustered indexes or to non-leaf
2779
pages of secondary index B-trees. */
2780
} else if (page_zip) {
ibuf_update_free_bits_zip(block, mtr);
} else {
ibuf_update_free_bits_low(block, max_ins, mtr);
}

if (UNIV_LIKELY_NULL(heap)) {
2788
mem_heap_free(heap);
2791
return(no_compress_needed);
2794
/*****************************************************************
2795
Removes the record on which the tree cursor is positioned. Tries
2796
to compress the page if its fillfactor drops below a threshold
2797
or if it is the only page on the level. It is assumed that mtr holds
2798
an x-latch on the tree and on the cursor page. To avoid deadlocks,
2799
mtr must also own x-latches to brothers of page, if those brothers
2803
btr_cur_pessimistic_delete(
2804
/*=======================*/
2805
/* out: TRUE if compression occurred */
2806
ulint* err, /* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
2807
the latter may occur because we may have
2808
to update node pointers on upper levels,
2809
and in the case of variable length keys
2810
these may actually grow in size */
2811
ibool has_reserved_extents, /* in: TRUE if the
2812
caller has already reserved enough free
2813
extents so that he knows that the operation
2815
btr_cur_t* cursor, /* in: cursor on the record to delete;
2816
if compression does not occur, the cursor
2817
stays valid: it points to successor of
2818
deleted record on function exit */
2819
enum trx_rb_ctx rb_ctx, /* in: rollback context */
2820
mtr_t* mtr) /* in: mtr */
2824
page_zip_des_t* page_zip;
2825
dict_index_t* index;
2828
ulint n_extents = 0;
2836
block = btr_cur_get_block(cursor);
2837
page = buf_block_get_frame(block);
2838
index = btr_cur_get_index(cursor);
2840
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
2842
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2843
if (!has_reserved_extents) {
2844
/* First reserve enough free space for the file segments
2845
of the index tree, so that the node pointer updates will
2846
not fail because of lack of space */
2848
n_extents = cursor->tree_height / 32 + 1;
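/* For example, a tree of height 3 reserves 3 / 32 + 1 == 1 extent
here; a smaller reservation than the tree_height / 16 + 3 used for a
pessimistic update above. */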
2850
success = fsp_reserve_free_extents(&n_reserved,
2855
*err = DB_OUT_OF_FILE_SPACE;
2861
heap = mem_heap_create(1024);
2862
rec = btr_cur_get_rec(cursor);
2863
page_zip = buf_block_get_page_zip(block);
2864
#ifdef UNIV_ZIP_DEBUG
2865
ut_a(!page_zip || page_zip_validate(page_zip, page));
2866
#endif /* UNIV_ZIP_DEBUG */
2868
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
2870
if (rec_offs_any_extern(offsets)) {
2871
btr_rec_free_externally_stored_fields(index,
2872
rec, offsets, page_zip,
2874
#ifdef UNIV_ZIP_DEBUG
2875
ut_a(!page_zip || page_zip_validate(page_zip, page));
2876
#endif /* UNIV_ZIP_DEBUG */
2879
if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
2880
&& UNIV_UNLIKELY(dict_index_get_page(index)
2881
!= buf_block_get_page_no(block))) {
2883
/* If there is only one record, drop the whole page in
2884
btr_discard_page, if this is not the root page */
2886
btr_discard_page(cursor, mtr);
2891
goto return_after_reservations;
2894
lock_update_delete(block, rec);
2895
level = btr_page_get_level(page, mtr);
2898
&& UNIV_UNLIKELY(rec == page_rec_get_next(
2899
page_get_infimum_rec(page)))) {
2901
rec_t* next_rec = page_rec_get_next(rec);
2903
if (btr_page_get_prev(page, mtr) == FIL_NULL) {
2905
/* If we delete the leftmost node pointer on a
2906
non-leaf level, we must mark the new leftmost node
2907
pointer as the predefined minimum record */
2909
/* This will make page_zip_validate() fail until
2910
page_cur_delete_rec() completes. This is harmless,
2911
because everything will take place within a single
2912
mini-transaction and because writing to the redo log
2913
is an atomic operation (performed by mtr_commit()). */
2914
btr_set_min_rec_mark(next_rec, mtr);
2916
/* Otherwise, if we delete the leftmost node pointer
2917
on a page, we have to change the father node pointer
2918
so that it is equal to the new leftmost node pointer
2921
btr_node_ptr_delete(index, block, mtr);
2923
node_ptr = dict_index_build_node_ptr(
2924
index, next_rec, buf_block_get_page_no(block),
2927
btr_insert_on_non_leaf_level(index,
2928
level + 1, node_ptr, mtr);
2932
btr_search_update_hash_on_delete(cursor);
2934
page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
2935
#ifdef UNIV_ZIP_DEBUG
2936
ut_a(!page_zip || page_zip_validate(page_zip, page));
2937
#endif /* UNIV_ZIP_DEBUG */
2939
ut_ad(btr_check_node_ptr(index, block, mtr));
2943
return_after_reservations:
2944
mem_heap_free(heap);
2947
ret = btr_cur_compress_if_useful(cursor, mtr);
2950
if (n_extents > 0) {
2951
fil_space_release_free_extents(index->space, n_reserved);
2957
/***********************************************************************
2958
Adds path information to the cursor for the current page, for which
2959
the binary search has been performed. */
2962
btr_cur_add_path_info(
2963
/*==================*/
2964
btr_cur_t* cursor, /* in: cursor positioned on a page */
2965
ulint height, /* in: height of the page in tree;
2966
0 means leaf node */
2967
ulint root_height) /* in: root node height in tree */
2972
ut_a(cursor->path_arr);
2974
if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
2975
/* Do nothing; return empty path */
2977
slot = cursor->path_arr;
2978
slot->nth_rec = ULINT_UNDEFINED;
2984
/* Mark end of slots for path */
2985
slot = cursor->path_arr + root_height + 1;
2986
slot->nth_rec = ULINT_UNDEFINED;
2989
rec = btr_cur_get_rec(cursor);
2991
slot = cursor->path_arr + (root_height - height);
2993
slot->nth_rec = page_rec_get_n_recs_before(rec);
2994
slot->n_recs = page_get_n_recs(page_align(rec));
2997
/***********************************************************************
2998
Estimates the number of rows in a given index range. */
3001
btr_estimate_n_rows_in_range(
3002
/*=========================*/
3003
/* out: estimated number of rows */
3004
dict_index_t* index, /* in: index */
3005
const dtuple_t* tuple1, /* in: range start, may also be empty tuple */
3006
ulint mode1, /* in: search mode for range start */
3007
const dtuple_t* tuple2, /* in: range end, may also be empty tuple */
3008
ulint mode2) /* in: search mode for range end */
3010
btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
3011
btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
3017
ulint divergence_level;
3024
cursor.path_arr = path1;
3026
if (dtuple_get_n_fields(tuple1) > 0) {
3028
btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
3029
BTR_SEARCH_LEAF | BTR_ESTIMATE,
3032
btr_cur_open_at_index_side(TRUE, index,
3033
BTR_SEARCH_LEAF | BTR_ESTIMATE,
3041
cursor.path_arr = path2;
3043
if (dtuple_get_n_fields(tuple2) > 0) {
3045
btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
3046
BTR_SEARCH_LEAF | BTR_ESTIMATE,
3049
btr_cur_open_at_index_side(FALSE, index,
3050
BTR_SEARCH_LEAF | BTR_ESTIMATE,
3056
/* We have the path information for the range in path1 and path2 */
3059
diverged = FALSE; /* This becomes true when the path is not
3060
the same any more */
3061
diverged_lot = FALSE; /* This becomes true when the paths are
3062
not the same or adjacent any more */
3063
divergence_level = 1000000; /* This is the level where paths diverged
3065
for (i = 0; ; i++) {
3066
ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
3071
if (slot1->nth_rec == ULINT_UNDEFINED
3072
|| slot2->nth_rec == ULINT_UNDEFINED) {
3074
if (i > divergence_level + 1) {
/* In trees whose height is > 1 our algorithm
tends to underestimate: multiply the estimate
by 2: */

n_rows = n_rows * 2;
}

/* Do not estimate the number of rows in the range
to over 1 / 2 of the estimated rows in the whole
table */

if (n_rows > index->table->stat_n_rows / 2) {
3087
n_rows = index->table->stat_n_rows / 2;
3089
/* If there are just 0 or 1 rows in the table,
3090
then we estimate all rows are in the range */
3093
n_rows = index->table->stat_n_rows;
3100
if (!diverged && slot1->nth_rec != slot2->nth_rec) {
3104
if (slot1->nth_rec < slot2->nth_rec) {
3105
n_rows = slot2->nth_rec - slot1->nth_rec;
3108
diverged_lot = TRUE;
3109
divergence_level = i;
3112
/* Maybe the tree has changed between
3118
} else if (diverged && !diverged_lot) {
3120
if (slot1->nth_rec < slot1->n_recs
3121
|| slot2->nth_rec > 1) {
3123
diverged_lot = TRUE;
3124
divergence_level = i;
3128
if (slot1->nth_rec < slot1->n_recs) {
3129
n_rows += slot1->n_recs
3133
if (slot2->nth_rec > 1) {
3134
n_rows += slot2->nth_rec - 1;
3137
} else if (diverged_lot) {

n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
/ 2;
}
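/* Roughly speaking, once the two search paths have diverged a lot,
every page between them on the next level down is assumed to be as
full as the average of the two border pages, so the running estimate
is multiplied by (slot1->n_recs + slot2->n_recs) / 2 at each level. */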
3145
/***********************************************************************
3146
Estimates the number of different key values in a given index, for
3147
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
3148
The estimates are stored in the array index->stat_n_diff_key_vals. */
3151
btr_estimate_number_of_different_key_vals(
3152
/*======================================*/
3153
dict_index_t* index) /* in: index */
3159
ulint matched_fields;
3160
ulint matched_bytes;
3162
ullint n_sample_pages; /* number of pages to sample */
3163
ulint not_empty_flag = 0;
3164
ulint total_external_size = 0;
3169
mem_heap_t* heap = NULL;
3170
ulint offsets_rec_[REC_OFFS_NORMAL_SIZE];
3171
ulint offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
3172
ulint* offsets_rec = offsets_rec_;
3173
ulint* offsets_next_rec= offsets_next_rec_;
3174
rec_offs_init(offsets_rec_);
3175
rec_offs_init(offsets_next_rec_);
3177
n_cols = dict_index_get_n_unique(index);
3179
n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));
3181
/* It makes no sense to test more pages than are contained
3182
in the index, thus we lower the number if it is too high */
3183
if (srv_stats_sample_pages > index->stat_index_size) {
3184
if (index->stat_index_size > 0) {
3185
n_sample_pages = index->stat_index_size;
3190
n_sample_pages = srv_stats_sample_pages;
3193
/* We sample some pages in the index to get an estimate */
3195
for (i = 0; i < n_sample_pages; i++) {
3199
btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);
3201
/* Count the number of different key values for each prefix of
3202
the key on this index page. If the prefix does not determine
3203
the index record uniquely in the B-tree, then we subtract one
3204
because otherwise our algorithm would give a wrong estimate
3205
for an index where there is just one key value. */
3207
page = btr_cur_get_page(&cursor);
3209
supremum = page_get_supremum_rec(page);
3210
rec = page_rec_get_next(page_get_infimum_rec(page));
3212
if (rec != supremum) {
3214
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3215
ULINT_UNDEFINED, &heap);
3218
while (rec != supremum) {
3219
rec_t* next_rec = page_rec_get_next(rec);
3220
if (next_rec == supremum) {
3226
offsets_next_rec = rec_get_offsets(next_rec, index,
3230
cmp_rec_rec_with_match(rec, next_rec,
3231
offsets_rec, offsets_next_rec,
3232
index, &matched_fields,
3235
for (j = matched_fields + 1; j <= n_cols; j++) {
3236
/* We add one if this index record has
3237
a different prefix from the previous */
3243
+= btr_rec_get_externally_stored_len(
3247
/* Initialize offsets_rec for the next round
3248
and assign the old offsets_rec buffer to
3249
offsets_next_rec. */
3251
ulint* offsets_tmp = offsets_rec;
3252
offsets_rec = offsets_next_rec;
3253
offsets_next_rec = offsets_tmp;
3258
if (n_cols == dict_index_get_n_unique_in_tree(index)) {
3260
/* If there is more than one leaf page in the tree,
3261
we add one because we know that the first record
3262
on the page certainly had a different prefix than the
3263
last record on the previous index page in the
3264
alphabetical order. Before this fix, if there was
3265
just one big record on each clustered index page, the
3266
algorithm grossly underestimated the number of rows
3269
if (btr_page_get_prev(page, &mtr) != FIL_NULL
3270
|| btr_page_get_next(page, &mtr) != FIL_NULL) {
3276
offsets_rec = rec_get_offsets(rec, index, offsets_rec,
3277
ULINT_UNDEFINED, &heap);
3278
total_external_size += btr_rec_get_externally_stored_len(
3283
/* If we saw k borders between different key values on
3284
n_sample_pages leaf pages, we can estimate how many
3285
there will be in index->stat_n_leaf_pages */
3287
/* We must take into account that our sample actually represents
3288
also the pages used for external storage of fields (those pages are
3289
included in index->stat_n_leaf_pages) */
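/* In other words, the estimate computed below scales the sampled
border count to the whole index: roughly n_diff[j]
* index->stat_n_leaf_pages / (n_sample_pages + total_external_size),
rounded up rather than down. */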
3291
for (j = 0; j <= n_cols; j++) {
3292
index->stat_n_diff_key_vals[j]
3294
* (ib_int64_t)index->stat_n_leaf_pages
3295
+ n_sample_pages - 1
3296
+ total_external_size
3299
+ total_external_size));
3301
/* If the tree is small, smaller than
3302
10 * n_sample_pages + total_external_size, then
3303
the above estimate is ok. For bigger trees it is common that we
3304
do not see any borders between key values in the few pages
3305
we pick. But still there may be n_sample_pages
3306
different key values, or even more. Let us try to approximate
3309
add_on = index->stat_n_leaf_pages
3310
/ (10 * (n_sample_pages
3311
+ total_external_size));
3313
if (add_on > n_sample_pages) {
3314
add_on = n_sample_pages;
3317
index->stat_n_diff_key_vals[j] += add_on;
3321
if (UNIV_LIKELY_NULL(heap)) {
3322
mem_heap_free(heap);
3326
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
3328
/***************************************************************
3329
Gets the externally stored size of a record, in units of a database page. */
3332
btr_rec_get_externally_stored_len(
3333
/*==============================*/
3334
/* out: externally stored part,
3335
in units of a database page */
3336
rec_t* rec, /* in: record */
3337
const ulint* offsets)/* in: array returned by rec_get_offsets() */
3343
ulint total_extern_len = 0;
3346
ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3347
n_fields = rec_offs_n_fields(offsets);
3349
for (i = 0; i < n_fields; i++) {
3350
if (rec_offs_nth_extern(offsets, i)) {
3352
data = rec_get_nth_field(rec, offsets, i, &local_len);
3354
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
3356
extern_len = mach_read_from_4(data + local_len
3357
+ BTR_EXTERN_LEN + 4);
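/* The length of the externally stored part occupies 8 bytes at
BTR_EXTERN_LEN in the field reference, but only the low 4 bytes (at
BTR_EXTERN_LEN + 4) carry the length read here; the high bytes also
hold the ownership and inheritance flag bits. */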
3359
total_extern_len += ut_calc_align(extern_len,
UNIV_PAGE_SIZE);
}
}

return(total_extern_len / UNIV_PAGE_SIZE);
3367
/***********************************************************************
3368
Sets the ownership bit of an externally stored field in a record. */
3371
btr_cur_set_ownership_of_extern_field(
3372
/*==================================*/
3373
page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
3374
part will be updated, or NULL */
3375
rec_t* rec, /* in/out: clustered index record */
3376
dict_index_t* index, /* in: index of the page */
3377
const ulint* offsets,/* in: array returned by rec_get_offsets() */
3378
ulint i, /* in: field number */
3379
ibool val, /* in: value to set */
3380
mtr_t* mtr) /* in: mtr, or NULL if not logged */
3386
data = rec_get_nth_field(rec, offsets, i, &local_len);
3388
ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
3390
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
3392
byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

if (val) {
byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
} else {
byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
}

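/* Note that a set BTR_EXTERN_OWNER_FLAG bit means that the record
does NOT own the externally stored field; marking the field as owned
(val == TRUE) therefore clears the bit. */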
if (UNIV_LIKELY_NULL(page_zip)) {
3401
mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
3402
page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
3403
} else if (UNIV_LIKELY(mtr != NULL)) {
3405
mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
3408
mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
3412
/***********************************************************************
3413
Marks not updated extern fields as not-owned by this record. The ownership
3414
is transferred to the updated record which is inserted elsewhere in the
3415
index tree. In purge only the owner of externally stored field is allowed
3416
to free the field. */
3419
btr_cur_mark_extern_inherited_fields(
3420
/*=================================*/
3421
page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
3422
part will be updated, or NULL */
3423
rec_t* rec, /* in/out: record in a clustered index */
3424
dict_index_t* index, /* in: index of the page */
3425
const ulint* offsets,/* in: array returned by rec_get_offsets() */
3426
const upd_t* update, /* in: update vector */
3427
mtr_t* mtr) /* in: mtr, or NULL if not logged */
3433
ut_ad(rec_offs_validate(rec, NULL, offsets));
3434
ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3436
if (!rec_offs_any_extern(offsets)) {
3441
n = rec_offs_n_fields(offsets);
3443
for (i = 0; i < n; i++) {
3444
if (rec_offs_nth_extern(offsets, i)) {
3446
/* Check it is not in updated fields */
3449
for (j = 0; j < upd_get_n_fields(update);
3451
if (upd_get_nth_field(update, j)
3459
btr_cur_set_ownership_of_extern_field(
3460
page_zip, rec, index, offsets, i, FALSE, mtr);
3467
/***********************************************************************
3468
The complement of the previous function: in an update entry may inherit
3469
some externally stored fields from a record. We must mark them as inherited
3470
in entry, so that they are not freed in a rollback. */
3473
btr_cur_mark_dtuple_inherited_extern(
3474
/*=================================*/
3475
dtuple_t* entry, /* in/out: updated entry to be
3476
inserted to clustered index */
3477
const upd_t* update) /* in: update vector */
3481
for (i = 0; i < dtuple_get_n_fields(entry); i++) {
3483
dfield_t* dfield = dtuple_get_nth_field(entry, i);
3488
if (!dfield_is_ext(dfield)) {
3492
/* Check if it is in updated fields */
3494
for (j = 0; j < upd_get_n_fields(update); j++) {
3495
if (upd_get_nth_field(update, j)->field_no == i) {
3501
data = dfield_get_data(dfield);
3502
len = dfield_get_len(dfield);
3503
data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
3504
|= BTR_EXTERN_INHERITED_FLAG;
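/* Setting BTR_EXTERN_INHERITED_FLAG in the BLOB pointer of the entry
prevents a rollback of this update from freeing a field that is still
referenced by the old version of the record. */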
3511
/***********************************************************************
3512
Marks all extern fields in a record as owned by the record. This function
3513
should be called if the delete mark of a record is removed: a not delete
3514
marked record always owns all its extern fields. */
3517
btr_cur_unmark_extern_fields(
3518
/*=========================*/
3519
page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
3520
part will be updated, or NULL */
3521
rec_t* rec, /* in/out: record in a clustered index */
3522
dict_index_t* index, /* in: index of the page */
3523
const ulint* offsets,/* in: array returned by rec_get_offsets() */
3524
mtr_t* mtr) /* in: mtr, or NULL if not logged */
3529
ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
3530
n = rec_offs_n_fields(offsets);
3532
if (!rec_offs_any_extern(offsets)) {
3537
for (i = 0; i < n; i++) {
3538
if (rec_offs_nth_extern(offsets, i)) {
3540
btr_cur_set_ownership_of_extern_field(
3541
page_zip, rec, index, offsets, i, TRUE, mtr);
3546
/***********************************************************************
3547
Marks all extern fields in a dtuple as owned by the record. */
3550
btr_cur_unmark_dtuple_extern_fields(
3551
/*================================*/
3552
dtuple_t* entry) /* in/out: clustered index entry */
3556
for (i = 0; i < dtuple_get_n_fields(entry); i++) {
3557
dfield_t* dfield = dtuple_get_nth_field(entry, i);
3559
if (dfield_is_ext(dfield)) {
3560
byte* data = dfield_get_data(dfield);
3561
ulint len = dfield_get_len(dfield);
3563
data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
3564
&= ~BTR_EXTERN_OWNER_FLAG;
3569
/***********************************************************************
3570
Flags the data tuple fields that are marked as extern storage in the
3571
update vector. We use this function to remember which fields we must
3572
mark as extern storage in a record inserted for an update. */
3575
btr_push_update_extern_fields(
3576
/*==========================*/
3577
/* out: number of flagged external columns */
3578
dtuple_t* tuple, /* in/out: data tuple */
3579
const upd_t* update, /* in: update vector */
3580
mem_heap_t* heap) /* in: memory heap */
3584
const upd_field_t* uf;
3589
uf = update->fields;
3590
n = upd_get_n_fields(update);
3593
if (dfield_is_ext(&uf->new_val)) {
3595
= dtuple_get_nth_field(tuple, uf->field_no);
3597
if (!dfield_is_ext(field)) {
3598
dfield_set_ext(field);
3602
switch (uf->orig_len) {
3608
case BTR_EXTERN_FIELD_REF_SIZE:
3609
/* Restore the original locally stored
3610
part of the column. In the undo log,
3611
InnoDB writes a longer prefix of externally
3612
stored columns, so that column prefixes
3613
in secondary indexes can be reconstructed. */
3614
dfield_set_data(field, (byte*) dfield_get_data(field)
3615
+ dfield_get_len(field)
3616
- BTR_EXTERN_FIELD_REF_SIZE,
3617
BTR_EXTERN_FIELD_REF_SIZE);
3618
dfield_set_ext(field);
3621
/* Reconstruct the original locally
3622
stored part of the column. The data
3623
will have to be copied. */
3624
ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
3626
data = dfield_get_data(field);
3627
len = dfield_get_len(field);
3629
buf = mem_heap_alloc(heap, uf->orig_len);
3630
/* Copy the locally stored prefix. */
3633
- BTR_EXTERN_FIELD_REF_SIZE);
3634
/* Copy the BLOB pointer. */
3635
memcpy(buf + uf->orig_len
3636
- BTR_EXTERN_FIELD_REF_SIZE,
3637
data + len - BTR_EXTERN_FIELD_REF_SIZE,
3638
BTR_EXTERN_FIELD_REF_SIZE);
3640
dfield_set_data(field, buf, uf->orig_len);
3641
dfield_set_ext(field);
3649
/***********************************************************************
3650
Returns the length of a BLOB part stored on the header page. */
3653
btr_blob_get_part_len(
3654
/*==================*/
3655
/* out: part length */
3656
const byte* blob_header) /* in: blob header */
3658
return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
3661
/***********************************************************************
3662
Returns the page number where the next BLOB part is stored. */
3665
btr_blob_get_next_page_no(
3666
/*======================*/
3667
/* out: page number or FIL_NULL if
3669
const byte* blob_header) /* in: blob header */
3671
return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
3674
/***********************************************************************
3675
Deallocate a buffer block that was reserved for a BLOB part. */
3680
buf_block_t* block, /* in: buffer block */
3681
ibool all, /* in: TRUE=remove also the compressed page
3683
mtr_t* mtr) /* in: mini-transaction to commit */
3685
ulint space = buf_block_get_space(block);
3686
ulint page_no = buf_block_get_page_no(block);
3688
ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3692
buf_pool_mutex_enter();
3693
mutex_enter(&block->mutex);
3695
/* Only free the block if it is still allocated to
3696
the same file page. */
3698
if (buf_block_get_state(block)
3699
== BUF_BLOCK_FILE_PAGE
3700
&& buf_block_get_space(block) == space
3701
&& buf_block_get_page_no(block) == page_no) {
3703
if (buf_LRU_free_block(&block->page, all, NULL)
3705
&& all && block->page.zip.data) {
3706
/* Attempt to deallocate the uncompressed page
if the whole block cannot be deallocated. */
3709
buf_LRU_free_block(&block->page, FALSE, NULL);
3713
buf_pool_mutex_exit();
3714
mutex_exit(&block->mutex);
3717
/***********************************************************************
3718
Stores the fields in big_rec_vec to the tablespace and puts pointers to
3719
them in rec. The extern flags in rec will have to be set beforehand.
3720
The fields are stored on pages allocated from leaf node
3721
file segment of the index tree. */
3724
btr_store_big_rec_extern_fields(
3725
/*============================*/
3726
/* out: DB_SUCCESS or error */
3727
dict_index_t* index, /* in: index of rec; the index tree
3728
MUST be X-latched */
3729
buf_block_t* rec_block, /* in/out: block containing rec */
3730
rec_t* rec, /* in/out: record */
3731
const ulint* offsets, /* in: rec_get_offsets(rec, index);
3732
the "external storage" flags in offsets
3733
will not correspond to rec when
3734
this function returns */
3735
big_rec_t* big_rec_vec, /* in: vector containing fields
3736
to be stored externally */
3737
mtr_t* local_mtr __attribute__((unused))) /* in: mtr
3738
containing the latch to rec and to the
3752
mem_heap_t* heap = NULL;
3753
page_zip_des_t* page_zip;
3756
ut_ad(rec_offs_validate(rec, index, offsets));
3757
ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
3759
ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
3760
ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
3761
ut_a(dict_index_is_clust(index));
3763
page_zip = buf_block_get_page_zip(rec_block);
3764
ut_a(dict_table_zip_size(index->table)
3765
== buf_block_get_zip_size(rec_block));
3767
space_id = buf_block_get_space(rec_block);
3768
zip_size = buf_block_get_zip_size(rec_block);
3769
rec_page_no = buf_block_get_page_no(rec_block);
3770
ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
3772
if (UNIV_LIKELY_NULL(page_zip)) {
3775
/* Zlib deflate needs 128 kilobytes for the default
3776
window size, plus 512 << memLevel, plus a few
3777
kilobytes for small objects. We use reduced memLevel
3778
to limit the memory consumption, and preallocate the
3779
heap, hoping to avoid memory fragmentation. */
3780
heap = mem_heap_create(250000);
3781
page_zip_set_alloc(&c_stream, heap);
3783
err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
3784
Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
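/* windowBits 15 is the zlib default (a 32 kilobyte window); memLevel
7, one below the default of 8, is the reduced value referred to in
the comment above (it halves the 512 << memLevel hash memory). */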
3788
/* We have to create a file segment to the tablespace
3789
for each field and put the pointer to the field in rec */
3791
for (i = 0; i < big_rec_vec->n_fields; i++) {
3792
ut_ad(rec_offs_nth_extern(offsets,
3793
big_rec_vec->fields[i].field_no));
3796
field_ref = rec_get_nth_field(
3797
rec, offsets, big_rec_vec->fields[i].field_no,
3799
ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
3800
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
3801
field_ref += local_len;
3803
extern_len = big_rec_vec->fields[i].len;
3805
ut_a(extern_len > 0);
3807
prev_page_no = FIL_NULL;
3809
if (UNIV_LIKELY_NULL(page_zip)) {
3810
int err = deflateReset(&c_stream);
3813
c_stream.next_in = (void*) big_rec_vec->fields[i].data;
3814
c_stream.avail_in = extern_len;
3823
if (prev_page_no == FIL_NULL) {
hint_page_no = 1 + rec_page_no;
} else {
hint_page_no = prev_page_no + 1;
}

block = btr_page_alloc(index, hint_page_no,
3830
FSP_NO_DIR, 0, &mtr);
3831
if (UNIV_UNLIKELY(block == NULL)) {
3835
if (UNIV_LIKELY_NULL(page_zip)) {
3836
deflateEnd(&c_stream);
3837
mem_heap_free(heap);
3840
return(DB_OUT_OF_FILE_SPACE);
3843
page_no = buf_block_get_page_no(block);
3844
page = buf_block_get_frame(block);
3846
if (prev_page_no != FIL_NULL) {
3847
buf_block_t* prev_block;
3850
prev_block = buf_page_get(space_id, zip_size,
3853
buf_block_dbg_add_level(prev_block,
3854
SYNC_EXTERN_STORAGE);
3855
prev_page = buf_block_get_frame(prev_block);
3857
if (UNIV_LIKELY_NULL(page_zip)) {
3859
prev_page + FIL_PAGE_NEXT,
3860
page_no, MLOG_4BYTES, &mtr);
3861
memcpy(buf_block_get_page_zip(
3863
->data + FIL_PAGE_NEXT,
3864
prev_page + FIL_PAGE_NEXT, 4);
3867
prev_page + FIL_PAGE_DATA
3868
+ BTR_BLOB_HDR_NEXT_PAGE_NO,
3869
page_no, MLOG_4BYTES, &mtr);
3874
if (UNIV_LIKELY_NULL(page_zip)) {
3876
page_zip_des_t* blob_page_zip;
3878
mach_write_to_2(page + FIL_PAGE_TYPE,
3879
prev_page_no == FIL_NULL
3880
? FIL_PAGE_TYPE_ZBLOB
3881
: FIL_PAGE_TYPE_ZBLOB2);
3883
c_stream.next_out = page
3886
= page_zip_get_size(page_zip)
3889
err = deflate(&c_stream, Z_FINISH);
3890
ut_a(err == Z_OK || err == Z_STREAM_END);
3891
ut_a(err == Z_STREAM_END
3892
|| c_stream.avail_out == 0);
3894
/* Write the "next BLOB page" pointer */
3895
mlog_write_ulint(page + FIL_PAGE_NEXT,
3896
FIL_NULL, MLOG_4BYTES, &mtr);
3897
/* Initialize the unused "prev page" pointer */
3898
mlog_write_ulint(page + FIL_PAGE_PREV,
3899
FIL_NULL, MLOG_4BYTES, &mtr);
3900
/* Write a back pointer to the record
3901
into the otherwise unused area. This
3902
information could be useful in
3903
debugging. Later, we might want to
3904
implement the possibility to relocate
3905
BLOB pages. Then, we would need to be
3906
able to adjust the BLOB pointer in the
3907
record. We do not store the heap
3908
number of the record, because it can
3909
change in page_zip_reorganize() or
3910
btr_page_reorganize(). However, also
3911
the page number of the record may
3912
change when B-tree nodes are split or
3914
mlog_write_ulint(page
3915
+ FIL_PAGE_FILE_FLUSH_LSN,
3918
mlog_write_ulint(page
3919
+ FIL_PAGE_FILE_FLUSH_LSN + 4,
3923
/* Zero out the unused part of the page. */
3924
memset(page + page_zip_get_size(page_zip)
3925
- c_stream.avail_out,
3926
0, c_stream.avail_out);
3927
mlog_log_string(page + FIL_PAGE_TYPE,
3928
page_zip_get_size(page_zip)
3931
/* Copy the page to compressed storage,
3932
because it will be flushed to disk
3934
blob_page_zip = buf_block_get_page_zip(block);
3935
ut_ad(blob_page_zip);
3936
ut_ad(page_zip_get_size(blob_page_zip)
3937
== page_zip_get_size(page_zip));
3938
memcpy(blob_page_zip->data, page,
3939
page_zip_get_size(page_zip));
3941
if (err == Z_OK && prev_page_no != FIL_NULL) {
3946
rec_block = buf_page_get(space_id, zip_size,
3949
buf_block_dbg_add_level(rec_block,
3950
SYNC_NO_ORDER_CHECK);
3952
if (err == Z_STREAM_END) {
3953
mach_write_to_4(field_ref
3954
+ BTR_EXTERN_LEN, 0);
3955
mach_write_to_4(field_ref
3956
+ BTR_EXTERN_LEN + 4,
3959
memset(field_ref + BTR_EXTERN_LEN,
3963
if (prev_page_no == FIL_NULL) {
3964
mach_write_to_4(field_ref
3965
+ BTR_EXTERN_SPACE_ID,
3968
mach_write_to_4(field_ref
3969
+ BTR_EXTERN_PAGE_NO,
3972
mach_write_to_4(field_ref
3973
+ BTR_EXTERN_OFFSET,
3977
page_zip_write_blob_ptr(
3978
page_zip, rec, index, offsets,
3979
big_rec_vec->fields[i].field_no, &mtr);
3982
prev_page_no = page_no;
3984
/* Commit mtr and release the
3985
uncompressed page frame to save memory. */
3986
btr_blob_free(block, FALSE, &mtr);
3988
if (err == Z_STREAM_END) {
3992
mlog_write_ulint(page + FIL_PAGE_TYPE,
3996
if (extern_len > (UNIV_PAGE_SIZE
3999
- FIL_PAGE_DATA_END)) {
4000
store_len = UNIV_PAGE_SIZE
4003
- FIL_PAGE_DATA_END;
4005
store_len = extern_len;
4008
mlog_write_string(page + FIL_PAGE_DATA
4009
+ BTR_BLOB_HDR_SIZE,
4011
big_rec_vec->fields[i].data
4012
+ big_rec_vec->fields[i].len
4015
mlog_write_ulint(page + FIL_PAGE_DATA
4016
+ BTR_BLOB_HDR_PART_LEN,
4017
store_len, MLOG_4BYTES, &mtr);
4018
mlog_write_ulint(page + FIL_PAGE_DATA
4019
+ BTR_BLOB_HDR_NEXT_PAGE_NO,
4020
FIL_NULL, MLOG_4BYTES, &mtr);
4022
extern_len -= store_len;
4024
rec_block = buf_page_get(space_id, zip_size,
4027
buf_block_dbg_add_level(rec_block,
4028
SYNC_NO_ORDER_CHECK);
4030
mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
4032
mlog_write_ulint(field_ref
4033
+ BTR_EXTERN_LEN + 4,
4034
big_rec_vec->fields[i].len
4038
if (prev_page_no == FIL_NULL) {
4039
mlog_write_ulint(field_ref
4040
+ BTR_EXTERN_SPACE_ID,
4044
mlog_write_ulint(field_ref
4045
+ BTR_EXTERN_PAGE_NO,
4049
mlog_write_ulint(field_ref
4050
+ BTR_EXTERN_OFFSET,
4055
prev_page_no = page_no;
4059
if (extern_len == 0) {
4066
if (UNIV_LIKELY_NULL(page_zip)) {
4067
deflateEnd(&c_stream);
4068
mem_heap_free(heap);
4074
/***********************************************************************
4075
Frees the space in an externally stored field to the file space
4076
management if the field in data is owned by the externally stored field,
4077
in a rollback we may have the additional condition that the field must
4078
not be inherited. */
4081
btr_free_externally_stored_field(
4082
/*=============================*/
4083
dict_index_t* index, /* in: index of the data, the index
4084
tree MUST be X-latched; if the tree
4085
height is 1, then also the root page
4086
must be X-latched! (this is relevant
4087
in the case this function is called
4088
from purge where 'data' is located on
4089
an undo log page, not an index
4091
byte* field_ref, /* in/out: field reference */
4092
const rec_t* rec, /* in: record containing field_ref, for
4093
page_zip_write_blob_ptr(), or NULL */
4094
const ulint* offsets, /* in: rec_get_offsets(rec, index),
4096
page_zip_des_t* page_zip, /* in: compressed page corresponding
4097
to rec, or NULL if rec == NULL */
4098
ulint i, /* in: field number of field_ref;
4099
ignored if rec == NULL */
4100
enum trx_rb_ctx rb_ctx, /* in: rollback context */
4101
mtr_t* local_mtr __attribute__((unused))) /* in: mtr
4102
containing the latch to data and an
X-latch to the index tree */
4107
ulint rec_zip_size = dict_table_zip_size(index->table);
4113
ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
4115
ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
4116
MTR_MEMO_PAGE_X_FIX));
4117
ut_ad(!rec || rec_offs_validate(rec, index, offsets));
4121
const byte* f = rec_get_nth_field(rec, offsets,
4123
ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
4124
local_len -= BTR_EXTERN_FIELD_REF_SIZE;
4126
ut_ad(f == field_ref);
4128
#endif /* UNIV_DEBUG */
4130
if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
4131
BTR_EXTERN_FIELD_REF_SIZE))) {
4132
/* In the rollback of uncommitted transactions, we may
4133
encounter a clustered index record whose BLOBs have
4134
not been written. There is nothing to free then. */
4135
ut_a(rb_ctx == RB_RECOVERY);
4139
space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
4141
if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
4142
ext_zip_size = fil_space_get_zip_size(space_id);
4143
/* This must be an undo log record in the system tablespace,
4144
that is, in row_purge_upd_exist_or_extern().
4145
Currently, externally stored records are stored in the
4146
same tablespace as the referring records. */
4147
ut_ad(!page_get_space_id(page_align(field_ref)));
4151
ext_zip_size = rec_zip_size;
4155
/* This is a call from row_purge_upd_exist_or_extern(). */
4161
buf_block_t* rec_block;
4162
buf_block_t* ext_block;
4166
rec_block = buf_page_get(page_get_space_id(
4167
page_align(field_ref)),
4170
page_align(field_ref)),
4172
buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
4173
page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
4175
if (/* There is no external storage data */
page_no == FIL_NULL
/* This field does not own the externally stored field */
4178
|| (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
4179
& BTR_EXTERN_OWNER_FLAG)
4180
/* Rollback and inherited field */
4181
|| (rb_ctx != RB_NONE
4182
&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
4183
& BTR_EXTERN_INHERITED_FLAG))) {
4191
ext_block = buf_page_get(space_id, ext_zip_size, page_no,
4193
buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
4194
page = buf_block_get_frame(ext_block);
4197
/* Note that page_zip will be NULL
4198
in row_purge_upd_exist_or_extern(). */
4199
switch (fil_page_get_type(page)) {
4200
case FIL_PAGE_TYPE_ZBLOB:
4201
case FIL_PAGE_TYPE_ZBLOB2:
4206
next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
4208
btr_page_free_low(index, ext_block, 0, &mtr);
4210
if (UNIV_LIKELY(page_zip != NULL)) {
4211
mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
4213
mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
4215
page_zip_write_blob_ptr(page_zip, rec, index,
4218
mlog_write_ulint(field_ref
4219
+ BTR_EXTERN_PAGE_NO,
4222
mlog_write_ulint(field_ref
4223
+ BTR_EXTERN_LEN + 4, 0,
4227
ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_BLOB);
4230
next_page_no = mach_read_from_4(
4231
page + FIL_PAGE_DATA
4232
+ BTR_BLOB_HDR_NEXT_PAGE_NO);
4234
/* We must supply the page level (= 0) as an argument
because we did not store it on the page (we save the
space overhead from an index page header). */

4238
ut_a(space_id == page_get_space_id(page));
4239
ut_a(page_no == page_get_page_no(page));
4241
btr_page_free_low(index, ext_block, 0, &mtr);
4243
mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
4246
/* Zero out the BLOB length. If the server
4247
crashes during the execution of this function,
4248
trx_rollback_or_clean_all_recovered() could
4249
dereference the half-deleted BLOB, fetching a
4250
wrong prefix for the BLOB. */
4251
mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
4256
/* Commit mtr and release the BLOB block to save memory. */
4257
btr_blob_free(ext_block, TRUE, &mtr);
4261
/***************************************************************
4262
Frees the externally stored fields for a record. */
4265
btr_rec_free_externally_stored_fields(
4266
/*==================================*/
4267
dict_index_t* index, /* in: index of the data, the index
4268
tree MUST be X-latched */
4269
rec_t* rec, /* in/out: record */
4270
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
4271
page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
4272
part will be updated, or NULL */
4273
enum trx_rb_ctx rb_ctx, /* in: rollback context */
4274
mtr_t* mtr) /* in: mini-transaction handle which contains
4275
an X-latch to record page and to the index
4281
ut_ad(rec_offs_validate(rec, index, offsets));
4282
ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
4283
/* Free possible externally stored fields in the record */
4285
ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
4286
n_fields = rec_offs_n_fields(offsets);
4288
for (i = 0; i < n_fields; i++) {
4289
if (rec_offs_nth_extern(offsets, i)) {
4292
= rec_get_nth_field(rec, offsets, i, &len);
4293
ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
4295
btr_free_externally_stored_field(
4296
index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
4297
rec, offsets, page_zip, i, rb_ctx, mtr);
4302
/***************************************************************
4303
Frees the externally stored fields for a record, if the field is mentioned
4304
in the update vector. */
4307
btr_rec_free_updated_extern_fields(
4308
/*===============================*/
4309
dict_index_t* index, /* in: index of rec; the index tree MUST be
4311
rec_t* rec, /* in/out: record */
4312
page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
4313
part will be updated, or NULL */
4314
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
4315
const upd_t* update, /* in: update vector */
4316
enum trx_rb_ctx rb_ctx, /* in: rollback context */
4317
mtr_t* mtr) /* in: mini-transaction handle which contains
4318
an X-latch to record page and to the tree */
4323
ut_ad(rec_offs_validate(rec, index, offsets));
4324
ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
4326
/* Free possible externally stored fields in the record */
4328
n_fields = upd_get_n_fields(update);
4330
for (i = 0; i < n_fields; i++) {
4331
const upd_field_t* ufield = upd_get_nth_field(update, i);
4333
if (rec_offs_nth_extern(offsets, ufield->field_no)) {
4335
byte* data = rec_get_nth_field(
4336
rec, offsets, ufield->field_no, &len);
4337
ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
4339
btr_free_externally_stored_field(
4340
index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
4341
rec, offsets, page_zip,
4342
ufield->field_no, rb_ctx, mtr);
4347
/***********************************************************************
4348
Copies the prefix of an uncompressed BLOB. The clustered index record
4349
that points to this BLOB must be protected by a lock or a page latch. */
4352
btr_copy_blob_prefix(
4353
/*=================*/
4354
/* out: number of bytes written to buf */
4355
byte* buf, /* out: the externally stored part of
4356
the field, or a prefix of it */
4357
ulint len, /* in: length of buf, in bytes */
4358
ulint space_id,/* in: space id of the BLOB pages */
4359
ulint page_no,/* in: page number of the first BLOB page */
4360
ulint offset) /* in: offset on the first BLOB page */
4362
ulint copied_len = 0;
4368
const byte* blob_header;
4374
block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
4375
buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
4376
page = buf_block_get_frame(block);
4378
/* Unfortunately, FIL_PAGE_TYPE was uninitialized for
4379
many pages until MySQL/InnoDB 5.1.7. */
4380
/* ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_BLOB); */
4381
blob_header = page + offset;
4382
part_len = btr_blob_get_part_len(blob_header);
4383
copy_len = ut_min(part_len, len - copied_len);
4385
memcpy(buf + copied_len,
4386
blob_header + BTR_BLOB_HDR_SIZE, copy_len);
4387
copied_len += copy_len;
4389
page_no = btr_blob_get_next_page_no(blob_header);
4393
if (page_no == FIL_NULL || copy_len != part_len) {
4397
/* On other BLOB pages except the first the BLOB header
4398
always is at the page data start: */
4400
offset = FIL_PAGE_DATA;
4402
ut_ad(copied_len <= len);
/***********************************************************************
Copies the prefix of a compressed BLOB.  The clustered index record
that points to this BLOB must be protected by a lock or a page latch. */
btr_copy_zblob_prefix(
/*==================*/
	z_stream*	d_stream,/* in/out: the decompressing stream */
	ulint		zip_size,/* in: compressed BLOB page size */
	ulint		space_id,/* in: space id of the BLOB pages */
	ulint		page_no,/* in: page number of the first BLOB page */
	ulint		offset)	/* in: offset on the first BLOB page */
	ulint	page_type = FIL_PAGE_TYPE_ZBLOB;

	ut_ad(ut_is_2pow(zip_size));
	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
	ut_ad(zip_size <= UNIV_PAGE_SIZE);

	/* There is no latch on bpage directly.  Instead,
	bpage is protected by the B-tree page latch that
	is being held on the clustered index record, or,
	in row_merge_copy_blobs(), by an exclusive table lock. */
	bpage = buf_page_get_zip(space_id, zip_size, page_no);

	if (UNIV_UNLIKELY(!bpage)) {
		ut_print_timestamp(stderr);
		" InnoDB: Cannot load"
		" page %lu space %lu\n",
		(ulong) page_no, (ulong) space_id);

	(fil_page_get_type(bpage->zip.data) != page_type)) {
		ut_print_timestamp(stderr);
		" InnoDB: Unexpected type %lu of"
		" page %lu space %lu\n",
		(ulong) fil_page_get_type(bpage->zip.data),
		(ulong) page_no, (ulong) space_id);

	next_page_no = mach_read_from_4(bpage->zip.data + offset);

	if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
		/* When the BLOB begins at the page header,
		the compressed data payload does not
		immediately follow the next page pointer. */
		offset = FIL_PAGE_DATA;

	d_stream->next_in = bpage->zip.data + offset;
	d_stream->avail_in = zip_size - offset;

	err = inflate(d_stream, Z_NO_FLUSH);

	if (!d_stream->avail_out) {

	if (next_page_no == FIL_NULL) {

		ut_print_timestamp(stderr);
		" InnoDB: inflate() of"
		" page %lu space %lu returned %d (%s)\n",
		(ulong) page_no, (ulong) space_id,
		err, d_stream->msg);

	if (next_page_no == FIL_NULL) {
		if (!d_stream->avail_in) {
			ut_print_timestamp(stderr);
			" InnoDB: unexpected end of"
			" page %lu space %lu\n",

	err = inflate(d_stream, Z_FINISH);

	buf_page_release_zip(bpage);

	buf_page_release_zip(bpage);

	/* On BLOB pages other than the first, the BLOB header is
	always in the page header: */

	page_no = next_page_no;
	offset = FIL_PAGE_NEXT;
	page_type = FIL_PAGE_TYPE_ZBLOB2;
/***********************************************************************
Copies the prefix of an externally stored field of a record.  The
clustered index record that points to this BLOB must be protected by a
lock or a page latch. */
btr_copy_externally_stored_field_prefix_low(
/*========================================*/
				/* out: number of bytes written to buf */
	byte*	buf,		/* out: the externally stored part of
				the field, or a prefix of it */
	ulint	len,		/* in: length of buf, in bytes */
	ulint	zip_size,	/* in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint	space_id,	/* in: space id of the first BLOB page */
	ulint	page_no,	/* in: page number of the first BLOB page */
	ulint	offset)		/* in: offset on the first BLOB page */
	if (UNIV_UNLIKELY(len == 0)) {

	if (UNIV_UNLIKELY(zip_size)) {

		/* Zlib inflate needs 32 kilobytes for the default
		window size, plus a few kilobytes for small objects. */
		heap = mem_heap_create(40000);
		page_zip_set_alloc(&d_stream, heap);

		err = inflateInit(&d_stream);

		d_stream.next_out = buf;
		d_stream.avail_out = len;
		d_stream.avail_in = 0;

		btr_copy_zblob_prefix(&d_stream, zip_size,
				      space_id, page_no, offset);
		inflateEnd(&d_stream);
		mem_heap_free(heap);
		return(d_stream.total_out);

	return(btr_copy_blob_prefix(buf, len, space_id,
/***********************************************************************
Copies the prefix of an externally stored field of a record.  The
clustered index record must be protected by a lock or a page latch. */
btr_copy_externally_stored_field_prefix(
/*====================================*/
				/* out: the length of the copied field,
				or 0 if the column was being or has been
				deleted */
	byte*		buf,	/* out: the field, or a prefix of it */
	ulint		len,	/* in: length of buf, in bytes */
	ulint		zip_size,/* in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	const byte*	data,	/* in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		local_len)/* in: length of data, in bytes */
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);

	memcpy(buf, data, local_len);

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted.  Signal the half-deleted BLOB

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
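
/* Illustrative sketch, not part of the original source: decoding the
20-byte (BTR_EXTERN_FIELD_REF_SIZE) external field reference that
btr_copy_externally_stored_field_prefix() above and
btr_copy_externally_stored_field() below read with mach_read_from_4().
The reference holds the space id, the page number and the offset of the
first BLOB page, and a length field of which only the low 4 bytes are
used, since a BLOB currently cannot exceed 4 GB.  The helper name is
hypothetical and the block is compiled out. */
#if 0
static
void
btr_extern_ref_decode_example(
/*==========================*/
	const byte*	field_ref,	/* in: the BTR_EXTERN_FIELD_REF_SIZE
					bytes at the end of the locally
					stored part of the column */
	ulint*		space_id,	/* out: space id of the first
					BLOB page */
	ulint*		page_no,	/* out: page number of the first
					BLOB page */
	ulint*		offset,		/* out: offset on the first
					BLOB page */
	ulint*		extern_len)	/* out: length of the externally
					stored part, in bytes */
{
	*space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
	*page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
	*offset = mach_read_from_4(field_ref + BTR_EXTERN_OFFSET);
	/* Only the least significant 4 bytes of the 8-byte length
	field are used. */
	*extern_len = mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4);
}
#endif
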
/***********************************************************************
Copies an externally stored field of a record to mem heap.  The
clustered index record must be protected by a lock or a page latch. */
btr_copy_externally_stored_field(
/*=============================*/
				/* out: the whole field copied to heap */
	ulint*		len,	/* out: length of the whole field */
	const byte*	data,	/* in: 'internally' stored part of the
				field containing also the reference to
				the external part; must be protected by
				a lock or a page latch */
	ulint		zip_size,/* in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		local_len,/* in: length of data */
	mem_heap_t*	heap)	/* in: mem heap */
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);

	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
/***********************************************************************
Copies an externally stored field of a record to mem heap. */
btr_rec_copy_externally_stored_field(
/*=================================*/
				/* out: the field copied to heap */
	const rec_t*	rec,	/* in: record in a clustered index;
				must be protected by a lock or a page latch */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	ulint		zip_size,/* in: nonzero=compressed BLOB page size,
				zero for uncompressed BLOBs */
	ulint		no,	/* in: field number */
	ulint*		len,	/* out: length of the field */
	mem_heap_t*	heap)	/* in: mem heap */
	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial data
	from the field, and in its last 20 bytes it has the space id,
	page number, and offset where the rest of the field data is
	stored, as well as the length of the externally stored data.
	We may need to store some data locally to get the local record
	length above the 128 byte limit, so that field offsets are
	stored in two bytes and the extern bit is available in those
	two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	return(btr_copy_externally_stored_field(len, data,
						zip_size, local_len, heap));
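
/* Illustrative sketch, not part of the original source: a minimal caller
of btr_rec_copy_externally_stored_field(), assuming the clustered index
record is protected by a lock or a page latch as required above.  The
function name is hypothetical and the block is compiled out; it only
shows the calling convention: compute the record offsets, take the
compressed page size from the table, and copy the column to the caller's
heap. */
#if 0
static
byte*
btr_blob_fetch_example(
/*===================*/
				/* out: the column copied to heap,
				or NULL if it is not stored externally */
	const rec_t*	rec,	/* in: record in a clustered index */
	dict_index_t*	index,	/* in: clustered index of rec */
	ulint		no,	/* in: field number of the column */
	ulint*		len,	/* out: length of the copied column */
	mem_heap_t*	heap)	/* in: heap where the column is copied */
{
	mem_heap_t*	offsets_heap	= NULL;
	ulint*		offsets;
	byte*		field		= NULL;

	offsets = rec_get_offsets(rec, index, NULL,
				  ULINT_UNDEFINED, &offsets_heap);

	if (rec_offs_nth_extern(offsets, no)) {

		field = btr_rec_copy_externally_stored_field(
			rec, offsets, dict_table_zip_size(index->table),
			no, len, heap);
	}

	if (offsets_heap) {
		mem_heap_free(offsets_heap);
	}

	return(field);
}
#endif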