/******************************************************
All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
in pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward. For example, for
a tree of height 3 this means reserving 6 pages.

(c) 1994-2001 Innobase Oy

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "page0page.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"

#ifdef UNIV_DEBUG
/* If the following is set to TRUE, this module prints a lot of
trace information of individual record operations */
ibool btr_cur_print_record_ops = FALSE;
#endif /* UNIV_DEBUG */

ulint btr_cur_n_non_sea = 0;
ulint btr_cur_n_sea = 0;
ulint btr_cur_n_non_sea_old = 0;
ulint btr_cur_n_sea_old = 0;

/* In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */

#define BTR_CUR_PAGE_REORGANIZE_LIMIT (UNIV_PAGE_SIZE / 32)
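
/* Illustrative sketch, not part of the original source: the way this limit
is applied by btr_cur_optimistic_insert() below can be written as a
standalone predicate with a hypothetical name. A reorganize is worth trying
only if it would both make room for the record and release at least
BTR_CUR_PAGE_REORGANIZE_LIMIT bytes. */

static ibool
btr_cur_page_reorganize_would_pay(
/*==============================*/
			/* out: TRUE if a reorganize is worth trying */
	ulint	max_size,	/* in: max insert size after a reorganize */
	ulint	rec_size)	/* in: converted size of the record */
{
	return(max_size >= rec_size
	       && max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT);
}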

/* When estimating the number of different key values in an index, sample
this many index pages */
#define BTR_KEY_VAL_ESTIMATE_N_PAGES 8

/* The structure of a BLOB part header */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN 0 /* BLOB part len on this page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO 4 /* next BLOB part page no, FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE 8
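
/* Illustrative sketch, not part of the original source: decoding a BLOB
part header laid out as above. The helper name is hypothetical;
mach_read_from_4() is the standard 4-byte big-endian reader assumed to be
available from mach0data.h. */

static void
btr_blob_hdr_decode(
/*================*/
	const byte*	blob_hdr,	/* in: BLOB part header */
	ulint*		part_len,	/* out: BLOB part len on this page */
	ulint*		next_page_no)	/* out: next part page, or FIL_NULL */
{
	*part_len = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_PART_LEN);
	*next_page_no = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_NEXT_PAGE_NO);
}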

/***********************************************************************
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that is
not delete marked always owns all its extern fields. */
btr_cur_unmark_extern_fields(
/*=========================*/
rec_t* rec, /* in: record in a clustered index */
mtr_t* mtr, /* in: mtr */
const ulint* offsets);/* in: array returned by rec_get_offsets() */

/***********************************************************************
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
btr_cur_add_path_info(
/*==================*/
btr_cur_t* cursor, /* in: cursor positioned on a page */
ulint height, /* in: height of the page in tree;
0 means leaf node */
ulint root_height); /* in: root node height in tree */

/***************************************************************
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
btr_rec_free_updated_extern_fields(
/*===============================*/
dict_index_t* index, /* in: index of rec; the index tree MUST be
X-latched */
rec_t* rec, /* in: record */
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
upd_t* update, /* in: update vector */
ibool do_not_free_inherited,/* in: TRUE if called in a
rollback and we do not want to free
inherited fields */
mtr_t* mtr); /* in: mini-transaction handle which contains
an X-latch to record page and to the tree */

/***************************************************************
Gets the externally stored size of a record, in units of a database page. */
btr_rec_get_externally_stored_len(
/*==============================*/
/* out: externally stored part,
in units of a database page */
rec_t* rec, /* in: record */
const ulint* offsets);/* in: array returned by rec_get_offsets() */

/*==================== B-TREE SEARCH =========================*/

/************************************************************************
Latches the leaf page or pages requested. */
btr_cur_latch_leaves(
/*=================*/
page_t* page, /* in: leaf page where the search
converges */
ulint space, /* in: space id */
ulint page_no, /* in: page number of the leaf */
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /* in: cursor */
mtr_t* mtr) /* in: mtr */

if (latch_mode == BTR_SEARCH_LEAF) {

get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush = TRUE;

} else if (latch_mode == BTR_MODIFY_LEAF) {

get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush = TRUE;

} else if (latch_mode == BTR_MODIFY_TREE) {

/* x-latch also brothers from left to right */
left_page_no = btr_page_get_prev(page, mtr);

if (left_page_no != FIL_NULL) {
get_page = btr_page_get(space, left_page_no,
RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_next(get_page, mtr)
== buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush
= TRUE;

get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush = TRUE;

right_page_no = btr_page_get_next(page, mtr);

if (right_page_no != FIL_NULL) {
get_page = btr_page_get(space, right_page_no,
RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_prev(get_page, mtr)
== buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
buf_block_align(get_page)->check_index_page_at_flush
= TRUE;

} else if (latch_mode == BTR_SEARCH_PREV) {

/* s-latch also left brother */
left_page_no = btr_page_get_prev(page, mtr);

if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_S_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_next(cursor->left_page, mtr)
== buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
ut_a(page_is_comp(cursor->left_page)
== page_is_comp(page));
buf_block_align(cursor->left_page)
->check_index_page_at_flush = TRUE;

get_page = btr_page_get(space, page_no, RW_S_LATCH, mtr);
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush = TRUE;

} else if (latch_mode == BTR_MODIFY_PREV) {

/* x-latch also left brother */
left_page_no = btr_page_get_prev(page, mtr);

if (left_page_no != FIL_NULL) {
cursor->left_page = btr_page_get(space, left_page_no,
RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_next(cursor->left_page, mtr)
== buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
ut_a(page_is_comp(cursor->left_page)
== page_is_comp(page));
buf_block_align(cursor->left_page)
->check_index_page_at_flush = TRUE;

get_page = btr_page_get(space, page_no, RW_X_LATCH, mtr);
ut_a(page_is_comp(get_page) == page_is_comp(page));
buf_block_align(get_page)->check_index_page_at_flush = TRUE;

/************************************************************************
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
btr_cur_search_to_nth_level(
/*========================*/
dict_index_t* index, /* in: index */
ulint level, /* in: the tree level of search */
dtuple_t* tuple, /* in: data tuple; NOTE: n_fields_cmp in
tuple must be set so that it cannot get
compared to the node ptr page number field! */
ulint mode, /* in: PAGE_CUR_L, ...;
Inserts should always be made using
PAGE_CUR_LE to search the position! */
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with
BTR_INSERT and BTR_ESTIMATE;
cursor->left_page is used to store a pointer
to the left neighbor page, in the cases
BTR_SEARCH_PREV and BTR_MODIFY_PREV;
NOTE that if has_search_latch
is != 0, we may not have a latch set
on the cursor page; we assume
the caller uses the search latch
to protect the record! */
btr_cur_t* cursor, /* in/out: tree cursor; the cursor page is
s- or x-latched, but see also above! */
ulint has_search_latch,/* in: info on the latch mode the
caller currently has on btr_search_latch:
RW_S_LATCH, or 0 */
mtr_t* mtr) /* in: mtr */
page_cur_t* page_cursor;
ulint insert_planned;
ulint ignore_sec_unique;
ulint root_height = 0; /* remove warning */
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

/* Currently, PAGE_CUR_LE is the only search mode used for searches
ending at the upper levels */

ut_ad(level == 0 || mode == PAGE_CUR_LE);
ut_ad(dict_index_check_search_tuple(index, tuple));
ut_ad(!(index->type & DICT_IBUF) || ibuf_inside());
ut_ad(dtuple_check_typed(tuple));

cursor->up_match = ULINT_UNDEFINED;
cursor->low_match = ULINT_UNDEFINED;

insert_planned = latch_mode & BTR_INSERT;
estimate = latch_mode & BTR_ESTIMATE;
ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
| BTR_IGNORE_SEC_UNIQUE);

ut_ad(!insert_planned || (mode == PAGE_CUR_LE));

cursor->flag = BTR_CUR_BINARY;
cursor->index = index;

#ifndef BTR_CUR_ADAPT
info = btr_search_get_info(index);
guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT
#ifdef UNIV_SEARCH_PERF_STAT
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
#ifdef PAGE_CUR_LE_OR_EXTENDS
&& mode != PAGE_CUR_LE_OR_EXTENDS
#endif /* PAGE_CUR_LE_OR_EXTENDS */
&& srv_use_adaptive_hash_indexes
&& btr_search_guess_on_hash(index, info, tuple, mode,
has_search_latch, mtr)) {

/* Search using the hash index succeeded */

ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_GE);
ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_LE);
ut_ad(cursor->low_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_LE);

/* If the hash search did not succeed, do binary search down the
tree */

if (has_search_latch) {
/* Release possible search latch to obey latching order */
rw_lock_s_unlock(&btr_search_latch);

/* Store the position of the tree latch we push to mtr so that we
know how to release it when we have latched leaf node(s) */

savepoint = mtr_set_savepoint(mtr);

if (latch_mode == BTR_MODIFY_TREE) {
mtr_x_lock(dict_index_get_lock(index), mtr);

} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_X_LOCK));
mtr_s_lock(dict_index_get_lock(index), mtr);

page_cursor = btr_cur_get_page_cur(cursor);

space = dict_index_get_space(index);
page_no = dict_index_get_page(index);

height = ULINT_UNDEFINED;
rw_latch = RW_NO_LATCH;

/* We use these modified search modes on non-leaf levels of the
B-tree. These let us end up in the right B-tree leaf. In that leaf
we use the original search mode. */

page_mode = PAGE_CUR_L;
page_mode = PAGE_CUR_LE;
#ifdef PAGE_CUR_LE_OR_EXTENDS
ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
|| mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
#endif /* PAGE_CUR_LE_OR_EXTENDS */

/* Loop and search until we arrive at the desired level */

if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) {

rw_latch = latch_mode;
&& ibuf_should_try(index, ignore_sec_unique)) {

/* Try to insert into the insert buffer if the
page is not in the buffer pool */

buf_mode = BUF_GET_IF_IN_POOL;

page = buf_page_get_gen(space, page_no, rw_latch, guess,

/* This must be a search to perform an insert;
try to insert into the insert buffer */

ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
ut_ad(insert_planned);

if (ibuf_should_try(index, ignore_sec_unique)
&& ibuf_insert(tuple, index, space, page_no,

/* Insertion to the insert buffer succeeded */
cursor->flag = BTR_CUR_INSERT_TO_IBUF;
if (UNIV_LIKELY_NULL(heap)) {

/* Insert to the insert buffer did not succeed:
retry page get */

buf_block_align(page)->check_index_page_at_flush = TRUE;

#ifdef UNIV_SYNC_DEBUG
if (rw_latch != RW_NO_LATCH) {
buf_page_dbg_add_level(page, SYNC_TREE_NODE);

ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));

if (height == ULINT_UNDEFINED) {
/* We are in the root node */

height = btr_page_get_level(page, mtr);
root_height = height;
cursor->tree_height = root_height + 1;
info->root_guess = page;

if (rw_latch == RW_NO_LATCH) {

btr_cur_latch_leaves(page, space,
page_no, latch_mode, cursor, mtr);

if ((latch_mode != BTR_MODIFY_TREE)
&& (latch_mode != BTR_CONT_MODIFY_TREE)) {

/* Release the tree s-latch */

mtr_release_s_latch_at_savepoint(
mtr, savepoint,
dict_index_get_lock(index));

page_cur_search_with_match(page, index, tuple, page_mode,
&up_match, &up_bytes,
&low_match, &low_bytes,
page_cursor);

btr_cur_add_path_info(cursor, height, root_height);

/* If this is the desired level, leave the loop */

ut_ad(height == btr_page_get_level(
page_cur_get_page(page_cursor), mtr));

if (level == height) {

/* x-latch the page */
page = btr_page_get(space,
page_no, RW_X_LATCH, mtr);
ut_a((ibool)!!page_is_comp(page)
== dict_table_is_comp(index->table));

node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
ULINT_UNDEFINED, &heap);

/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

if (UNIV_LIKELY_NULL(heap)) {

cursor->low_match = low_match;
cursor->low_bytes = low_bytes;
cursor->up_match = up_match;
cursor->up_bytes = up_bytes;

if (srv_use_adaptive_hash_indexes) {

btr_search_info_update(index, cursor);

ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_GE);
ut_ad(cursor->up_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_LE);
ut_ad(cursor->low_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_LE);

if (has_search_latch) {

rw_lock_s_lock(&btr_search_latch);
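
/* Illustrative usage sketch, not part of the original source: positioning
a cursor for an insert as described above, with PAGE_CUR_LE and a leaf
x-latch. mtr_start()/mtr_commit() are the standard mini-transaction calls
assumed from mtr0mtr.h; 'entry' is a data tuple prepared by the caller. */
#if 0
	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
				BTR_MODIFY_LEAF, &cursor, 0, &mtr);
	/* ... insert after the cursor ... */
	mtr_commit(&mtr);
#endif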

/*********************************************************************
Opens a cursor at either end of an index. */
btr_cur_open_at_index_side(
/*=======================*/
ibool from_left, /* in: TRUE if open to the low end,
FALSE if to the high end */
dict_index_t* index, /* in: index */
ulint latch_mode, /* in: latch mode */
btr_cur_t* cursor, /* in: cursor */
mtr_t* mtr) /* in: mtr */

page_cur_t* page_cursor;
ulint root_height = 0; /* remove warning */
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

estimate = latch_mode & BTR_ESTIMATE;
latch_mode = latch_mode & ~BTR_ESTIMATE;

/* Store the position of the tree latch we push to mtr so that we
know how to release it when we have latched the leaf node */

savepoint = mtr_set_savepoint(mtr);

if (latch_mode == BTR_MODIFY_TREE) {
mtr_x_lock(dict_index_get_lock(index), mtr);
mtr_s_lock(dict_index_get_lock(index), mtr);

page_cursor = btr_cur_get_page_cur(cursor);
cursor->index = index;

space = dict_index_get_space(index);
page_no = dict_index_get_page(index);

height = ULINT_UNDEFINED;

page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL,
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));

buf_block_align(page)->check_index_page_at_flush = TRUE;

if (height == ULINT_UNDEFINED) {
/* We are in the root node */

height = btr_page_get_level(page, mtr);
root_height = height;

btr_cur_latch_leaves(page, space, page_no,
latch_mode, cursor, mtr);

/* In versions <= 3.23.52 we had forgotten to
release the tree latch here. If in an index scan
we had to scan far to find a record visible to the
current transaction, that could starve others
waiting for the tree latch. */

if ((latch_mode != BTR_MODIFY_TREE)
&& (latch_mode != BTR_CONT_MODIFY_TREE)) {

/* Release the tree s-latch */

mtr_release_s_latch_at_savepoint(
mtr, savepoint,
dict_index_get_lock(index));

page_cur_set_before_first(page, page_cursor);
page_cur_set_after_last(page, page_cursor);

btr_cur_add_path_info(cursor, height,
root_height);

page_cur_move_to_next(page_cursor);
page_cur_move_to_prev(page_cursor);

btr_cur_add_path_info(cursor, height, root_height);

node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
ULINT_UNDEFINED, &heap);

/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

if (UNIV_LIKELY_NULL(heap)) {
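
/* Illustrative usage sketch, not part of the original source: starting a
full index scan at the low end of an index, as described above. */
#if 0
	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF,
				&cursor, &mtr);
	/* ... scan forward from the cursor ... */
	mtr_commit(&mtr);
#endif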

/**************************************************************************
Positions a cursor at a randomly chosen position within a B-tree. */
btr_cur_open_at_rnd_pos(
/*====================*/
dict_index_t* index, /* in: index */
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ... */
btr_cur_t* cursor, /* in/out: B-tree cursor */
mtr_t* mtr) /* in: mtr */

page_cur_t* page_cursor;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

if (latch_mode == BTR_MODIFY_TREE) {
mtr_x_lock(dict_index_get_lock(index), mtr);
mtr_s_lock(dict_index_get_lock(index), mtr);

page_cursor = btr_cur_get_page_cur(cursor);
cursor->index = index;

space = dict_index_get_space(index);
page_no = dict_index_get_page(index);

height = ULINT_UNDEFINED;

page = buf_page_get_gen(space, page_no, RW_NO_LATCH, NULL,
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));

if (height == ULINT_UNDEFINED) {
/* We are in the root node */

height = btr_page_get_level(page, mtr);

btr_cur_latch_leaves(page, space, page_no,
latch_mode, cursor, mtr);

page_cur_open_on_rnd_user_rec(page, page_cursor);

node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
ULINT_UNDEFINED, &heap);

/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);

if (UNIV_LIKELY_NULL(heap)) {

/*==================== B-TREE INSERT =========================*/

/*****************************************************************
Inserts a record if there is enough space, or if enough space can
be freed by reorganizing. Differs from _optimistic_insert because
no heuristic is applied as to whether it pays to use CPU time for
reorganizing the page or not. */
btr_cur_insert_if_possible(
/*=======================*/
/* out: pointer to inserted record if the
insert succeeds, else NULL */
btr_cur_t* cursor, /* in: cursor on page after which to insert;
cursor stays valid */
dtuple_t* tuple, /* in: tuple to insert; the size info need not
have been stored to tuple */
ibool* reorg, /* out: TRUE if reorganization occurred */
mtr_t* mtr) /* in: mtr */

page_cur_t* page_cursor;

ut_ad(dtuple_check_typed(tuple));

page = btr_cur_get_page(cursor);

ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
page_cursor = btr_cur_get_page_cur(cursor);

/* Now, try the insert */
rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index, mtr);

/* If the record did not fit, reorganize */

btr_page_reorganize(page, cursor->index, mtr);

page_cur_search(page, cursor->index, tuple,
PAGE_CUR_LE, page_cursor);

rec = page_cur_tuple_insert(page_cursor, tuple,
cursor->index, mtr);

/*****************************************************************
For an insert, checks the locks and does the undo logging if desired. */
btr_cur_ins_lock_and_undo(
/*======================*/
/* out: DB_SUCCESS, DB_WAIT_LOCK,
DB_FAIL, or error number */
ulint flags, /* in: undo logging and locking flags: if
not zero, the parameters index and thr
should be specified */
btr_cur_t* cursor, /* in: cursor on page after which to insert */
dtuple_t* entry, /* in: entry to insert */
que_thr_t* thr, /* in: query thread or NULL */
ibool* inherit)/* out: TRUE if the inserted new record maybe
should inherit LOCK_GAP type locks from the
successor record */

/* Check if we have to wait for a lock: enqueue an explicit lock
request if yes */

rec = btr_cur_get_rec(cursor);
index = cursor->index;

err = lock_rec_insert_check_and_lock(flags, rec, index, thr, inherit);

if (err != DB_SUCCESS) {

if ((index->type & DICT_CLUSTERED) && !(index->type & DICT_IBUF)) {

err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,

if (err != DB_SUCCESS) {

/* Now we can fill in the roll ptr field in entry */

if (!(flags & BTR_KEEP_SYS_FLAG)) {

row_upd_index_entry_sys_field(entry, index,
DATA_ROLL_PTR, roll_ptr);

#ifdef UNIV_DEBUG
/*****************************************************************
Report information about a transaction. */
btr_cur_trx_report(
/*===============*/
trx_t* trx, /* in: transaction */
const dict_index_t* index, /* in: index */
const char* op) /* in: operation */

fprintf(stderr, "Trx with id %lu %lu going to ",
ut_dulint_get_high(trx->id),
ut_dulint_get_low(trx->id));
dict_index_name_print(stderr, trx, index);
#endif /* UNIV_DEBUG */

/*****************************************************************
Tries to perform an insert into a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record. */
btr_cur_optimistic_insert(
/*======================*/
/* out: DB_SUCCESS, DB_WAIT_LOCK,
DB_FAIL, or error number */
ulint flags, /* in: undo logging and locking flags: if not
zero, the parameters index and thr should be
specified */
btr_cur_t* cursor, /* in: cursor on page after which to insert;
cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
the insert succeeds */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or
NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr) /* in: mtr */

big_rec_t* big_rec_vec = NULL;
page_cur_t* page_cursor;

page = btr_cur_get_page(cursor);
index = cursor->index;

if (!dtuple_check_typed_no_assert(entry)) {
fputs("InnoDB: Error in a tuple to insert into ", stderr);
dict_index_name_print(stderr, thr_get_trx(thr), index);
#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
dtuple_print(stderr, entry);
#endif /* UNIV_DEBUG */

ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
max_size = page_get_max_insert_size_after_reorganize(page, 1);
level = btr_page_get_level(page, mtr);

calculate_sizes_again:
/* Calculate the record size when entry is converted to a record */
rec_size = rec_get_converted_size(index, entry);

if (rec_size
>= ut_min(page_get_free_space_of_empty(page_is_comp(page)) / 2,
REC_MAX_DATA_SIZE)) {

/* The record is so big that we have to store some fields
externally on separate database pages */

big_rec_vec = dtuple_convert_big_rec(index, entry, NULL, 0);

if (big_rec_vec == NULL) {

return(DB_TOO_BIG_RECORD);

goto calculate_sizes_again;

/* If there have been many consecutive inserts, and we are on the leaf
level, check if we have to split the page to reserve enough free space
for future updates of records. */

if ((type & DICT_CLUSTERED)
&& (dict_index_get_space_reserve() + rec_size > max_size)
&& (page_get_n_recs(page) >= 2)
&& (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {

dtuple_convert_back_big_rec(index, entry, big_rec_vec);

if (!(((max_size >= rec_size)
&& (max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT))
|| (page_get_max_insert_size(page, 1) >= rec_size)
|| (page_get_n_recs(page) <= 1))) {

dtuple_convert_back_big_rec(index, entry, big_rec_vec);

/* Check locks and write to the undo log, if specified */
err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &inherit);

if (err != DB_SUCCESS) {

dtuple_convert_back_big_rec(index, entry, big_rec_vec);

page_cursor = btr_cur_get_page_cur(cursor);

/* Now, try the insert */

*rec = page_cur_insert_rec_low(page_cursor, entry, index,

if (UNIV_UNLIKELY(!(*rec))) {
/* If the record did not fit, reorganize */
btr_page_reorganize(page, index, mtr);

ut_ad(page_get_max_insert_size(page, 1) == max_size);

page_cur_search(page, index, entry, PAGE_CUR_LE, page_cursor);

*rec = page_cur_tuple_insert(page_cursor, entry, index, mtr);

if (UNIV_UNLIKELY(!*rec)) {
fputs("InnoDB: Error: cannot insert tuple ", stderr);
dtuple_print(stderr, entry);
fputs(" into ", stderr);
dict_index_name_print(stderr, thr_get_trx(thr), index);
fprintf(stderr, "\nInnoDB: max insert size %lu\n",

#ifdef BTR_CUR_HASH_ADAPT
if (!reorg && (0 == level) && (cursor->flag == BTR_CUR_HASH)) {
btr_search_update_hash_node_on_insert(cursor);
btr_search_update_hash_on_insert(cursor);

if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

lock_update_insert(*rec);

fprintf(stderr, "Insert into page %lu, max ins size %lu,"
" rec %lu ind type %lu\n",
buf_frame_get_page_no(page), max_size,
rec_size + PAGE_DIR_SLOT_SIZE, type);

if (!(type & DICT_CLUSTERED)) {
/* We have added a record to page: update its free bits */
ibuf_update_free_bits_if_full(cursor->index, page, max_size,
rec_size + PAGE_DIR_SLOT_SIZE);

*big_rec = big_rec_vec;
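
/* Illustrative sketch, not part of the original source: the "record too
big" test used above and in btr_cur_pessimistic_insert(), written as a
standalone predicate with a hypothetical name. A record whose converted
size reaches half the free space of an empty page (capped by
REC_MAX_DATA_SIZE) must have some fields stored externally. */

static ibool
btr_cur_rec_needs_ext_storage(
/*==========================*/
			/* out: TRUE if some fields must be
			stored externally */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry,	/* in: entry to insert */
	ulint		comp)	/* in: nonzero=compact page format */
{
	return(rec_get_converted_size(index, entry)
	       >= ut_min(page_get_free_space_of_empty(comp) / 2,
			 REC_MAX_DATA_SIZE));
}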

/*****************************************************************
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist. */
btr_cur_pessimistic_insert(
/*=======================*/
/* out: DB_SUCCESS or error number */
ulint flags, /* in: undo logging and locking flags: if not
zero, the parameter thr should be
specified; if no undo logging is specified,
then the caller must have reserved enough
free extents in the file space so that the
insertion will certainly succeed */
btr_cur_t* cursor, /* in: cursor after which to insert;
cursor stays valid */
dtuple_t* entry, /* in: entry to insert */
rec_t** rec, /* out: pointer to inserted record if
the insert succeeds */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or
NULL */
que_thr_t* thr, /* in: query thread or NULL */
mtr_t* mtr) /* in: mtr */

dict_index_t* index = cursor->index;
big_rec_t* big_rec_vec = NULL;
ulint n_extents = 0;

ut_ad(dtuple_check_typed(entry));

page = btr_cur_get_page(cursor);

ut_ad(mtr_memo_contains(mtr,
dict_index_get_lock(btr_cur_get_index(cursor)),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));

/* Try first an optimistic insert; reset the cursor flag: we do not
assume anything of how it was positioned */

cursor->flag = BTR_CUR_BINARY;

err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec,
thr, mtr);

if (err != DB_FAIL) {

/* Retry with a pessimistic insert. Check locks and write to undo log,
if specified */

err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &dummy_inh);

if (err != DB_SUCCESS) {

if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
/* First reserve enough free space for the file segments
of the index tree, so that the insert will not fail because
of lack of space */

n_extents = cursor->tree_height / 16 + 3;
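/* e.g. with a tree height of 16 this reserves 16/16 + 3 = 4 extents */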

success = fsp_reserve_free_extents(&n_reserved, index->space,
n_extents, FSP_NORMAL, mtr);
err = DB_OUT_OF_FILE_SPACE;

if (rec_get_converted_size(index, entry)
>= ut_min(page_get_free_space_of_empty(page_is_comp(page)) / 2,
REC_MAX_DATA_SIZE)) {

/* The record is so big that we have to store some fields
externally on separate database pages */

big_rec_vec = dtuple_convert_big_rec(index, entry, NULL, 0);

if (big_rec_vec == NULL) {

if (n_extents > 0) {
fil_space_release_free_extents(index->space,
n_reserved);
return(DB_TOO_BIG_RECORD);

if (dict_index_get_page(index) == buf_frame_get_page_no(page)) {

/* The page is the root page */
*rec = btr_root_raise_and_insert(cursor, entry, mtr);
*rec = btr_page_split_and_insert(cursor, entry, mtr);

btr_cur_position(index, page_rec_get_prev(*rec), cursor);

#ifdef BTR_CUR_ADAPT
btr_search_update_hash_on_insert(cursor);

if (!(flags & BTR_NO_LOCKING_FLAG)) {

lock_update_insert(*rec);

if (n_extents > 0) {
fil_space_release_free_extents(index->space, n_reserved);

*big_rec = big_rec_vec;
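
/* Illustrative usage sketch, not part of the original source: a caller
would typically try the optimistic path first and fall back to the
pessimistic one on DB_FAIL, mirroring the retry that
btr_cur_pessimistic_insert() itself performs above. */
#if 0
	err = btr_cur_optimistic_insert(0, &cursor, entry, &rec, &big_rec,
				thr, &mtr);
	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(0, &cursor, entry, &rec,
				&big_rec, thr, &mtr);
	}
#endif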

/*==================== B-TREE UPDATE =========================*/

/*****************************************************************
For an update, checks the locks and does the undo logging. */
btr_cur_upd_lock_and_undo(
/*======================*/
/* out: DB_SUCCESS, DB_WAIT_LOCK, or error
number */
ulint flags, /* in: undo logging and locking flags */
btr_cur_t* cursor, /* in: cursor on record to update */
upd_t* update, /* in: update vector */
ulint cmpl_info,/* in: compiler info on secondary index
updates */
que_thr_t* thr, /* in: query thread */
dulint* roll_ptr)/* out: roll pointer */

dict_index_t* index;

ut_ad(cursor && update && thr && roll_ptr);

rec = btr_cur_get_rec(cursor);
index = cursor->index;

if (!(index->type & DICT_CLUSTERED)) {
/* We do undo logging only when we update a clustered index
record */
return(lock_sec_rec_modify_check_and_lock(flags, rec, index,
thr));

/* Check if we have to wait for a lock: enqueue an explicit lock
request if yes */

if (!(flags & BTR_NO_LOCKING_FLAG)) {
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

err = lock_clust_rec_modify_check_and_lock(
flags, rec, index,
rec_get_offsets(rec, index, offsets_,
ULINT_UNDEFINED, &heap), thr);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

if (err != DB_SUCCESS) {

/* Append the info about the update in the undo log */

err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
index, NULL, update,
cmpl_info, rec, roll_ptr);

/***************************************************************
Writes a redo log record of updating a record in-place. */
btr_cur_update_in_place_log(
/*========================*/
ulint flags, /* in: flags */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index where cursor positioned */
upd_t* update, /* in: update vector */
trx_t* trx, /* in: transaction */
dulint roll_ptr, /* in: roll ptr */
mtr_t* mtr) /* in: mtr */

page_t* page = page_align(rec);
ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
? MLOG_COMP_REC_UPDATE_IN_PLACE
: MLOG_REC_UPDATE_IN_PLACE,
1 + DATA_ROLL_PTR_LEN + 14 + 2

/* Logging in mtr is switched off during crash recovery */

/* The code below assumes index is a clustered index: change index to
the clustered index if we are updating a secondary index record (or we
could as well skip writing the sys col values to the log in this case
because they are not needed for a secondary index record update) */

index = dict_table_get_first_index(index->table);

mach_write_to_1(log_ptr, flags);

log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
mtr);
mach_write_to_2(log_ptr, page_offset(rec));

row_upd_index_write_log(update, log_ptr, mtr);

/***************************************************************
Parses a redo log record of updating a record in-place. */
btr_cur_parse_update_in_place(
/*==========================*/
/* out: end of log record or NULL */
byte* ptr, /* in: buffer */
byte* end_ptr,/* in: buffer end */
page_t* page, /* in: page or NULL */
dict_index_t* index) /* in: index corresponding to page */

if (end_ptr < ptr + 1) {

flags = mach_read_from_1(ptr);

ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

if (end_ptr < ptr + 2) {

rec_offset = mach_read_from_2(ptr);

ut_a(rec_offset <= UNIV_PAGE_SIZE);

heap = mem_heap_create(256);

ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

if (!ptr || !page) {

ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
rec = page + rec_offset;

/* We do not need to reserve btr_search_latch, as the page is only
being recovered, and there cannot be a hash index to it. */

offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_rec_sys_fields_in_recovery(rec, offsets,
pos, trx_id, roll_ptr);

row_upd_rec_in_place(rec, offsets, update);

mem_heap_free(heap);

/*****************************************************************
Updates a record when the update causes no size changes in its fields.
We assume here that the ordering fields of the record do not change. */
btr_cur_update_in_place(
/*====================*/
/* out: DB_SUCCESS or error number */
ulint flags, /* in: undo logging and locking flags */
btr_cur_t* cursor, /* in: cursor on the record to update;
cursor stays valid and positioned on the
same record */
upd_t* update, /* in: update vector */
ulint cmpl_info,/* in: compiler info on secondary index
updates */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */

dict_index_t* index;
dulint roll_ptr = ut_dulint_zero;
ulint was_delete_marked;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

rec = btr_cur_get_rec(cursor);
index = cursor->index;
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
trx = thr_get_trx(thr);
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
btr_cur_trx_report(trx, index, "update ");
rec_print_new(stderr, rec, offsets);
#endif /* UNIV_DEBUG */

/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, &roll_ptr);
if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

block = buf_block_align(rec);
ut_ad(!!page_is_comp(buf_block_get_frame(block))
== dict_table_is_comp(index->table));

if (block->is_hashed) {
/* The function row_upd_changes_ord_field_binary works only
if the update vector was built for a clustered index; we must
NOT call it if the index is secondary */

if (!(index->type & DICT_CLUSTERED)
|| row_upd_changes_ord_field_binary(NULL, index, update)) {

/* Remove possible hash index pointer to this record */
btr_search_update_hash_on_delete(cursor);

rw_lock_x_lock(&btr_search_latch);

if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr);

was_delete_marked = rec_get_deleted_flag(
rec, page_is_comp(buf_block_get_frame(block)));

row_upd_rec_in_place(rec, offsets, update);

if (block->is_hashed) {
rw_lock_x_unlock(&btr_search_latch);

btr_cur_update_in_place_log(flags, rec, index, update, trx, roll_ptr,
mtr);
if (was_delete_marked
&& !rec_get_deleted_flag(rec, page_is_comp(
buf_block_get_frame(block)))) {
/* The new updated record owns its possible externally
stored fields */

btr_cur_unmark_extern_fields(rec, mtr, offsets);

if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

/*****************************************************************
Tries to update a record on a page in an index tree. It is assumed that mtr
holds an x-latch on the page. The operation does not succeed if there is too
little space on the page or if the update would result in too empty a page,
so that tree compression is recommended. We assume here that the ordering
fields of the record do not change. */
btr_cur_optimistic_update(
/*======================*/
/* out: DB_SUCCESS, or DB_OVERFLOW if the
updated record does not fit, DB_UNDERFLOW
if the page would become too empty */
ulint flags, /* in: undo logging and locking flags */
btr_cur_t* cursor, /* in: cursor on the record to update;
cursor stays valid and positioned on the
same record */
upd_t* update, /* in: update vector; this must also
contain trx id and roll ptr fields */
ulint cmpl_info,/* in: compiler info on secondary index
updates */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */

dict_index_t* index;
page_cur_t* page_cursor;
dtuple_t* new_entry;
ibool reorganized = FALSE;

page = btr_cur_get_page(cursor);
rec = btr_cur_get_rec(cursor);
index = cursor->index;
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

heap = mem_heap_create(1024);
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
btr_cur_trx_report(thr_get_trx(thr), index, "update ");
rec_print_new(stderr, rec, offsets);
#endif /* UNIV_DEBUG */

ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
if (!row_upd_changes_field_size_or_external(index, offsets, update)) {

/* The simplest and the most common case: the update does not
change the size of any field and none of the updated fields is
externally stored in rec or update */
mem_heap_free(heap);
return(btr_cur_update_in_place(flags, cursor, update,
cmpl_info, thr, mtr));

for (i = 0; i < upd_get_n_fields(update); i++) {
if (upd_get_nth_field(update, i)->extern_storage) {

/* Externally stored fields are treated in pessimistic
update */

mem_heap_free(heap);
return(DB_OVERFLOW);

if (rec_offs_any_extern(offsets)) {
/* Externally stored fields are treated in pessimistic
update */

mem_heap_free(heap);
return(DB_OVERFLOW);

page_cursor = btr_cur_get_page_cur(cursor);

new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);

row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,

old_rec_size = rec_offs_size(offsets);
new_rec_size = rec_get_converted_size(index, new_entry);

if (UNIV_UNLIKELY(new_rec_size
>= (page_get_free_space_of_empty(page_is_comp(page))
/ 2))) {

mem_heap_free(heap);

return(DB_OVERFLOW);

max_size = old_rec_size
+ page_get_max_insert_size_after_reorganize(page, 1);

if (UNIV_UNLIKELY(page_get_data_size(page)
- old_rec_size + new_rec_size
< BTR_CUR_PAGE_COMPRESS_LIMIT)) {

/* The page would become too empty */

mem_heap_free(heap);

return(DB_UNDERFLOW);

if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
&& (max_size >= new_rec_size))
|| (page_get_n_recs(page) <= 1))) {

/* There was not enough space, or it did not pay to
reorganize: for simplicity, we decide what to do assuming a
reorganization is needed, though it might not be necessary */

mem_heap_free(heap);

return(DB_OVERFLOW);

/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr,
&roll_ptr);
if (err != DB_SUCCESS) {

mem_heap_free(heap);

/* Ok, we may do the replacement. Store on the page infimum the
explicit locks on rec, before deleting rec (see the comment in
.._pessimistic_update). */

lock_rec_store_on_page_infimum(page, rec);

btr_search_update_hash_on_delete(cursor);

page_cur_delete_rec(page_cursor, index, offsets, mtr);

page_cur_move_to_prev(page_cursor);

trx = thr_get_trx(thr);

if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
roll_ptr);
row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
trx->id);

rec = btr_cur_insert_if_possible(cursor, new_entry, &reorganized, mtr);

ut_a(rec); /* <- We calculated above the insert would fit */

if (!rec_get_deleted_flag(rec, page_is_comp(page))) {
/* The new inserted record owns its possible externally
stored fields */

offsets = rec_get_offsets(rec, index, offsets,
ULINT_UNDEFINED, &heap);
btr_cur_unmark_extern_fields(rec, mtr, offsets);

/* Restore the old explicit lock state on the record */

lock_rec_restore_from_page_infimum(rec, page);

page_cur_move_to_next(page_cursor);

mem_heap_free(heap);

/*****************************************************************
If, in a split, a new supremum record was created as the predecessor of the
updated record, the supremum record must inherit exactly the locks on the
updated record. In the split it may have inherited locks from the successor
of the updated record, which is not correct. This function restores the
right locks for the new supremum. */
btr_cur_pess_upd_restore_supremum(
/*==============================*/
rec_t* rec, /* in: updated record */
mtr_t* mtr) /* in: mtr */

page = buf_frame_align(rec);

if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
/* Updated record is not the first user record on its page */

space = buf_frame_get_space_id(page);
prev_page_no = btr_page_get_prev(page, mtr);

ut_ad(prev_page_no != FIL_NULL);
prev_page = buf_page_get_with_no_latch(space, prev_page_no, mtr);
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_next(prev_page, mtr)
== buf_frame_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

/* We must already have an x-latch to prev_page! */
ut_ad(mtr_memo_contains(mtr, buf_block_align(prev_page),
MTR_MEMO_PAGE_X_FIX));

lock_rec_reset_and_inherit_gap_locks(page_get_supremum_rec(prev_page),
rec);

/*****************************************************************
Performs an update of a record on a page of a tree. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. If the
update is made on the leaf level, to avoid deadlocks, mtr must also
own x-latches to brothers of page, if those brothers exist. We assume
here that the ordering fields of the record do not change. */
btr_cur_pessimistic_update(
/*=======================*/
/* out: DB_SUCCESS or error code */
ulint flags, /* in: undo logging, locking, and rollback
flags */
btr_cur_t* cursor, /* in: cursor on the record to update */
big_rec_t** big_rec,/* out: big rec vector whose fields have to
be stored externally by the caller, or NULL */
upd_t* update, /* in: update vector; this is also allowed to
contain trx id and roll ptr fields, but
the values in the update vector have no effect
ulint cmpl_info,/* in: compiler info on secondary index
updates */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */

big_rec_t* big_rec_vec = NULL;
big_rec_t* dummy_big_rec;
dict_index_t* index;
page_cur_t* page_cursor;
dtuple_t* new_entry;
ibool dummy_reorganized;
ulint n_extents = 0;
ulint* offsets = NULL;

page = btr_cur_get_page(cursor);
rec = btr_cur_get_rec(cursor);
index = cursor->index;

ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
MTR_MEMO_X_LOCK));
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));

optim_err = btr_cur_optimistic_update(flags, cursor, update,
cmpl_info, thr, mtr);

if (optim_err != DB_UNDERFLOW && optim_err != DB_OVERFLOW) {

/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, &roll_ptr);
if (err != DB_SUCCESS) {

if (optim_err == DB_OVERFLOW) {
/* First reserve enough free space for the file segments
of the index tree, so that the update will not fail because
of lack of space */

n_extents = cursor->tree_height / 16 + 3;

if (flags & BTR_NO_UNDO_LOG_FLAG) {
reserve_flag = FSP_CLEANING;
reserve_flag = FSP_NORMAL;

success = fsp_reserve_free_extents(&n_reserved, index->space,
n_extents, reserve_flag, mtr);
err = DB_OUT_OF_FILE_SPACE;

heap = mem_heap_create(1024);
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

trx = thr_get_trx(thr);

new_entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);

row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,

if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
roll_ptr);
row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
trx->id);

if (flags & BTR_NO_UNDO_LOG_FLAG) {

/* We are in a transaction rollback undoing a row
update: we must free possible externally stored fields
which got new values in the update, if they are not
inherited values. They can be inherited if we have
updated the primary key to another value, and then
updated it back again. */

ut_a(big_rec_vec == NULL);

btr_rec_free_updated_extern_fields(index, rec, offsets,
update, TRUE, mtr);

/* We have to set appropriate extern storage bits in the new
record to be inserted: we have to remember which fields were such */

ext_vect = mem_heap_alloc(heap, sizeof(ulint)
* dict_index_get_n_fields(index));
ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
offsets = rec_get_offsets(rec, index, offsets,
ULINT_UNDEFINED, &heap);
n_ext_vect = btr_push_update_extern_fields(ext_vect, offsets, update);

if (UNIV_UNLIKELY(rec_get_converted_size(index, new_entry)
>= ut_min(page_get_free_space_of_empty(
page_is_comp(page)) / 2,
REC_MAX_DATA_SIZE))) {

big_rec_vec = dtuple_convert_big_rec(index, new_entry,
ext_vect, n_ext_vect);
if (big_rec_vec == NULL) {

err = DB_TOO_BIG_RECORD;
goto return_after_reservations;

page_cursor = btr_cur_get_page_cur(cursor);

/* Store state of explicit locks on rec on the page infimum record,
before deleting rec. The page infimum acts as a dummy carrier of the
locks, taking care also of lock releases, before we can move the locks
back on the actual record. There is a special case: if we are
inserting on the root page and the insert causes a call of
btr_root_raise_and_insert. Therefore we cannot in the lock system
delete the lock structs set on the root page even if the root
page carries just node pointers. */

lock_rec_store_on_page_infimum(buf_frame_align(rec), rec);

btr_search_update_hash_on_delete(cursor);

page_cur_delete_rec(page_cursor, index, offsets, mtr);

page_cur_move_to_prev(page_cursor);

rec = btr_cur_insert_if_possible(cursor, new_entry,
&dummy_reorganized, mtr);
ut_a(rec || optim_err != DB_UNDERFLOW);

lock_rec_restore_from_page_infimum(rec, page);
rec_set_field_extern_bits(rec, index,
ext_vect, n_ext_vect, mtr);

offsets = rec_get_offsets(rec, index, offsets,
ULINT_UNDEFINED, &heap);

if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
/* The new inserted record owns its possible externally
stored fields */

btr_cur_unmark_extern_fields(rec, mtr, offsets);

btr_cur_compress_if_useful(cursor, mtr);

goto return_after_reservations;

if (page_cur_is_before_first(page_cursor)) {
/* The record to be updated was positioned as the first user
record on its page */

/* The first parameter means that no lock checking and undo logging
is made in the insert */

err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
| BTR_NO_LOCKING_FLAG
| BTR_KEEP_SYS_FLAG,
cursor, new_entry, &rec,
&dummy_big_rec, NULL, mtr);
ut_a(err == DB_SUCCESS);
ut_a(dummy_big_rec == NULL);

rec_set_field_extern_bits(rec, index, ext_vect, n_ext_vect, mtr);
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
/* The new inserted record owns its possible externally
stored fields */

btr_cur_unmark_extern_fields(rec, mtr, offsets);

lock_rec_restore_from_page_infimum(rec, page);

/* If necessary, restore also the correct lock state for a new,
preceding supremum record created in a page split. While the old
record was nonexistent, the supremum might have inherited its locks
from a wrong record. */

btr_cur_pess_upd_restore_supremum(rec, mtr);

return_after_reservations:
mem_heap_free(heap);

if (n_extents > 0) {
fil_space_release_free_extents(index->space, n_reserved);

*big_rec = big_rec_vec;

/*==================== B-TREE DELETE MARK AND UNMARK ===============*/

/********************************************************************
Writes the redo log record for delete marking or unmarking of an index
record. */
btr_cur_del_mark_set_clust_rec_log(
/*===============================*/
ulint flags, /* in: flags */
rec_t* rec, /* in: record */
dict_index_t* index, /* in: index of the record */
ibool val, /* in: value to set */
trx_t* trx, /* in: deleting transaction */
dulint roll_ptr,/* in: roll ptr to the undo log record */
mtr_t* mtr) /* in: mtr */

ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

log_ptr = mlog_open_and_write_index(mtr, rec, index,
page_rec_is_comp(rec)
? MLOG_COMP_REC_CLUST_DELETE_MARK
: MLOG_REC_CLUST_DELETE_MARK,
1 + 1 + DATA_ROLL_PTR_LEN

/* Logging in mtr is switched off during crash recovery */

mach_write_to_1(log_ptr, flags);
mach_write_to_1(log_ptr, val);

log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
mtr);
mach_write_to_2(log_ptr, page_offset(rec));

mlog_close(mtr, log_ptr);

/********************************************************************
Parses the redo log record for delete marking or unmarking of a clustered
index record. */
btr_cur_parse_del_mark_set_clust_rec(
/*=================================*/
/* out: end of log record or NULL */
byte* ptr, /* in: buffer */
byte* end_ptr,/* in: buffer end */
dict_index_t* index, /* in: index corresponding to page */
page_t* page) /* in: page or NULL */

ut_ad(!page
|| !!page_is_comp(page) == dict_table_is_comp(index->table));

if (end_ptr < ptr + 2) {

flags = mach_read_from_1(ptr);
val = mach_read_from_1(ptr);

ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

if (end_ptr < ptr + 2) {

offset = mach_read_from_2(ptr);

ut_a(offset <= UNIV_PAGE_SIZE);

rec = page + offset;

if (!(flags & BTR_KEEP_SYS_FLAG)) {
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

row_upd_rec_sys_fields_in_recovery(
rec, rec_get_offsets(rec, index, offsets_,
ULINT_UNDEFINED, &heap),
pos, trx_id, roll_ptr);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

/* We do not need to reserve btr_search_latch, as the page
is only being recovered, and there cannot be a hash index to
it. */

rec_set_deleted_flag(rec, page_is_comp(page), val);

/***************************************************************
Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
of the deleting transaction, and in the roll ptr field pointer to the
undo log record created. */
btr_cur_del_mark_set_clust_rec(
/*===========================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
number */
ulint flags, /* in: undo logging and locking flags */
btr_cur_t* cursor, /* in: cursor */
ibool val, /* in: value to set */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */

dict_index_t* index;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
*offsets_ = (sizeof offsets_) / sizeof *offsets_;

rec = btr_cur_get_rec(cursor);
index = cursor->index;
ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
rec_print_new(stderr, rec, offsets);
#endif /* UNIV_DEBUG */

ut_ad(index->type & DICT_CLUSTERED);
ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));

err = lock_clust_rec_modify_check_and_lock(flags,
rec, index, offsets, thr);

if (err != DB_SUCCESS) {

if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
index, NULL, NULL, 0, rec,
&roll_ptr);
if (err != DB_SUCCESS) {

if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

block = buf_block_align(rec);

if (block->is_hashed) {
rw_lock_x_lock(&btr_search_latch);

rec_set_deleted_flag(rec, rec_offs_comp(offsets), val);

trx = thr_get_trx(thr);

if (!(flags & BTR_KEEP_SYS_FLAG)) {
row_upd_rec_sys_fields(rec, index, offsets, trx, roll_ptr);

if (block->is_hashed) {
rw_lock_x_unlock(&btr_search_latch);

btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
roll_ptr, mtr);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);

/********************************************************************
Writes the redo log record for a delete mark setting of a secondary
index record. */
btr_cur_del_mark_set_sec_rec_log(
/*=============================*/
rec_t* rec, /* in: record */
ibool val, /* in: value to set */
mtr_t* mtr) /* in: mtr */

log_ptr = mlog_open(mtr, 11 + 1 + 2);

/* Logging in mtr is switched off during crash recovery:
in that case mlog_open returns NULL */

log_ptr = mlog_write_initial_log_record_fast(
rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
mach_write_to_1(log_ptr, val);

mach_write_to_2(log_ptr, page_offset(rec));

mlog_close(mtr, log_ptr);

/********************************************************************
Parses the redo log record for delete marking or unmarking of a secondary
index record. */
btr_cur_parse_del_mark_set_sec_rec(
/*===============================*/
/* out: end of log record or NULL */
byte* ptr, /* in: buffer */
byte* end_ptr,/* in: buffer end */
page_t* page) /* in: page or NULL */

if (end_ptr < ptr + 3) {

val = mach_read_from_1(ptr);

offset = mach_read_from_2(ptr);

ut_a(offset <= UNIV_PAGE_SIZE);

rec = page + offset;

/* We do not need to reserve btr_search_latch, as the page
is only being recovered, and there cannot be a hash index to
it. */

rec_set_deleted_flag(rec, page_is_comp(page), val);

/***************************************************************
Sets a secondary index record delete mark to TRUE or FALSE. */
btr_cur_del_mark_set_sec_rec(
/*=========================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
number */
ulint flags, /* in: locking flag */
btr_cur_t* cursor, /* in: cursor */
ibool val, /* in: value to set */
que_thr_t* thr, /* in: query thread */
mtr_t* mtr) /* in: mtr */

rec = btr_cur_get_rec(cursor);

#ifdef UNIV_DEBUG
if (btr_cur_print_record_ops && thr) {
btr_cur_trx_report(thr_get_trx(thr), cursor->index,
"del mark ");
rec_print(stderr, rec, cursor->index);
#endif /* UNIV_DEBUG */

err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index,
thr);
if (err != DB_SUCCESS) {

block = buf_block_align(rec);
ut_ad(!!page_is_comp(buf_block_get_frame(block))
== dict_table_is_comp(cursor->index->table));

if (block->is_hashed) {
rw_lock_x_lock(&btr_search_latch);

rec_set_deleted_flag(rec, page_is_comp(buf_block_get_frame(block)),
val);
if (block->is_hashed) {
rw_lock_x_unlock(&btr_search_latch);

btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
/***************************************************************
Sets a secondary index record delete mark to FALSE. This function is only
used by the insert buffer insert merge mechanism. */

btr_cur_del_unmark_for_ibuf(
/*========================*/
	rec_t*	rec,	/* in: record to delete unmark */
	mtr_t*	mtr)	/* in: mtr */
	/* We do not need to reserve btr_search_latch, as the page has just
	been read to the buffer pool and there cannot be a hash index to it. */

	rec_set_deleted_flag(rec, page_is_comp(buf_frame_align(rec)), FALSE);

	btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
/*==================== B-TREE RECORD REMOVE =========================*/

/*****************************************************************
Tries to compress a page of the tree on the leaf level. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done! */
	btr_cur_t*	cursor,	/* in: cursor on the page to compress;
				cursor does not stay valid */
	mtr_t*		mtr)	/* in: mtr */
	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
	ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_rec(cursor)),
				MTR_MEMO_PAGE_X_FIX));
	ut_ad(btr_page_get_level(btr_cur_get_page(cursor), mtr) == 0);

	btr_compress(cursor, mtr);

/*****************************************************************
Tries to compress a page of the tree if it seems useful. It is assumed
that mtr holds an x-latch on the tree and on the cursor page. To avoid
deadlocks, mtr must also own x-latches to brothers of page, if those
brothers exist. NOTE: it is assumed that the caller has reserved enough
free extents so that the compression will always succeed if done! */

btr_cur_compress_if_useful(
/*=======================*/
				/* out: TRUE if compression occurred */
	btr_cur_t*	cursor,	/* in: cursor on the page to compress;
				cursor does not stay valid if compression
				occurs */
	mtr_t*		mtr)	/* in: mtr */
	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
	ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_rec(cursor)),
				MTR_MEMO_PAGE_X_FIX));

	if (btr_cur_compress_recommendation(cursor, mtr)) {

		btr_compress(cursor, mtr);
/***********************************************************
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree. */

btr_cur_optimistic_delete(
/*======================*/
				/* out: TRUE if success, i.e., the page
				did not become too empty */
	btr_cur_t*	cursor,	/* in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
	mtr_t*		mtr)	/* in: mtr */
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ibool		no_compress_needed;
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(btr_cur_get_rec(cursor)),
				MTR_MEMO_PAGE_X_FIX));
	/* This is intended only for leaf page deletions */

	page = btr_cur_get_page(cursor);

	ut_ad(btr_page_get_level(page, mtr) == 0);

	rec = btr_cur_get_rec(cursor);
	offsets = rec_get_offsets(rec, cursor->index, offsets,
				ULINT_UNDEFINED, &heap);

	no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (no_compress_needed) {

		lock_update_delete(rec);

		btr_search_update_hash_on_delete(cursor);

		max_ins_size = page_get_max_insert_size_after_reorganize(

		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
					cursor->index, offsets, mtr);

		ibuf_update_free_bits_low(cursor->index, page, max_ins_size,

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);

	return(no_compress_needed);
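
/* A minimal sketch of the usual calling pattern for the two delete
routines, kept here for illustration only (the mtr handling and the
surrounding names are schematic, not taken from this file): the cheap
optimistic delete is tried first, and the pessimistic variant, which
may restructure the tree, is used only when the page would become too
empty. */
#if 0
	ulint	err;

	if (!btr_cur_optimistic_delete(cursor, mtr)) {
		/* The caller restarts with an mtr that X-latches the
		whole index tree, then calls the pessimistic variant:
		FALSE = no extents reserved yet, FALSE = not called in
		a rollback */
		btr_cur_pessimistic_delete(&err, FALSE, cursor,
						FALSE, mtr);
		ut_a(err == DB_SUCCESS || err == DB_OUT_OF_FILE_SPACE);
	}
#endif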
/*****************************************************************
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches to brothers of page, if those brothers
exist. */

btr_cur_pessimistic_delete(
/*=======================*/
				/* out: TRUE if compression occurred */
	ulint*		err,	/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /* in: TRUE if the
				caller has already reserved enough free
				extents so that he knows that the operation
				will succeed */
	btr_cur_t*	cursor,	/* in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to the successor of
				the deleted record on function exit */
	ibool		in_rollback,/* in: TRUE if called in rollback */
	mtr_t*		mtr)	/* in: mtr */
	dict_index_t*	index;
	ulint		n_extents	= 0;

	page = btr_cur_get_page(cursor);
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
				MTR_MEMO_PAGE_X_FIX));
	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		n_extents = cursor->tree_height / 32 + 1;

		success = fsp_reserve_free_extents(&n_reserved,

			*err = DB_OUT_OF_FILE_SPACE;

	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	/* Free externally stored fields if the record is neither
	a node pointer nor in two-byte format.
	This avoids an unnecessary loop. */
	if (page_is_comp(page)
	    ? !rec_get_node_ptr_flag(rec)
	    : !rec_get_1byte_offs_flag(rec)) {
		btr_rec_free_externally_stored_fields(index,

	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
	    && UNIV_UNLIKELY(dict_index_get_page(btr_cur_get_index(cursor))
			!= buf_frame_get_page_no(page))) {

		/* If there is only one record, drop the whole page in
		btr_discard_page, if this is not the root page */

		btr_discard_page(cursor, mtr);

		goto return_after_reservations;

	lock_update_delete(rec);
	level = btr_page_get_level(page, mtr);

	    && UNIV_UNLIKELY(rec == page_rec_get_next(
				page_get_infimum_rec(page)))) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (btr_page_get_prev(page, mtr) == FIL_NULL) {

			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			btr_set_min_rec_mark(next_rec, page_is_comp(page),

			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the father node pointer
			so that it is equal to the new leftmost node pointer

			btr_node_ptr_delete(index, page, mtr);

			node_ptr = dict_index_build_node_ptr(
				index, next_rec, buf_frame_get_page_no(page),

			btr_insert_on_non_leaf_level(index,
					level + 1, node_ptr, mtr);

	btr_search_update_hash_on_delete(cursor);

	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);

	ut_ad(btr_check_node_ptr(index, page, mtr));

return_after_reservations:
	mem_heap_free(heap);

	ret = btr_cur_compress_if_useful(cursor, mtr);

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
/***********************************************************************
Adds path information to the cursor for the current page, for which
the binary search has been performed. */

btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/* in: cursor positioned on a page */
	ulint		height,		/* in: height of the page in tree;
					0 means leaf node */
	ulint		root_height)	/* in: root node height in tree */
	ut_a(cursor->path_arr);

	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
		/* Do nothing; return empty path */

		slot = cursor->path_arr;
		slot->nth_rec = ULINT_UNDEFINED;

	/* Mark end of slots for path */
	slot = cursor->path_arr + root_height + 1;
	slot->nth_rec = ULINT_UNDEFINED;

	rec = btr_cur_get_rec(cursor);

	slot = cursor->path_arr + (root_height - height);

	slot->nth_rec = page_rec_get_n_recs_before(rec);
	slot->n_recs = page_get_n_recs(buf_frame_align(rec));
/***********************************************************************
Estimates the number of rows in a given index range. */

btr_estimate_n_rows_in_range(
/*=========================*/
				/* out: estimated number of rows */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	tuple1,	/* in: range start, may also be empty tuple */
	ulint		mode1,	/* in: search mode for range start */
	dtuple_t*	tuple2,	/* in: range end, may also be empty tuple */
	ulint		mode2)	/* in: search mode for range end */
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	ulint		divergence_level;

	cursor.path_arr = path1;

	if (dtuple_get_n_fields(tuple1) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
					BTR_SEARCH_LEAF | BTR_ESTIMATE,

		btr_cur_open_at_index_side(TRUE, index,
					BTR_SEARCH_LEAF | BTR_ESTIMATE,

	cursor.path_arr = path2;

	if (dtuple_get_n_fields(tuple2) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
					BTR_SEARCH_LEAF | BTR_ESTIMATE,

		btr_cur_open_at_index_side(FALSE, index,
					BTR_SEARCH_LEAF | BTR_ESTIMATE,

	/* We have the path information for the range in path1 and path2 */

	diverged = FALSE;	/* This becomes true when the path is not
				the same any more */
	diverged_lot = FALSE;	/* This becomes true when the paths are
				not the same or adjacent any more */
	divergence_level = 1000000; /* This is the level where paths diverged
				a lot */
	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			if (i > divergence_level + 1) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;

			/* Do not estimate the number of rows in the range
			to over 1 / 2 of the estimated rows in the whole
			table */

			if (n_rows > index->table->stat_n_rows / 2) {
				n_rows = index->table->stat_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				n_rows = index->table->stat_n_rows;

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			if (slot1->nth_rec < slot2->nth_rec) {
				n_rows = slot2->nth_rec - slot1->nth_rec;

					diverged_lot = TRUE;
					divergence_level = i;

				/* Maybe the tree has changed between

		} else if (diverged && !diverged_lot) {

			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = TRUE;
				divergence_level = i;

				if (slot1->nth_rec < slot1->n_recs) {

					n_rows += slot1->n_recs

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;

		} else if (diverged_lot) {

			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
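
/* A worked example of the estimation above, added for illustration
(the numbers are invented): if both search paths agree down to the
leaf level and there the first lands on record 3 and the second on
record 7 of the same page (slot1->nth_rec == 3, slot2->nth_rec == 7),
the first-divergence branch estimates n_rows = 7 - 3 = 4. Had the
paths diverged on a higher level, each level below the divergence
scales the running estimate by roughly the average record count of
the two boundary pages, which is what the diverged_lot branch
computes. */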
/***********************************************************************
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals. */

btr_estimate_number_of_different_key_vals(
/*======================================*/
	dict_index_t*	index)	/* in: index */
	ulint		matched_fields;
	ulint		matched_bytes;
	ib_longlong*	n_diff;
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_rec_[REC_OFFS_NORMAL_SIZE];
	ulint		offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets_rec	= offsets_rec_;
	ulint*		offsets_next_rec= offsets_next_rec_;
	*offsets_rec_ = (sizeof offsets_rec_) / sizeof *offsets_rec_;
	*offsets_next_rec_
		= (sizeof offsets_next_rec_) / sizeof *offsets_next_rec_;

	n_cols = dict_index_get_n_unique(index);

	n_diff = mem_alloc((n_cols + 1) * sizeof(ib_longlong));

	memset(n_diff, 0, (n_cols + 1) * sizeof(ib_longlong));

	/* We sample some pages in the index to get an estimate */

	for (i = 0; i < BTR_KEY_VAL_ESTIMATE_N_PAGES; i++) {
		btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);

		/* Count the number of different key values for each prefix of
		the key on this index page. If the prefix does not determine
		the index record uniquely in the B-tree, then we subtract one
		because otherwise our algorithm would give a wrong estimate
		for an index where there is just one key value. */

		page = btr_cur_get_page(&cursor);

		supremum = page_get_supremum_rec(page);
		rec = page_rec_get_next(page_get_infimum_rec(page));

		if (rec != supremum) {
			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
						ULINT_UNDEFINED, &heap);

		while (rec != supremum) {
			rec_t*	next_rec = page_rec_get_next(rec);
			if (next_rec == supremum) {

			offsets_next_rec = rec_get_offsets(next_rec, index,

			cmp_rec_rec_with_match(rec, next_rec,
					offsets_rec, offsets_next_rec,
					index, &matched_fields,

			for (j = matched_fields + 1; j <= n_cols; j++) {
				/* We add one if this index record has
				a different prefix from the previous */

				+= btr_rec_get_externally_stored_len(

			/* Initialize offsets_rec for the next round
			and assign the old offsets_rec buffer to
			offsets_next_rec. */

			ulint*	offsets_tmp = offsets_rec;
			offsets_rec = offsets_next_rec;
			offsets_next_rec = offsets_tmp;

		if (n_cols == dict_index_get_n_unique_in_tree(index)) {

			/* If there is more than one leaf page in the tree,
			we add one because we know that the first record
			on the page certainly had a different prefix than the
			last record on the previous index page in the
			alphabetical order. Before this fix, if there was
			just one big record on each clustered index page, the
			algorithm grossly underestimated the number of rows

			if (btr_page_get_prev(page, &mtr) != FIL_NULL
			    || btr_page_get_next(page, &mtr) != FIL_NULL) {

		offsets_rec = rec_get_offsets(rec, index, offsets_rec,
					ULINT_UNDEFINED, &heap);
		total_external_size += btr_rec_get_externally_stored_len(

	/* If we saw k borders between different key values on
	BTR_KEY_VAL_ESTIMATE_N_PAGES leaf pages, we can estimate how many
	there will be in index->stat_n_leaf_pages */

	/* We must take into account that our sample actually represents
	also the pages used for external storage of fields (those pages are
	included in index->stat_n_leaf_pages) */

	for (j = 0; j <= n_cols; j++) {
		index->stat_n_diff_key_vals[j]
			* (ib_longlong)index->stat_n_leaf_pages
			+ BTR_KEY_VAL_ESTIMATE_N_PAGES - 1
			+ total_external_size
			/ (BTR_KEY_VAL_ESTIMATE_N_PAGES
			   + total_external_size));

		/* If the tree is small, smaller than
		10 * BTR_KEY_VAL_ESTIMATE_N_PAGES + total_external_size, then
		the above estimate is ok. For bigger trees it is common that we
		do not see any borders between key values in the few pages
		we pick. But still there may be BTR_KEY_VAL_ESTIMATE_N_PAGES
		different key values, or even more. Let us try to approximate

		add_on = index->stat_n_leaf_pages
			/ (10 * (BTR_KEY_VAL_ESTIMATE_N_PAGES
				 + total_external_size));

		if (add_on > BTR_KEY_VAL_ESTIMATE_N_PAGES) {
			add_on = BTR_KEY_VAL_ESTIMATE_N_PAGES;

		index->stat_n_diff_key_vals[j] += add_on;

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
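
/* A numeric sketch of the scaling above, for illustration (the
figures are invented): with BTR_KEY_VAL_ESTIMATE_N_PAGES = 8, suppose
the sample saw n_diff[j] = 40 borders between different key values,
the index has stat_n_leaf_pages = 1000, and total_external_size = 0.
The loop then sets stat_n_diff_key_vals[j] to about
(40 * 1000 + 7) / 8, i.e. about 5000 distinct values, and add_on
(here 1000 / 80 = 12, capped at 8) raises that slightly to cover
borders the small sample is likely to have missed. */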
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/

/***************************************************************
Gets the externally stored size of a record, in units of a database page. */

btr_rec_get_externally_stored_len(
/*==============================*/
				/* out: externally stored part,
				in units of a database page */
	rec_t*		rec,	/* in: record */
	const ulint*	offsets)/* in: array returned by rec_get_offsets() */
	ulint	total_extern_len = 0;

	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			data = rec_get_nth_field(rec, offsets, i, &local_len);

			local_len -= BTR_EXTERN_FIELD_REF_SIZE;

			extern_len = mach_read_from_4(data + local_len
						+ BTR_EXTERN_LEN + 4);

			total_extern_len += ut_calc_align(extern_len,
							UNIV_PAGE_SIZE);

	return(total_extern_len / UNIV_PAGE_SIZE);
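
/* Example, for illustration: a single externally stored field with
extern_len = 10000 bytes rounds up, with a 16 kB UNIV_PAGE_SIZE, to
ut_calc_align(10000, UNIV_PAGE_SIZE) = 16384, so the function reports
1 page; a 40000-byte field would report 3 pages. */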
/***********************************************************************
Sets the ownership bit of an externally stored field in a record. */

btr_cur_set_ownership_of_extern_field(
/*==================================*/
	rec_t*		rec,	/* in: clustered index record */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	ulint		i,	/* in: field number */
	ibool		val,	/* in: value to set */
	mtr_t*		mtr)	/* in: mtr */
	data = rec_get_nth_field(rec, offsets, i, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);

	if (val) {
		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
	} else {
		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
	}

	mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
/***********************************************************************
Marks not updated extern fields as not-owned by this record. The ownership
is transferred to the updated record which is inserted elsewhere in the
index tree. In purge only the owner of an externally stored field is
allowed to free the field. */

btr_cur_mark_extern_inherited_fields(
/*=================================*/
	rec_t*		rec,	/* in: record in a clustered index */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	upd_t*		update,	/* in: update vector */
	mtr_t*		mtr)	/* in: mtr */
	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n = rec_offs_n_fields(offsets);

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			/* Check it is not in updated fields */

			for (j = 0; j < upd_get_n_fields(update);

				if (upd_get_nth_field(update, j)

			btr_cur_set_ownership_of_extern_field(
				rec, offsets, i, FALSE, mtr);
/***********************************************************************
The complement of the previous function: in an update, the entry may
inherit some externally stored fields from a record. We must mark them as
inherited in entry, so that they are not freed in a rollback. */

btr_cur_mark_dtuple_inherited_extern(
/*=================================*/
	dtuple_t*	entry,		/* in: updated entry to be inserted to
	ulint*		ext_vec,	/* in: array of extern fields in the
	ulint		n_ext_vec,	/* in: number of elements in ext_vec */
	upd_t*		update)		/* in: update vector */
	if (ext_vec == NULL) {

	for (i = 0; i < n_ext_vec; i++) {

		/* Check ext_vec[i] is in updated fields */
		for (j = 0; j < upd_get_n_fields(update); j++) {
			if (upd_get_nth_field(update, j)->field_no

			dfield = dtuple_get_nth_field(entry, ext_vec[i]);

			data = (byte*) dfield_get_data(dfield);
			len = dfield_get_len(dfield);

			len -= BTR_EXTERN_FIELD_REF_SIZE;

			byte_val = mach_read_from_1(data + len
						+ BTR_EXTERN_LEN);

			byte_val = byte_val | BTR_EXTERN_INHERITED_FLAG;

			mach_write_to_1(data + len + BTR_EXTERN_LEN, byte_val);
/***********************************************************************
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a not delete
marked record always owns all its extern fields. */

btr_cur_unmark_extern_fields(
/*=========================*/
	rec_t*		rec,	/* in: record in a clustered index */
	mtr_t*		mtr,	/* in: mtr */
	const ulint*	offsets)/* in: array returned by rec_get_offsets() */
	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
	n = rec_offs_n_fields(offsets);

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			btr_cur_set_ownership_of_extern_field(rec, offsets, i,

/***********************************************************************
Marks all extern fields in a dtuple as owned by the record. */

btr_cur_unmark_dtuple_extern_fields(
/*================================*/
	dtuple_t*	entry,		/* in: clustered index entry */
	ulint*		ext_vec,	/* in: array of numbers of fields
					which have been stored externally */
	ulint		n_ext_vec)	/* in: number of elements in ext_vec */
	for (i = 0; i < n_ext_vec; i++) {
		dfield = dtuple_get_nth_field(entry, ext_vec[i]);

		data = (byte*) dfield_get_data(dfield);
		len = dfield_get_len(dfield);

		len -= BTR_EXTERN_FIELD_REF_SIZE;

		byte_val = mach_read_from_1(data + len + BTR_EXTERN_LEN);

		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);

		mach_write_to_1(data + len + BTR_EXTERN_LEN, byte_val);
/***********************************************************************
Stores the positions of the fields marked as extern storage in the update
vector, and also those fields that are marked as extern storage in rec
and not mentioned in updated fields. We use this function to remember
which fields we must mark as extern storage in a record inserted for an
update. */

btr_push_update_extern_fields(
/*==========================*/
				/* out: number of values stored in ext_vect */
	ulint*		ext_vect,/* in: array of ulints, must be preallocated
				to have space for all fields in rec */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	upd_t*		update)	/* in: update vector or NULL */
	n = upd_get_n_fields(update);

	for (i = 0; i < n; i++) {
		if (upd_get_nth_field(update, i)->extern_storage) {

			ext_vect[n_pushed] = upd_get_nth_field(
						update, i)->field_no;

	n = rec_offs_n_fields(offsets);

	for (i = 0; i < n; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			/* Check it is not in updated fields */

			for (j = 0; j < upd_get_n_fields(update);

				if (upd_get_nth_field(update, j)

			ext_vect[n_pushed] = i;
/***********************************************************************
Returns the length of a BLOB part stored on the header page. */

btr_blob_get_part_len(
/*==================*/
				/* out: part length */
	byte*	blob_header)	/* in: blob header */
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));

/***********************************************************************
Returns the page number where the next BLOB part is stored. */

btr_blob_get_next_page_no(
/*======================*/
				/* out: page number or FIL_NULL if
				no more pages */
	byte*	blob_header)	/* in: blob header */
	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
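
/* A sketch of a BLOB part page, reconstructed from the
BTR_BLOB_HDR_* offsets and the code below (it is not drawn out in the
original comments):

	FIL_PAGE_DATA + BTR_BLOB_HDR_PART_LEN      4 bytes: length of the
						   BLOB data stored on
						   this page
	FIL_PAGE_DATA + BTR_BLOB_HDR_NEXT_PAGE_NO  4 bytes: page number of
						   the next part, or
						   FIL_NULL on the last one
	FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE          start of the BLOB data

The parts thus form a singly linked list of pages which
btr_copy_externally_stored_field() walks until it reads FIL_NULL. */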
/***********************************************************************
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec. The fields are stored on pages allocated from the leaf node
file segment of the index tree. */

btr_store_big_rec_extern_fields(
/*============================*/
					/* out: DB_SUCCESS or error */
	dict_index_t*	index,		/* in: index of rec; the index tree
					MUST be X-latched */
	rec_t*		rec,		/* in: record */
	const ulint*	offsets,	/* in: rec_get_offsets(rec, index);
					the "external storage" flags in offsets
					will not correspond to rec when
					this function returns */
	big_rec_t*	big_rec_vec,	/* in: vector containing fields
					to be stored externally */
	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
					containing the latch to rec and to the
					tree */
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
	ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec),
				MTR_MEMO_PAGE_X_FIX));
	ut_a(index->type & DICT_CLUSTERED);

	space_id = buf_frame_get_space_id(rec);

	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {

		data = rec_get_nth_field(rec, offsets,
					big_rec_vec->fields[i].field_no,
		ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
		local_len -= BTR_EXTERN_FIELD_REF_SIZE;
		extern_len = big_rec_vec->fields[i].len;

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		while (extern_len > 0) {

			if (prev_page_no == FIL_NULL) {
				hint_page_no = buf_frame_get_page_no(rec) + 1;
				hint_page_no = prev_page_no + 1;

			page = btr_page_alloc(index, hint_page_no,
						FSP_NO_DIR, 0, &mtr);

				return(DB_OUT_OF_FILE_SPACE);

			mlog_write_ulint(page + FIL_PAGE_TYPE,

			page_no = buf_frame_get_page_no(page);

			if (prev_page_no != FIL_NULL) {
				prev_page = buf_page_get(space_id,
#ifdef UNIV_SYNC_DEBUG
				buf_page_dbg_add_level(prev_page,
						SYNC_EXTERN_STORAGE);
#endif /* UNIV_SYNC_DEBUG */

				mlog_write_ulint(prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);

			if (extern_len > (UNIV_PAGE_SIZE - FIL_PAGE_DATA
					- FIL_PAGE_DATA_END)) {
				store_len = UNIV_PAGE_SIZE - FIL_PAGE_DATA
					- FIL_PAGE_DATA_END;
				store_len = extern_len;

			mlog_write_string(page + FIL_PAGE_DATA
					+ BTR_BLOB_HDR_SIZE,
					big_rec_vec->fields[i].data
					+ big_rec_vec->fields[i].len
			mlog_write_ulint(page + FIL_PAGE_DATA
					+ BTR_BLOB_HDR_PART_LEN,
					store_len, MLOG_4BYTES, &mtr);
			mlog_write_ulint(page + FIL_PAGE_DATA
					+ BTR_BLOB_HDR_NEXT_PAGE_NO,
					FIL_NULL, MLOG_4BYTES, &mtr);

			extern_len -= store_len;

			rec_page = buf_page_get(space_id,
						buf_frame_get_page_no(data),
#ifdef UNIV_SYNC_DEBUG
			buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
#endif /* UNIV_SYNC_DEBUG */
			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, 0,
			mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
					big_rec_vec->fields[i].len

			if (prev_page_no == FIL_NULL) {
				mlog_write_ulint(data + local_len
						+ BTR_EXTERN_SPACE_ID,
				mlog_write_ulint(data + local_len
						+ BTR_EXTERN_PAGE_NO,
				mlog_write_ulint(data + local_len
						+ BTR_EXTERN_OFFSET,
				/* Set the bit denoting that this field
				in rec is stored externally */

				rec_set_nth_field_extern_bit(
					big_rec_vec->fields[i].field_no,

			prev_page_no = page_no;
/***********************************************************************
Frees the space in an externally stored field to the file space
management if the field in data owns the externally stored field; in
a rollback we may have the additional condition that the field must
not be inherited. */

btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/* in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
	byte*		data,		/* in: internally stored data
					+ reference to the externally
	ulint		local_len,	/* in: length of data */
	ibool		do_not_free_inherited,/* in: TRUE if called in a
					rollback and we do not want to free
	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
					containing the latch to data and an
					X-latch to the index tree */
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
	ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data),
				MTR_MEMO_PAGE_X_FIX));
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	rec_page = buf_page_get(buf_frame_get_space_id(data),
				buf_frame_get_page_no(data),
#ifdef UNIV_SYNC_DEBUG
	buf_page_dbg_add_level(rec_page, SYNC_NO_ORDER_CHECK);
#endif /* UNIV_SYNC_DEBUG */
	space_id = mach_read_from_4(data + local_len
				+ BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len
				+ BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len
				+ BTR_EXTERN_OFFSET);
	extern_len = mach_read_from_4(data + local_len
				+ BTR_EXTERN_LEN + 4);

	/* If extern len is 0, then there is no external storage data
	at all */

	if (extern_len == 0) {

	if (mach_read_from_1(data + local_len + BTR_EXTERN_LEN)
	    & BTR_EXTERN_OWNER_FLAG) {
		/* This field does not own the externally
		stored field: do not free! */

	if (do_not_free_inherited
	    && mach_read_from_1(data + local_len + BTR_EXTERN_LEN)
	    & BTR_EXTERN_INHERITED_FLAG) {
		/* Rollback and inherited field: do not free! */

	page = buf_page_get(space_id, page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_SYNC_DEBUG
	buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE);
#endif /* UNIV_SYNC_DEBUG */
	next_page_no = mach_read_from_4(page + FIL_PAGE_DATA
					+ BTR_BLOB_HDR_NEXT_PAGE_NO);

	part_len = btr_blob_get_part_len(page + FIL_PAGE_DATA);

	ut_a(extern_len >= part_len);

	/* We must supply the page level (= 0) as an argument
	because we did not store it on the page (we save the space
	overhead from an index page header). */

	btr_page_free_low(index, page, 0, &mtr);

	mlog_write_ulint(data + local_len + BTR_EXTERN_PAGE_NO,

	mlog_write_ulint(data + local_len + BTR_EXTERN_LEN + 4,
			extern_len - part_len,

	if (next_page_no == FIL_NULL) {
		ut_a(extern_len - part_len == 0);

	if (extern_len - part_len == 0) {
		ut_a(next_page_no == FIL_NULL);
/***************************************************************
Frees the externally stored fields for a record. */

btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/* in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/* in: record */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	ibool		do_not_free_inherited,/* in: TRUE if called in a
				rollback and we do not want to free
	mtr_t*		mtr)	/* in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec),
				MTR_MEMO_PAGE_X_FIX));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {

			data = rec_get_nth_field(rec, offsets, i, &len);
			btr_free_externally_stored_field(index, data, len,
						do_not_free_inherited,

/***************************************************************
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */

btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/* in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/* in: record */
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
	upd_t*		update,	/* in: update vector */
	ibool		do_not_free_inherited,/* in: TRUE if called in a
				rollback and we do not want to free
	mtr_t*		mtr)	/* in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
	upd_field_t*	ufield;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains(mtr, buf_block_align(rec),
				MTR_MEMO_PAGE_X_FIX));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {

			data = rec_get_nth_field(rec, offsets,
						ufield->field_no, &len);
			btr_free_externally_stored_field(index, data, len,
						do_not_free_inherited,
/***********************************************************************
Copies an externally stored field of a record to mem heap. Parameter
data contains a pointer to the 'internally' stored part of the field:
possibly some data, and the reference to the externally stored part in
the last 20 bytes of data. */

btr_copy_externally_stored_field(
/*=============================*/
				/* out: the whole field copied to heap */
	ulint*		len,	/* out: length of the whole field */
	byte*		data,	/* in: 'internally' stored part of the
				field containing also the reference to
				the external part */
	ulint		local_len,/* in: length of data */
	mem_heap_t*	heap)	/* in: mem heap */
	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = mem_heap_alloc(heap, local_len + extern_len);

	ut_memcpy(buf, data, local_len);
	copied_len = local_len;

	if (extern_len == 0) {

	page = buf_page_get(space_id, page_no, RW_S_LATCH, &mtr);
#ifdef UNIV_SYNC_DEBUG
	buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE);
#endif /* UNIV_SYNC_DEBUG */
	blob_header = page + offset;

	part_len = btr_blob_get_part_len(blob_header);

	ut_memcpy(buf + copied_len, blob_header + BTR_BLOB_HDR_SIZE,

	copied_len += part_len;

	page_no = btr_blob_get_next_page_no(blob_header);

	if (page_no == FIL_NULL) {
		ut_a(copied_len == local_len + extern_len);

	/* On all BLOB pages except the first, the BLOB header
	always is at the page data start: */

	offset = FIL_PAGE_DATA;

	ut_a(copied_len < local_len + extern_len);
/***********************************************************************
Copies an externally stored field of a record to mem heap. */

btr_rec_copy_externally_stored_field(
/*=================================*/
				/* out: the field copied to heap */
	rec_t*		rec,	/* in: record */
	const ulint*	offsets,/* in: array returned by rec_get_offsets() */
	ulint		no,	/* in: field number */
	ulint*		len,	/* out: length of the field */
	mem_heap_t*	heap)	/* in: mem heap */
	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	return(btr_copy_externally_stored_field(len, data, local_len, heap));
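
/* A sketch of the 20-byte (BTR_EXTERN_FIELD_REF_SIZE) external field
reference that ends the locally stored data, reconstructed from the
BTR_EXTERN_* offsets used in this file (it is not drawn out in the
original comments):

	BTR_EXTERN_SPACE_ID	4 bytes: space id of the first BLOB page
	BTR_EXTERN_PAGE_NO	4 bytes: page number of the first part
	BTR_EXTERN_OFFSET	4 bytes: offset of the BLOB header on
				that page
	BTR_EXTERN_LEN		8 bytes: the first byte carries the
				BTR_EXTERN_OWNER_FLAG and
				BTR_EXTERN_INHERITED_FLAG bits; the low
				4 bytes hold the externally stored
				length (BLOBs are currently limited to
				4 GB, so the remaining bytes stay
				unused) */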