~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to storage/innobase/btr/btr0cur.c

  • Committer: jay
  • Date: 2008-12-23 00:18:10 UTC
  • Revision ID: jay@piggy.tangent.org-20081223001810-026ibij22q2842k1
Had a --regex-replace by accident. Should have been a --replace_column call. Only showed up in make test, not when running a single test, because InnoDB key numbers were different with multiple tests running.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/******************************************************
 
2
The index tree cursor
 
3
 
 
4
All changes that row operations make to a B-tree or the records
 
5
there must go through this module! Undo log records are written here
 
6
of every modify or insert of a clustered index record.
 
7
 
 
8
                        NOTE!!!
 
9
To make sure we do not run out of disk space during a pessimistic
 
10
insert or update, we have to reserve 2 x the height of the index tree
 
11
many pages in the tablespace before we start the operation, because
 
12
if leaf splitting has been started, it is difficult to undo, except
 
13
by crashing the database and doing a roll-forward.
 
14
 
 
15
(c) 1994-2001 Innobase Oy
 
16
 
 
17
Created 10/16/1994 Heikki Tuuri
 
18
*******************************************************/
 
19
 
 
20
#include "btr0cur.h"
 
21
 
 
22
#ifdef UNIV_NONINL
 
23
#include "btr0cur.ic"
 
24
#endif
 
25
 
 
26
#include "page0page.h"
 
27
#include "page0zip.h"
 
28
#include "rem0rec.h"
 
29
#include "rem0cmp.h"
 
30
#include "buf0lru.h"
 
31
#include "btr0btr.h"
 
32
#include "btr0sea.h"
 
33
#include "row0upd.h"
 
34
#include "trx0rec.h"
 
35
#include "trx0roll.h" /* trx_is_recv() */
 
36
#include "que0que.h"
 
37
#include "row0row.h"
 
38
#include "srv0srv.h"
 
39
#include "ibuf0ibuf.h"
 
40
#include "lock0lock.h"
 
41
#include "zlib.h"
 
42
 
 
43
#ifdef UNIV_DEBUG
 
44
/* If the following is set to TRUE, this module prints a lot of
 
45
trace information of individual record operations */
 
46
UNIV_INTERN ibool       btr_cur_print_record_ops = FALSE;
 
47
#endif /* UNIV_DEBUG */
 
48
 
 
49
UNIV_INTERN ulint       btr_cur_n_non_sea       = 0;
 
50
UNIV_INTERN ulint       btr_cur_n_sea           = 0;
 
51
UNIV_INTERN ulint       btr_cur_n_non_sea_old   = 0;
 
52
UNIV_INTERN ulint       btr_cur_n_sea_old       = 0;
 
53
 
 
54
/* In the optimistic insert, if the insert does not fit, but this much space
 
55
can be released by page reorganize, then it is reorganized */
 
56
 
 
57
#define BTR_CUR_PAGE_REORGANIZE_LIMIT   (UNIV_PAGE_SIZE / 32)
 
58
 
 
59
/* The structure of a BLOB part header */
 
60
/*--------------------------------------*/
 
61
#define BTR_BLOB_HDR_PART_LEN           0       /* BLOB part len on this
 
62
                                                page */
 
63
#define BTR_BLOB_HDR_NEXT_PAGE_NO       4       /* next BLOB part page no,
 
64
                                                FIL_NULL if none */
 
65
/*--------------------------------------*/
 
66
#define BTR_BLOB_HDR_SIZE               8
 
67
 
 
68
/* A BLOB field reference full of zero, for use in assertions and tests.
 
69
Initially, BLOB field references are set to zero, in
 
70
dtuple_convert_big_rec(). */
 
71
UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
 
72
 
 
73
/***********************************************************************
 
74
Marks all extern fields in a record as owned by the record. This function
 
75
should be called if the delete mark of a record is removed: a not delete
 
76
marked record always owns all its extern fields. */
 
77
static
 
78
void
 
79
btr_cur_unmark_extern_fields(
 
80
/*=========================*/
 
81
        page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
 
82
                                part will be updated, or NULL */
 
83
        rec_t*          rec,    /* in/out: record in a clustered index */
 
84
        dict_index_t*   index,  /* in: index of the page */
 
85
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
 
86
        mtr_t*          mtr);   /* in: mtr, or NULL if not logged */
 
87
/***********************************************************************
 
88
Adds path information to the cursor for the current page, for which
 
89
the binary search has been performed. */
 
90
static
 
91
void
 
92
btr_cur_add_path_info(
 
93
/*==================*/
 
94
        btr_cur_t*      cursor,         /* in: cursor positioned on a page */
 
95
        ulint           height,         /* in: height of the page in tree;
 
96
                                        0 means leaf node */
 
97
        ulint           root_height);   /* in: root node height in tree */
 
98
/***************************************************************
 
99
Frees the externally stored fields for a record, if the field is mentioned
 
100
in the update vector. */
 
101
static
 
102
void
 
103
btr_rec_free_updated_extern_fields(
 
104
/*===============================*/
 
105
        dict_index_t*   index,  /* in: index of rec; the index tree MUST be
 
106
                                X-latched */
 
107
        rec_t*          rec,    /* in: record */
 
108
        page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
 
109
                                part will be updated, or NULL */
 
110
        const ulint*    offsets,/* in: rec_get_offsets(rec, index) */
 
111
        const upd_t*    update, /* in: update vector */
 
112
        enum trx_rb_ctx rb_ctx, /* in: rollback context */
 
113
        mtr_t*          mtr);   /* in: mini-transaction handle which contains
 
114
                                an X-latch to record page and to the tree */
 
115
/***************************************************************
 
116
Frees the externally stored fields for a record. */
 
117
static
 
118
void
 
119
btr_rec_free_externally_stored_fields(
 
120
/*==================================*/
 
121
        dict_index_t*   index,  /* in: index of the data, the index
 
122
                                tree MUST be X-latched */
 
123
        rec_t*          rec,    /* in: record */
 
124
        const ulint*    offsets,/* in: rec_get_offsets(rec, index) */
 
125
        page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
 
126
                                part will be updated, or NULL */
 
127
        enum trx_rb_ctx rb_ctx, /* in: rollback context */
 
128
        mtr_t*          mtr);   /* in: mini-transaction handle which contains
 
129
                                an X-latch to record page and to the index
 
130
                                tree */
 
131
/***************************************************************
 
132
Gets the externally stored size of a record, in units of a database page. */
 
133
static
 
134
ulint
 
135
btr_rec_get_externally_stored_len(
 
136
/*==============================*/
 
137
                                /* out: externally stored part,
 
138
                                in units of a database page */
 
139
        rec_t*          rec,    /* in: record */
 
140
        const ulint*    offsets);/* in: array returned by rec_get_offsets() */
 
141
 
 
142
/**********************************************************
The following function is used to set the deleted bit of a record.
Dispatches on the record format: new-style (compact) records carry the
flag in a different place than old-style records, and only compact
pages can be compressed.
NOTE(review): an earlier header comment claimed "out: TRUE on success;
FALSE on page_zip overflow", but this function returns void; that stale
return-value comment has been dropped. */
UNIV_INLINE
void
btr_rec_set_deleted_flag(
/*=====================*/
	rec_t*		rec,	/* in/out: physical record */
	page_zip_des_t*	page_zip,/* in/out: compressed page (or NULL) */
	ulint		flag)	/* in: nonzero if delete marked */
{
	if (page_rec_is_comp(rec)) {
		rec_set_deleted_flag_new(rec, page_zip, flag);
	} else {
		/* Old-style records never live on compressed pages. */
		ut_ad(!page_zip);
		rec_set_deleted_flag_old(rec, flag);
	}
}
 
161
 
 
162
/*==================== B-TREE SEARCH =========================*/
 
163
 
 
164
/************************************************************************
Latches the leaf page or pages requested. Depending on latch_mode this
latches just the leaf the search converged on (BTR_SEARCH_LEAF /
BTR_MODIFY_LEAF), the leaf plus both siblings (BTR_MODIFY_TREE), or the
leaf plus its left sibling (BTR_SEARCH_PREV / BTR_MODIFY_PREV).
Siblings are always acquired in left-to-right page order — presumably to
keep a consistent latch acquisition order and avoid deadlocks; confirm
against the B-tree latching rules in btr0btr. */
static
void
btr_cur_latch_leaves(
/*=================*/
	page_t*		page,		/* in: leaf page where the search
					converged */
	ulint		space,		/* in: space id */
	ulint		zip_size,	/* in: compressed page size in bytes
					or 0 for uncompressed pages */
	ulint		page_no,	/* in: page number of the leaf */
	ulint		latch_mode,	/* in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/* in: cursor */
	mtr_t*		mtr)		/* in: mtr */
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;

	ut_ad(page && mtr);

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		/* Only the leaf itself: s-latch for search, x-latch for
		modify. */
		mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	case BTR_MODIFY_TREE:
		/* x-latch also brothers from left to right */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(space, zip_size,
						  left_page_no,
						  RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
			/* Sanity: the left brother must agree on format and
			link back to this page. */
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(space, zip_size, page_no,
					  RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			get_block = btr_block_get(space, zip_size,
						  right_page_no,
						  RW_X_LATCH, mtr);
#ifdef UNIV_BTR_DEBUG
			/* Sanity: the right brother must link back to this
			page. */
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		return;

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left brother */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {
			get_block = btr_block_get(space, zip_size,
						  left_page_no, mode, mtr);
			/* The left neighbor is remembered in the cursor for
			the caller (see btr_cur_search_to_nth_level docs on
			cursor->left_block). */
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			get_block->check_index_page_at_flush = TRUE;
		}

		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		get_block->check_index_page_at_flush = TRUE;
		return;
	}

	/* Unknown latch_mode: must not happen. */
	ut_error;
}
 
267
 
 
268
/************************************************************************
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, cursor is left at the place where an insert of the
search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
UNIV_INTERN
void
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/* in: index */
	ulint		level,	/* in: the tree level of search */
	const dtuple_t*	tuple,	/* in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	ulint		mode,	/* in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with
				BTR_INSERT and BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we maybe do not have a latch set
				on the cursor page, we assume
				the caller uses his search latch
				to protect the record! */
	btr_cur_t*	cursor, /* in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,/* in: info on the latch mode the
				caller currently has on btr_search_latch:
				RW_S_LATCH, or 0 */
	mtr_t*		mtr)	/* in: mtr */
{
	page_cur_t*	page_cursor;
	page_t*		page;
	buf_block_t*	guess;
	rec_t*		node_ptr;
	ulint		page_no;
	ulint		space;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		height;
	ulint		savepoint;
	ulint		rw_latch;
	ulint		page_mode;
	ulint		insert_planned;
	ulint		buf_mode;
	ulint		estimate;
	ulint		ignore_sec_unique;
	ulint		root_height = 0; /* remove warning */
#ifdef BTR_CUR_ADAPT
	btr_search_t*	info;
#endif
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending to upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE);
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
	ut_ad(dtuple_check_typed(tuple));

#ifdef UNIV_DEBUG
	/* Poison the match fields so the assertions at the end can detect
	a forgotten update. */
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif
	/* Split latch_mode into its flag bits and the plain latch mode. */
	insert_planned = latch_mode & BTR_INSERT;
	estimate = latch_mode & BTR_ESTIMATE;
	ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
	latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
				    | BTR_IGNORE_SEC_UNIQUE);

	ut_ad(!insert_planned || (mode == PAGE_CUR_LE));

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

#ifndef BTR_CUR_ADAPT
	guess = NULL;
#else
	info = btr_search_get_info(index);

	guess = info->root_guess;

#ifdef BTR_CUR_HASH_ADAPT

#ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
#endif
	/* First try the adaptive hash index; on success the cursor is
	positioned without descending the tree at all. */
	if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
	    && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
	    && !estimate
#ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
#endif /* PAGE_CUR_LE_OR_EXTENDS */
	    && !UNIV_UNLIKELY(btr_search_disabled)
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		return;
	}
#endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(&btr_search_latch);
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);

	} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
		/* Do nothing: the caller must already hold the index
		x-lock in this mtr. */
		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK));
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Start the descent from the index root page. */
	space = dict_index_get_space(index);
	page_no = dict_index_get_page(index);

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;
	rw_latch = RW_NO_LATCH;
	buf_mode = BUF_GET;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}

	/* Loop and search until we arrive at the desired level */

	for (;;) {
		ulint		zip_size;
		buf_block_t*	block;
retry_page_get:
		zip_size = dict_table_zip_size(index->table);

		block = buf_page_get_gen(space, zip_size, page_no,
					 rw_latch, guess, buf_mode,
					 __FILE__, __LINE__, mtr);
		if (block == NULL) {
			/* This must be a search to perform an insert;
			try insert to the insert buffer */

			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(insert_planned);
			ut_ad(cursor->thr);

			if (ibuf_should_try(index, ignore_sec_unique)
			    && ibuf_insert(tuple, index, space, zip_size,
					   page_no, cursor->thr)) {
				/* Insertion to the insert buffer succeeded */
				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
				if (UNIV_LIKELY_NULL(heap)) {
					mem_heap_free(heap);
				}
				goto func_exit;
			}

			/* Insert to the insert buffer did not succeed:
			retry page get */

			buf_mode = BUF_GET;

			goto retry_page_get;
		}

		page = buf_block_get_frame(block);
#ifdef UNIV_ZIP_DEBUG
		if (rw_latch != RW_NO_LATCH) {
			const page_zip_des_t*	page_zip
				= buf_block_get_page_zip(block);
			ut_a(!page_zip || page_zip_validate(page_zip, page));
		}
#endif /* UNIV_ZIP_DEBUG */

		block->check_index_page_at_flush = TRUE;

		if (rw_latch != RW_NO_LATCH) {
			buf_block_dbg_add_level(block, SYNC_TREE_NODE);
		}

		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			cursor->tree_height = root_height + 1;
#ifdef BTR_CUR_ADAPT
			/* Cache the root block for the next search. */
			if (block != guess) {
				info->root_guess = block;
			}
#endif
		}

		if (height == 0) {
			/* We have reached the leaf level: latch the leaf
			page(s) as requested, and switch back to the
			caller's original search mode. */
			if (rw_latch == RW_NO_LATCH) {

				btr_cur_latch_leaves(page, space, zip_size,
						     page_no, latch_mode,
						     cursor, mtr);
			}

			if ((latch_mode != BTR_MODIFY_TREE)
			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {

				/* Release the tree s-latch */

				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}

			page_mode = mode;
		}

		page_cur_search_with_match(block, index, tuple, page_mode,
					   &up_match, &up_bytes,
					   &low_match, &low_bytes,
					   page_cursor);

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		/* If this is the desired level, leave the loop */

		ut_ad(height == btr_page_get_level(
			      page_cur_get_page(page_cursor), mtr));

		if (level == height) {

			if (level > 0) {
				/* x-latch the page */
				page = btr_page_get(space, zip_size,
						    page_no, RW_X_LATCH, mtr);
				ut_a((ibool)!!page_is_comp(page)
				     == dict_table_is_comp(index->table));
			}

			break;
		}

		ut_ad(height > 0);

		height--;

		if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) {

			/* The next page fetched is the leaf: fetch it with
			the requested leaf latch mode instead of no latch. */
			rw_latch = latch_mode;

			if (insert_planned
			    && ibuf_should_try(index, ignore_sec_unique)) {

				/* Try insert to the insert buffer if the
				page is not in the buffer pool */

				buf_mode = BUF_GET_IF_IN_POOL;
			}
		}

		/* The root guess only applies to the first fetch. */
		guess = NULL;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (level == 0) {
		cursor->low_match = low_match;
		cursor->low_bytes = low_bytes;
		cursor->up_match = up_match;
		cursor->up_bytes = up_bytes;

#ifdef BTR_CUR_ADAPT
		if (!UNIV_UNLIKELY(btr_search_disabled)) {

			btr_search_info_update(index, cursor);
		}
#endif
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
	}

func_exit:
	if (has_search_latch) {

		/* Re-acquire the search latch released above, restoring the
		caller's latch state. */
		rw_lock_s_lock(&btr_search_latch);
	}
}
 
631
 
 
632
/*********************************************************************
Opens a cursor at either end of an index.
Descends from the root to the leftmost (from_left == TRUE) or rightmost
leaf record, latching pages according to latch_mode.  If BTR_ESTIMATE is
OR-ed into latch_mode, path information is recorded at each level via
btr_cur_add_path_info(). */
UNIV_INTERN
void
btr_cur_open_at_index_side(
/*=======================*/
	ibool		from_left,	/* in: TRUE if open to the low end,
					FALSE if to the high end */
	dict_index_t*	index,		/* in: index */
	ulint		latch_mode,	/* in: latch mode */
	btr_cur_t*	cursor,		/* in: cursor */
	mtr_t*		mtr)		/* in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	ulint		root_height = 0; /* remove warning */
	rec_t*		node_ptr;
	ulint		estimate;
	ulint		savepoint;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	/* Split the BTR_ESTIMATE flag off from the latch mode proper. */
	estimate = latch_mode & BTR_ESTIMATE;
	latch_mode = latch_mode & ~BTR_ESTIMATE;

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched the leaf node */

	savepoint = mtr_set_savepoint(mtr);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	/* ULINT_UNDEFINED marks the first iteration (root page); the real
	height is read from the root once it is latched. */
	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 __FILE__, __LINE__, mtr);
		page = buf_block_get_frame(block);
		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		block->check_index_page_at_flush = TRUE;

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
		}

		if (height == 0) {
			btr_cur_latch_leaves(page, space, zip_size, page_no,
					     latch_mode, cursor, mtr);

			/* In versions <= 3.23.52 we had forgotten to
			release the tree latch here. If in an index scan
			we had to scan far to find a record visible to the
			current transaction, that could starve others
			waiting for the tree latch. */

			if ((latch_mode != BTR_MODIFY_TREE)
			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {

				/* Release the tree s-latch */

				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}
		}

		if (from_left) {
			page_cur_set_before_first(block, page_cursor);
		} else {
			page_cur_set_after_last(block, page_cursor);
		}

		if (height == 0) {
			if (estimate) {
				btr_cur_add_path_info(cursor, height,
						      root_height);
			}

			break;
		}

		ut_ad(height > 0);

		/* Step onto the first (or last) node pointer record so
		that we can read the child page number from it. */
		if (from_left) {
			page_cur_move_to_next(page_cursor);
		} else {
			page_cur_move_to_prev(page_cursor);
		}

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	/* rec_get_offsets() may have allocated a heap for long records. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
 
762
 
 
763
/**************************************************************************
Positions a cursor at a randomly chosen position within a B-tree.
Descends from the root, picking a random user record at each level with
page_cur_open_on_rnd_user_rec(), until a leaf is reached and latched
according to latch_mode. */
UNIV_INTERN
void
btr_cur_open_at_rnd_pos(
/*====================*/
	dict_index_t*	index,		/* in: index */
	ulint		latch_mode,	/* in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/* in/out: B-tree cursor */
	mtr_t*		mtr)		/* in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		height;
	rec_t*		node_ptr;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	if (latch_mode == BTR_MODIFY_TREE) {
		mtr_x_lock(dict_index_get_lock(index), mtr);
	} else {
		mtr_s_lock(dict_index_get_lock(index), mtr);
	}

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	page_no = dict_index_get_page(index);

	/* ULINT_UNDEFINED marks the first iteration (root page). */
	height = ULINT_UNDEFINED;

	for (;;) {
		buf_block_t*	block;
		page_t*		page;

		block = buf_page_get_gen(space, zip_size, page_no,
					 RW_NO_LATCH, NULL, BUF_GET,
					 __FILE__, __LINE__, mtr);
		page = buf_block_get_frame(block);
		ut_ad(0 == ut_dulint_cmp(index->id,
					 btr_page_get_index_id(page)));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
		}

		if (height == 0) {
			btr_cur_latch_leaves(page, space, zip_size, page_no,
					     latch_mode, cursor, mtr);
		}

		/* Pick a random user record on this page; on non-leaf
		levels that record is the node pointer to follow. */
		page_cur_open_on_rnd_user_rec(block, page_cursor);

		if (height == 0) {

			break;
		}

		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);
		/* Go to the child node */
		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
	}

	/* rec_get_offsets() may have allocated a heap for long records. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
 
844
 
 
845
/*==================== B-TREE INSERT =========================*/
 
846
 
 
847
/*****************************************************************
 
848
Inserts a record if there is enough space, or if enough space can
 
849
be freed by reorganizing. Differs from btr_cur_optimistic_insert because
 
850
no heuristics is applied to whether it pays to use CPU time for
 
851
reorganizing the page or not. */
 
852
static
 
853
rec_t*
 
854
btr_cur_insert_if_possible(
 
855
/*=======================*/
 
856
                                /* out: pointer to inserted record if succeed,
 
857
                                else NULL */
 
858
        btr_cur_t*      cursor, /* in: cursor on page after which to insert;
 
859
                                cursor stays valid */
 
860
        const dtuple_t* tuple,  /* in: tuple to insert; the size info need not
 
861
                                have been stored to tuple */
 
862
        ulint           n_ext,  /* in: number of externally stored columns */
 
863
        mtr_t*          mtr)    /* in: mtr */
 
864
{
 
865
        page_cur_t*     page_cursor;
 
866
        buf_block_t*    block;
 
867
        rec_t*          rec;
 
868
 
 
869
        ut_ad(dtuple_check_typed(tuple));
 
870
 
 
871
        block = btr_cur_get_block(cursor);
 
872
 
 
873
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
874
        page_cursor = btr_cur_get_page_cur(cursor);
 
875
 
 
876
        /* Now, try the insert */
 
877
        rec = page_cur_tuple_insert(page_cursor, tuple,
 
878
                                    cursor->index, n_ext, mtr);
 
879
 
 
880
        if (UNIV_UNLIKELY(!rec)) {
 
881
                /* If record did not fit, reorganize */
 
882
 
 
883
                if (btr_page_reorganize(block, cursor->index, mtr)) {
 
884
 
 
885
                        page_cur_search(block, cursor->index, tuple,
 
886
                                        PAGE_CUR_LE, page_cursor);
 
887
 
 
888
                        rec = page_cur_tuple_insert(page_cursor, tuple,
 
889
                                                    cursor->index, n_ext, mtr);
 
890
                }
 
891
        }
 
892
 
 
893
        return(rec);
 
894
}
 
895
 
 
896
/*****************************************************************
 
897
For an insert, checks the locks and does the undo logging if desired. */
 
898
UNIV_INLINE
 
899
ulint
 
900
btr_cur_ins_lock_and_undo(
 
901
/*======================*/
 
902
                                /* out: DB_SUCCESS, DB_WAIT_LOCK,
 
903
                                DB_FAIL, or error number */
 
904
        ulint           flags,  /* in: undo logging and locking flags: if
 
905
                                not zero, the parameters index and thr
 
906
                                should be specified */
 
907
        btr_cur_t*      cursor, /* in: cursor on page after which to insert */
 
908
        const dtuple_t* entry,  /* in: entry to insert */
 
909
        que_thr_t*      thr,    /* in: query thread or NULL */
 
910
        ibool*          inherit)/* out: TRUE if the inserted new record maybe
 
911
                                should inherit LOCK_GAP type locks from the
 
912
                                successor record */
 
913
{
 
914
        dict_index_t*   index;
 
915
        ulint           err;
 
916
        rec_t*          rec;
 
917
        dulint          roll_ptr;
 
918
 
 
919
        /* Check if we have to wait for a lock: enqueue an explicit lock
 
920
        request if yes */
 
921
 
 
922
        rec = btr_cur_get_rec(cursor);
 
923
        index = cursor->index;
 
924
 
 
925
        err = lock_rec_insert_check_and_lock(flags, rec,
 
926
                                             btr_cur_get_block(cursor),
 
927
                                             index, thr, inherit);
 
928
 
 
929
        if (err != DB_SUCCESS) {
 
930
 
 
931
                return(err);
 
932
        }
 
933
 
 
934
        if (dict_index_is_clust(index) && !dict_index_is_ibuf(index)) {
 
935
 
 
936
                err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
 
937
                                                    thr, index, entry,
 
938
                                                    NULL, 0, NULL,
 
939
                                                    &roll_ptr);
 
940
                if (err != DB_SUCCESS) {
 
941
 
 
942
                        return(err);
 
943
                }
 
944
 
 
945
                /* Now we can fill in the roll ptr field in entry */
 
946
 
 
947
                if (!(flags & BTR_KEEP_SYS_FLAG)) {
 
948
 
 
949
                        row_upd_index_entry_sys_field(entry, index,
 
950
                                                      DATA_ROLL_PTR, roll_ptr);
 
951
                }
 
952
        }
 
953
 
 
954
        return(DB_SUCCESS);
 
955
}
 
956
 
 
957
#ifdef UNIV_DEBUG
 
958
/*****************************************************************
 
959
Report information about a transaction. */
 
960
static
 
961
void
 
962
btr_cur_trx_report(
 
963
/*===============*/
 
964
        trx_t*                  trx,    /* in: transaction */
 
965
        const dict_index_t*     index,  /* in: index */
 
966
        const char*             op)     /* in: operation */
 
967
{
 
968
        fprintf(stderr, "Trx with id " TRX_ID_FMT " going to ",
 
969
                TRX_ID_PREP_PRINTF(trx->id));
 
970
        fputs(op, stderr);
 
971
        dict_index_name_print(stderr, trx, index);
 
972
        putc('\n', stderr);
 
973
}
 
974
#endif /* UNIV_DEBUG */
 
975
 
 
976
/*****************************************************************
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record. */
UNIV_INTERN
ulint
btr_cur_optimistic_insert(
/*======================*/
				/* out: DB_SUCCESS, DB_WAIT_LOCK,
				DB_FAIL, or error number */
	ulint		flags,	/* in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/* in: cursor on page after which to insert;
				cursor stays valid */
	dtuple_t*	entry,	/* in/out: entry to insert */
	rec_t**		rec,	/* out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/* out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/* in: number of externally stored columns */
	que_thr_t*	thr,	/* in: query thread or NULL */
	mtr_t*		mtr)	/* in: mtr; if this function returns
				DB_SUCCESS on a leaf page of a secondary
				index in a compressed tablespace, the
				mtr must be committed before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	ulint		max_size;
	rec_t*		dummy_rec;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit;
	ulint		zip_size;
	ulint		rec_size;
	mem_heap_t*	heap		= NULL;
	ulint		err;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;
	zip_size = buf_block_get_zip_size(block);
#ifdef UNIV_DEBUG_VALGRIND
	if (zip_size) {
		UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	}
#endif /* UNIV_DEBUG_VALGRIND */

	if (!dtuple_check_typed_no_assert(entry)) {
		fputs("InnoDB: Error in a tuple to insert into ", stderr);
		dict_index_name_print(stderr, thr_get_trx(thr), index);
	}
#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
		dtuple_print(stderr, entry);
	}
#endif /* UNIV_DEBUG */

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	max_size = page_get_max_insert_size_after_reorganize(page, 1);
	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), zip_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		/* entry was shortened by moving fields out-of-page;
		recompute the in-page record size. */
		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (UNIV_UNLIKELY(zip_size)) {
		/* Estimate the free space of an empty compressed page.
		Subtract one byte for the encoded heap_no in the
		modification log. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields, zip_size) - 1;
		ulint	n_uniq = dict_index_get_n_unique_in_tree(index);

		ut_ad(dict_table_is_comp(index->table));

		/* There should be enough room for two node pointer
		records on an empty non-leaf page.  This prevents
		infinite page splits. */

		if (UNIV_LIKELY(entry->n_fields >= n_uniq)
		    && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
				     + rec_get_converted_size_comp_prefix(
					     index, entry->fields, n_uniq,
					     NULL)
				     /* On a compressed page, there is
				     a two-byte entry in the dense
				     page directory for every record.
				     But there is no record header. */
				     - (REC_N_NEW_EXTRA_BYTES - 2)
				     > free_space_zip / 2)) {

			/* Undo the big-rec conversion before failing,
			so that entry is restored for the caller. */
			if (big_rec_vec) {
				dtuple_convert_back_big_rec(
					index, entry, big_rec_vec);
			}

			if (heap) {
				mem_heap_free(heap);
			}

			return(DB_TOO_BIG_RECORD);
		}
	}

	/* If there have been many consecutive inserts, and we are on the leaf
	level, check if we have to split the page to reserve enough free space
	for future updates of records. */

	if (dict_index_is_clust(index)
	    && (page_get_n_recs(page) >= 2)
	    && UNIV_LIKELY(leaf)
	    && (dict_index_get_space_reserve() + rec_size > max_size)
	    && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
		|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
		/* Common failure exit: restore entry from the big-rec
		conversion and release the offsets heap, if any. */
fail:
		err = DB_FAIL;
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		if (UNIV_LIKELY_NULL(heap)) {
			mem_heap_free(heap);
		}

		return(err);
	}

	if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
	     || max_size < rec_size)
	    && UNIV_LIKELY(page_get_n_recs(page) > 1)
	    && page_get_max_insert_size(page, 1) < rec_size) {

		goto fail;
	}

	/* Check locks and write to the undo log, if specified */
	err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &inherit);

	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {

		goto fail_err;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Now, try the insert */

	{
		/* Detect whether page_cur_tuple_insert() reorganized the
		page (it may, for compressed pages) by checking whether
		the cursor record pointer moved. */
		const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);
		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);

		if (UNIV_UNLIKELY(reorg)) {
			ut_a(zip_size);
			ut_a(*rec);
		}
	}

	if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
		/* If the record did not fit, reorganize */
		if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
			ut_a(zip_size);

			goto fail;
		}

		ut_ad(zip_size
		      || page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			if (UNIV_LIKELY(zip_size != 0)) {

				goto fail;
			}

			/* On an uncompressed page the insert must
			succeed after a reorganize that promised
			max_size bytes: anything else is corruption. */
			fputs("InnoDB: Error: cannot insert tuple ", stderr);
			dtuple_print(stderr, entry);
			fputs(" into ", stderr);
			dict_index_name_print(stderr, thr_get_trx(thr), index);
			fprintf(stderr, "\nInnoDB: max insert size %lu\n",
				(ulong) max_size);
			ut_error;
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

#ifdef BTR_CUR_HASH_ADAPT
	if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
		btr_search_update_hash_node_on_insert(cursor);
	} else {
		btr_search_update_hash_on_insert(cursor);
	}
#endif

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

#if 0
	fprintf(stderr, "Insert into page %lu, max ins size %lu,"
		" rec %lu ind type %lu\n",
		buf_block_get_page_no(block), max_size,
		rec_size + PAGE_DIR_SLOT_SIZE, index->type);
#endif
	if (!dict_index_is_clust(index) && leaf) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
 
1251
 
 
1252
/*****************************************************************
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist. */
UNIV_INTERN
ulint
btr_cur_pessimistic_insert(
/*=======================*/
				/* out: DB_SUCCESS or error number */
	ulint		flags,	/* in: undo logging and locking flags: if not
				zero, the parameter thr should be
				specified; if no undo logging is specified,
				then the caller must have reserved enough
				free extents in the file space so that the
				insertion will certainly succeed */
	btr_cur_t*	cursor,	/* in: cursor after which to insert;
				cursor stays valid */
	dtuple_t*	entry,	/* in/out: entry to insert */
	rec_t**		rec,	/* out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/* out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/* in: number of externally stored columns */
	que_thr_t*	thr,	/* in: query thread or NULL */
	mtr_t*		mtr)	/* in: mtr */
{
	dict_index_t*	index		= cursor->index;
	ulint		zip_size	= dict_table_zip_size(index->table);
	big_rec_t*	big_rec_vec	= NULL;
	mem_heap_t*	heap		= NULL;
	ulint		err;
	ibool		dummy_inh;
	ibool		success;
	ulint		n_extents	= 0;
	ulint		n_reserved;

	ut_ad(dtuple_check_typed(entry));

	*big_rec = NULL;

	ut_ad(mtr_memo_contains(mtr,
				dict_index_get_lock(btr_cur_get_index(cursor)),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));

	/* Try first an optimistic insert; reset the cursor flag: we do not
	assume anything of how it was positioned */

	cursor->flag = BTR_CUR_BINARY;

	err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
					big_rec, n_ext, thr, mtr);
	if (err != DB_FAIL) {

		/* Either success, or an error other than "page full";
		both are final for this call. */
		return(err);
	}

	/* Retry with a pessimistic insert. Check locks and write to undo log,
	if specified */

	err = btr_cur_ins_lock_and_undo(flags, cursor, entry, thr, &dummy_inh);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the insert will not fail because
		of lack of space */

		n_extents = cursor->tree_height / 16 + 3;

		success = fsp_reserve_free_extents(&n_reserved, index->space,
						   n_extents, FSP_NORMAL, mtr);
		if (!success) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   dict_table_is_comp(index->table),
				   dict_index_get_n_fields(index),
				   zip_size)) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);

		if (big_rec_vec == NULL) {

			/* Release the reserved extents before failing. */
			if (n_extents > 0) {
				fil_space_release_free_extents(index->space,
							       n_reserved);
			}
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (dict_index_get_page(index)
	    == buf_block_get_page_no(btr_cur_get_block(cursor))) {

		/* The page is the root page */
		*rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
	} else {
		*rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);

#ifdef BTR_CUR_ADAPT
	btr_search_update_hash_on_insert(cursor);
#endif
	if (!(flags & BTR_NO_LOCKING_FLAG)) {

		lock_update_insert(btr_cur_get_block(cursor), *rec);
	}

	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	/* The caller must store big_rec_vec fields externally and
	free the vector; see the big_rec parameter comment. */
	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
 
1393
 
 
1394
/*==================== B-TREE UPDATE =========================*/
 
1395
 
 
1396
/*****************************************************************
 
1397
For an update, checks the locks and does the undo logging. */
 
1398
UNIV_INLINE
 
1399
ulint
 
1400
btr_cur_upd_lock_and_undo(
 
1401
/*======================*/
 
1402
                                /* out: DB_SUCCESS, DB_WAIT_LOCK, or error
 
1403
                                number */
 
1404
        ulint           flags,  /* in: undo logging and locking flags */
 
1405
        btr_cur_t*      cursor, /* in: cursor on record to update */
 
1406
        const upd_t*    update, /* in: update vector */
 
1407
        ulint           cmpl_info,/* in: compiler info on secondary index
 
1408
                                updates */
 
1409
        que_thr_t*      thr,    /* in: query thread */
 
1410
        dulint*         roll_ptr)/* out: roll pointer */
 
1411
{
 
1412
        dict_index_t*   index;
 
1413
        rec_t*          rec;
 
1414
        ulint           err;
 
1415
 
 
1416
        ut_ad(cursor && update && thr && roll_ptr);
 
1417
 
 
1418
        rec = btr_cur_get_rec(cursor);
 
1419
        index = cursor->index;
 
1420
 
 
1421
        if (!dict_index_is_clust(index)) {
 
1422
                /* We do undo logging only when we update a clustered index
 
1423
                record */
 
1424
                return(lock_sec_rec_modify_check_and_lock(
 
1425
                               flags, btr_cur_get_block(cursor), rec,
 
1426
                               index, thr));
 
1427
        }
 
1428
 
 
1429
        /* Check if we have to wait for a lock: enqueue an explicit lock
 
1430
        request if yes */
 
1431
 
 
1432
        err = DB_SUCCESS;
 
1433
 
 
1434
        if (!(flags & BTR_NO_LOCKING_FLAG)) {
 
1435
                mem_heap_t*     heap            = NULL;
 
1436
                ulint           offsets_[REC_OFFS_NORMAL_SIZE];
 
1437
                rec_offs_init(offsets_);
 
1438
 
 
1439
                err = lock_clust_rec_modify_check_and_lock(
 
1440
                        flags, btr_cur_get_block(cursor), rec, index,
 
1441
                        rec_get_offsets(rec, index, offsets_,
 
1442
                                        ULINT_UNDEFINED, &heap), thr);
 
1443
                if (UNIV_LIKELY_NULL(heap)) {
 
1444
                        mem_heap_free(heap);
 
1445
                }
 
1446
                if (err != DB_SUCCESS) {
 
1447
 
 
1448
                        return(err);
 
1449
                }
 
1450
        }
 
1451
 
 
1452
        /* Append the info about the update in the undo log */
 
1453
 
 
1454
        err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
 
1455
                                            index, NULL, update,
 
1456
                                            cmpl_info, rec, roll_ptr);
 
1457
        return(err);
 
1458
}
 
1459
 
 
1460
/***************************************************************
Writes a redo log record of updating a record in-place.
The record layout written here is:
[flags (1 byte)] [system column values] [page offset of rec (2 bytes)]
[update vector]; it must be kept in sync with the parser
btr_cur_parse_update_in_place(). */
UNIV_INLINE
void
btr_cur_update_in_place_log(
/*========================*/
	ulint		flags,		/* in: flags */
	rec_t*		rec,		/* in: record */
	dict_index_t*	index,		/* in: index where cursor positioned */
	const upd_t*	update,		/* in: update vector */
	trx_t*		trx,		/* in: transaction */
	dulint		roll_ptr,	/* in: roll ptr */
	mtr_t*		mtr)		/* in: mtr */
{
	byte*	log_ptr;
	page_t*	page	= page_align(rec);
	ut_ad(flags < 256);
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

	/* Open the mini-transaction log with enough reserved space for
	the fixed-size fields written below plus the buffer margin */
	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
					    : MLOG_REC_UPDATE_IN_PLACE,
					    1 + DATA_ROLL_PTR_LEN + 14 + 2
					    + MLOG_BUF_MARGIN);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* The code below assumes index is a clustered index: change index to
	the clustered index if we are updating a secondary index record (or we
	could as well skip writing the sys col values to the log in this case
	because they are not needed for a secondary index record update) */

	index = dict_table_get_first_index(index->table);

	/* 1 byte: undo logging and locking flags */
	mach_write_to_1(log_ptr, flags);
	log_ptr++;

	/* System column values (trx id, roll ptr) */
	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	/* 2 bytes: byte offset of the record within its page */
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	/* Finally, the update vector itself; this also closes the log */
	row_upd_index_write_log(update, log_ptr, mtr);
}
 
1507
 
 
1508
/***************************************************************
Parses a redo log record of updating a record in-place.
Mirrors the format written by btr_cur_update_in_place_log():
[flags (1)] [sys vals] [rec offset (2)] [update vector].
If page is non-NULL, the parsed update is also applied to the page. */
UNIV_INTERN
byte*
btr_cur_parse_update_in_place(
/*==========================*/
				/* out: end of log record or NULL */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr,/* in: buffer end */
	page_t*		page,	/* in/out: page or NULL */
	page_zip_des_t*	page_zip,/* in/out: compressed page, or NULL */
	dict_index_t*	index)	/* in: index corresponding to page */
{
	ulint	flags;
	rec_t*	rec;
	upd_t*	update;
	ulint	pos;		/* position of the trx id field in rec */
	dulint	trx_id;
	dulint	roll_ptr;
	ulint	rec_offset;	/* byte offset of rec within the page */
	mem_heap_t* heap;
	ulint*	offsets;

	/* Each fixed-size field is bounds-checked against end_ptr before
	reading: a NULL return means the log record is incomplete and the
	caller must wait for more log to be read in. */
	if (end_ptr < ptr + 1) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	rec_offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(rec_offset <= UNIV_PAGE_SIZE);

	heap = mem_heap_create(256);

	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	/* If the record was incomplete, or there is no page to apply the
	update to, just free the heap and return */
	if (!ptr || !page) {

		goto func_exit;
	}

	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
	rec = page + rec_offset;

	/* We do not need to reserve btr_search_latch, as the page is only
	being recovered, and there cannot be a hash index to it. */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		/* Restore the system columns (trx id, roll ptr) as well */
		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
						   pos, trx_id, roll_ptr);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

func_exit:
	mem_heap_free(heap);

	return(ptr);
}
 
1585
 
 
1586
/*****************************************************************
 
1587
See if there is enough place in the page modification log to log
 
1588
an update-in-place. */
 
1589
static
 
1590
ibool
 
1591
btr_cur_update_alloc_zip(
 
1592
/*=====================*/
 
1593
                                /* out: TRUE if enough place */
 
1594
        page_zip_des_t* page_zip,/* in/out: compressed page */
 
1595
        buf_block_t*    block,  /* in/out: buffer page */
 
1596
        dict_index_t*   index,  /* in: the index corresponding to the block */
 
1597
        ulint           length, /* in: size needed */
 
1598
        mtr_t*          mtr)    /* in: mini-transaction */
 
1599
{
 
1600
        ut_a(page_zip == buf_block_get_page_zip(block));
 
1601
        ut_ad(page_zip);
 
1602
 
 
1603
        if (page_zip_available(page_zip, dict_index_is_clust(index),
 
1604
                               length, 0)) {
 
1605
                return(TRUE);
 
1606
        }
 
1607
 
 
1608
        if (!page_zip->m_nonempty) {
 
1609
                /* The page has been freshly compressed, so
 
1610
                recompressing it will not help. */
 
1611
                return(FALSE);
 
1612
        }
 
1613
 
 
1614
        if (!page_zip_compress(page_zip, buf_block_get_frame(block),
 
1615
                               index, mtr)) {
 
1616
                /* Unable to compress the page */
 
1617
                return(FALSE);
 
1618
        }
 
1619
 
 
1620
        /* After recompressing a page, we must make sure that the free
 
1621
        bits in the insert buffer bitmap will not exceed the free
 
1622
        space on the page.  Because this function will not attempt
 
1623
        recompression unless page_zip_available() fails above, it is
 
1624
        safe to reset the free bits if page_zip_available() fails
 
1625
        again, below.  The free bits can safely be reset in a separate
 
1626
        mini-transaction.  If page_zip_available() succeeds below, we
 
1627
        can be sure that the page_zip_compress() above did not reduce
 
1628
        the free space available on the page. */
 
1629
 
 
1630
        if (!page_zip_available(page_zip, dict_index_is_clust(index),
 
1631
                                length, 0)) {
 
1632
                /* Out of space: reset the free bits. */
 
1633
                if (!dict_index_is_clust(index)
 
1634
                    && page_is_leaf(buf_block_get_frame(block))) {
 
1635
                        ibuf_reset_free_bits(block);
 
1636
                }
 
1637
                return(FALSE);
 
1638
        }
 
1639
 
 
1640
        return(TRUE);
 
1641
}
 
1642
 
 
1643
/*****************************************************************
 
1644
Updates a record when the update causes no size changes in its fields.
 
1645
We assume here that the ordering fields of the record do not change. */
 
1646
UNIV_INTERN
 
1647
ulint
 
1648
btr_cur_update_in_place(
 
1649
/*====================*/
 
1650
                                /* out: DB_SUCCESS or error number */
 
1651
        ulint           flags,  /* in: undo logging and locking flags */
 
1652
        btr_cur_t*      cursor, /* in: cursor on the record to update;
 
1653
                                cursor stays valid and positioned on the
 
1654
                                same record */
 
1655
        const upd_t*    update, /* in: update vector */
 
1656
        ulint           cmpl_info,/* in: compiler info on secondary index
 
1657
                                updates */
 
1658
        que_thr_t*      thr,    /* in: query thread */
 
1659
        mtr_t*          mtr)    /* in: mtr; must be committed before
 
1660
                                latching any further pages */
 
1661
{
 
1662
        dict_index_t*   index;
 
1663
        buf_block_t*    block;
 
1664
        page_zip_des_t* page_zip;
 
1665
        ulint           err;
 
1666
        rec_t*          rec;
 
1667
        dulint          roll_ptr        = ut_dulint_zero;
 
1668
        trx_t*          trx;
 
1669
        ulint           was_delete_marked;
 
1670
        mem_heap_t*     heap            = NULL;
 
1671
        ulint           offsets_[REC_OFFS_NORMAL_SIZE];
 
1672
        ulint*          offsets         = offsets_;
 
1673
        rec_offs_init(offsets_);
 
1674
 
 
1675
        rec = btr_cur_get_rec(cursor);
 
1676
        index = cursor->index;
 
1677
        ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
 
1678
        trx = thr_get_trx(thr);
 
1679
        offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
 
1680
#ifdef UNIV_DEBUG
 
1681
        if (btr_cur_print_record_ops && thr) {
 
1682
                btr_cur_trx_report(trx, index, "update ");
 
1683
                rec_print_new(stderr, rec, offsets);
 
1684
        }
 
1685
#endif /* UNIV_DEBUG */
 
1686
 
 
1687
        block = btr_cur_get_block(cursor);
 
1688
        page_zip = buf_block_get_page_zip(block);
 
1689
 
 
1690
        /* Check that enough space is available on the compressed page. */
 
1691
        if (UNIV_LIKELY_NULL(page_zip)
 
1692
            && !btr_cur_update_alloc_zip(page_zip, block, index,
 
1693
                                         rec_offs_size(offsets), mtr)) {
 
1694
                return(DB_ZIP_OVERFLOW);
 
1695
        }
 
1696
 
 
1697
        /* Do lock checking and undo logging */
 
1698
        err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
 
1699
                                        thr, &roll_ptr);
 
1700
        if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
 
1701
 
 
1702
                if (UNIV_LIKELY_NULL(heap)) {
 
1703
                        mem_heap_free(heap);
 
1704
                }
 
1705
                return(err);
 
1706
        }
 
1707
 
 
1708
        if (block->is_hashed) {
 
1709
                /* The function row_upd_changes_ord_field_binary works only
 
1710
                if the update vector was built for a clustered index, we must
 
1711
                NOT call it if index is secondary */
 
1712
 
 
1713
                if (!dict_index_is_clust(index)
 
1714
                    || row_upd_changes_ord_field_binary(NULL, index, update)) {
 
1715
 
 
1716
                        /* Remove possible hash index pointer to this record */
 
1717
                        btr_search_update_hash_on_delete(cursor);
 
1718
                }
 
1719
 
 
1720
                rw_lock_x_lock(&btr_search_latch);
 
1721
        }
 
1722
 
 
1723
        if (!(flags & BTR_KEEP_SYS_FLAG)) {
 
1724
                row_upd_rec_sys_fields(rec, NULL,
 
1725
                                       index, offsets, trx, roll_ptr);
 
1726
        }
 
1727
 
 
1728
        was_delete_marked = rec_get_deleted_flag(
 
1729
                rec, page_is_comp(buf_block_get_frame(block)));
 
1730
 
 
1731
        row_upd_rec_in_place(rec, index, offsets, update, page_zip);
 
1732
 
 
1733
        if (block->is_hashed) {
 
1734
                rw_lock_x_unlock(&btr_search_latch);
 
1735
        }
 
1736
 
 
1737
        if (page_zip && !dict_index_is_clust(index)
 
1738
            && page_is_leaf(buf_block_get_frame(block))) {
 
1739
                /* Update the free bits in the insert buffer. */
 
1740
                ibuf_update_free_bits_zip(block, mtr);
 
1741
        }
 
1742
 
 
1743
        btr_cur_update_in_place_log(flags, rec, index, update,
 
1744
                                    trx, roll_ptr, mtr);
 
1745
 
 
1746
        if (was_delete_marked
 
1747
            && !rec_get_deleted_flag(rec, page_is_comp(
 
1748
                                             buf_block_get_frame(block)))) {
 
1749
                /* The new updated record owns its possible externally
 
1750
                stored fields */
 
1751
 
 
1752
                btr_cur_unmark_extern_fields(page_zip,
 
1753
                                             rec, index, offsets, mtr);
 
1754
        }
 
1755
 
 
1756
        if (UNIV_LIKELY_NULL(heap)) {
 
1757
                mem_heap_free(heap);
 
1758
        }
 
1759
        return(DB_SUCCESS);
 
1760
}
 
1761
 
 
1762
/*****************************************************************
Tries to update a record on a page in an index tree. It is assumed that mtr
holds an x-latch on the page. The operation does not succeed if there is too
little space on the page or if the update would result in too empty a page,
so that tree compression is recommended. We assume here that the ordering
fields of the record do not change. */
UNIV_INTERN
ulint
btr_cur_optimistic_update(
/*======================*/
				/* out: DB_SUCCESS, or DB_OVERFLOW if the
				updated record does not fit, DB_UNDERFLOW
				if the page would become too empty, or
				DB_ZIP_OVERFLOW if there is not enough
				space left on the compressed page */
	ulint		flags,	/* in: undo logging and locking flags */
	btr_cur_t*	cursor,	/* in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	const upd_t*	update,	/* in: update vector; this must also
				contain trx id and roll ptr fields */
	ulint		cmpl_info,/* in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/* in: query thread */
	mtr_t*		mtr)	/* in: mtr; must be committed before
				latching any further pages */
{
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	ulint		err;
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	rec_t*		rec;
	rec_t*		orig_rec;	/* NOTE(review): assigned below but
					not read in this function's visible
					code; possibly kept for debugging */
	ulint		max_size;
	ulint		new_rec_size;
	ulint		old_rec_size;
	dtuple_t*	new_entry;
	dulint		roll_ptr;
	trx_t*		trx;
	mem_heap_t*	heap;
	ulint		i;
	ulint		n_ext;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	orig_rec = rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	heap = mem_heap_create(1024);
	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "update ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	if (!row_upd_changes_field_size_or_external(index, offsets, update)) {

		/* The simplest and the most common case: the update does not
		change the size of any field and none of the updated fields is
		externally stored in rec or update, and there is enough space
		on the compressed page to log the update. */

		mem_heap_free(heap);
		return(btr_cur_update_in_place(flags, cursor, update,
					       cmpl_info, thr, mtr));
	}

	if (rec_offs_any_extern(offsets)) {
any_extern:
		/* Externally stored fields are treated in pessimistic
		update */

		mem_heap_free(heap);
		return(DB_OVERFLOW);
	}

	/* Also bail out to pessimistic update if any *new* value in the
	update vector is stored externally */
	for (i = 0; i < upd_get_n_fields(update); i++) {
		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {

			goto any_extern;
		}
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Build a copy of the record with the new field values applied;
	the update will be performed as delete + re-insert */
	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
					   &n_ext, heap);
	/* We checked above that there are no externally stored fields. */
	ut_a(!n_ext);

	/* The page containing the clustered index record
	corresponding to new_entry is latched in mtr.
	Thus the following call is safe. */
	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
						     FALSE, heap);
	old_rec_size = rec_offs_size(offsets);
	new_rec_size = rec_get_converted_size(index, new_entry, 0);

	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	if (UNIV_LIKELY_NULL(page_zip)
	    && !btr_cur_update_alloc_zip(page_zip, block, index,
					 new_rec_size, mtr)) {
		err = DB_ZIP_OVERFLOW;
		goto err_exit;
	}

	/* A single record may not occupy more than half the free space
	of an empty page */
	if (UNIV_UNLIKELY(new_rec_size
			  >= (page_get_free_space_of_empty(page_is_comp(page))
			      / 2))) {

		err = DB_OVERFLOW;
		goto err_exit;
	}

	if (UNIV_UNLIKELY(page_get_data_size(page)
			  - old_rec_size + new_rec_size
			  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {

		/* The page would become too empty */

		err = DB_UNDERFLOW;
		goto err_exit;
	}

	max_size = old_rec_size
		+ page_get_max_insert_size_after_reorganize(page, 1);

	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
	       && (max_size >= new_rec_size))
	      || (page_get_n_recs(page) <= 1))) {

		/* There was not enough space, or it did not pay to
		reorganize: for simplicity, we decide what to do assuming a
		reorganization is needed, though it might not be necessary */

		err = DB_OVERFLOW;
		goto err_exit;
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info, thr,
					&roll_ptr);
	if (err != DB_SUCCESS) {
err_exit:
		mem_heap_free(heap);
		return(err);
	}

	/* Ok, we may do the replacement. Store on the page infimum the
	explicit locks on rec, before deleting rec (see the comment in
	btr_cur_pessimistic_update). */

	lock_rec_store_on_page_infimum(block, rec);

	btr_search_update_hash_on_delete(cursor);

	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
	invokes rec_offs_make_valid() to point to the copied record that
	the fields of new_entry point to.  We have to undo it here. */
	ut_ad(rec_offs_validate(NULL, index, offsets));
	rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);

	page_cur_delete_rec(page_cursor, index, offsets, mtr);

	page_cur_move_to_prev(page_cursor);

	trx = thr_get_trx(thr);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		/* Stamp the system columns of the new entry with this
		transaction's id and the undo roll pointer */
		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
					      roll_ptr);
		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
					      trx->id);
	}

	/* There are no externally stored columns in new_entry */
	rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
	ut_a(rec); /* <- We calculated above the insert would fit */

	if (page_zip && !dict_index_is_clust(index)
	    && page_is_leaf(page)) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	/* Restore the old explicit lock state on the record */

	lock_rec_restore_from_page_infimum(block, rec, block);

	page_cur_move_to_next(page_cursor);

	mem_heap_free(heap);

	return(DB_SUCCESS);
}
 
1969
 
 
1970
/*****************************************************************
 
1971
If, in a split, a new supremum record was created as the predecessor of the
 
1972
updated record, the supremum record must inherit exactly the locks on the
 
1973
updated record. In the split it may have inherited locks from the successor
 
1974
of the updated record, which is not correct. This function restores the
 
1975
right locks for the new supremum. */
 
1976
static
 
1977
void
 
1978
btr_cur_pess_upd_restore_supremum(
 
1979
/*==============================*/
 
1980
        buf_block_t*    block,  /* in: buffer block of rec */
 
1981
        const rec_t*    rec,    /* in: updated record */
 
1982
        mtr_t*          mtr)    /* in: mtr */
 
1983
{
 
1984
        page_t*         page;
 
1985
        buf_block_t*    prev_block;
 
1986
        ulint           space;
 
1987
        ulint           zip_size;
 
1988
        ulint           prev_page_no;
 
1989
 
 
1990
        page = buf_block_get_frame(block);
 
1991
 
 
1992
        if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
 
1993
                /* Updated record is not the first user record on its page */
 
1994
 
 
1995
                return;
 
1996
        }
 
1997
 
 
1998
        space = buf_block_get_space(block);
 
1999
        zip_size = buf_block_get_zip_size(block);
 
2000
        prev_page_no = btr_page_get_prev(page, mtr);
 
2001
 
 
2002
        ut_ad(prev_page_no != FIL_NULL);
 
2003
        prev_block = buf_page_get_with_no_latch(space, zip_size,
 
2004
                                                prev_page_no, mtr);
 
2005
#ifdef UNIV_BTR_DEBUG
 
2006
        ut_a(btr_page_get_next(prev_block->frame, mtr)
 
2007
             == page_get_page_no(page));
 
2008
#endif /* UNIV_BTR_DEBUG */
 
2009
 
 
2010
        /* We must already have an x-latch on prev_block! */
 
2011
        ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
 
2012
 
 
2013
        lock_rec_reset_and_inherit_gap_locks(prev_block, block,
 
2014
                                             PAGE_HEAP_NO_SUPREMUM,
 
2015
                                             page_rec_get_heap_no(rec));
 
2016
}
 
2017
 
 
2018
/*****************************************************************
 
2019
Performs an update of a record on a page of a tree. It is assumed
 
2020
that mtr holds an x-latch on the tree and on the cursor page. If the
 
2021
update is made on the leaf level, to avoid deadlocks, mtr must also
 
2022
own x-latches to brothers of page, if those brothers exist. We assume
 
2023
here that the ordering fields of the record do not change. */
 
2024
UNIV_INTERN
 
2025
ulint
 
2026
btr_cur_pessimistic_update(
 
2027
/*=======================*/
 
2028
                                /* out: DB_SUCCESS or error code */
 
2029
        ulint           flags,  /* in: undo logging, locking, and rollback
 
2030
                                flags */
 
2031
        btr_cur_t*      cursor, /* in: cursor on the record to update */
 
2032
        mem_heap_t**    heap,   /* in/out: pointer to memory heap, or NULL */
 
2033
        big_rec_t**     big_rec,/* out: big rec vector whose fields have to
 
2034
                                be stored externally by the caller, or NULL */
 
2035
        const upd_t*    update, /* in: update vector; this is allowed also
 
2036
                                contain trx id and roll ptr fields, but
 
2037
                                the values in update vector have no effect */
 
2038
        ulint           cmpl_info,/* in: compiler info on secondary index
 
2039
                                updates */
 
2040
        que_thr_t*      thr,    /* in: query thread */
 
2041
        mtr_t*          mtr)    /* in: mtr; must be committed before
 
2042
                                latching any further pages */
 
2043
{
 
2044
        big_rec_t*      big_rec_vec     = NULL;
 
2045
        big_rec_t*      dummy_big_rec;
 
2046
        dict_index_t*   index;
 
2047
        buf_block_t*    block;
 
2048
        page_t*         page;
 
2049
        page_zip_des_t* page_zip;
 
2050
        rec_t*          rec;
 
2051
        page_cur_t*     page_cursor;
 
2052
        dtuple_t*       new_entry;
 
2053
        ulint           err;
 
2054
        ulint           optim_err;
 
2055
        dulint          roll_ptr;
 
2056
        trx_t*          trx;
 
2057
        ibool           was_first;
 
2058
        ulint           n_extents       = 0;
 
2059
        ulint           n_reserved;
 
2060
        ulint           n_ext;
 
2061
        ulint*          offsets         = NULL;
 
2062
 
 
2063
        *big_rec = NULL;
 
2064
 
 
2065
        block = btr_cur_get_block(cursor);
 
2066
        page = buf_block_get_frame(block);
 
2067
        page_zip = buf_block_get_page_zip(block);
 
2068
        rec = btr_cur_get_rec(cursor);
 
2069
        index = cursor->index;
 
2070
 
 
2071
        ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
 
2072
                                MTR_MEMO_X_LOCK));
 
2073
        ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
 
2074
#ifdef UNIV_ZIP_DEBUG
 
2075
        ut_a(!page_zip || page_zip_validate(page_zip, page));
 
2076
#endif /* UNIV_ZIP_DEBUG */
 
2077
 
 
2078
        optim_err = btr_cur_optimistic_update(flags, cursor, update,
 
2079
                                              cmpl_info, thr, mtr);
 
2080
 
 
2081
        switch (optim_err) {
 
2082
        case DB_UNDERFLOW:
 
2083
        case DB_OVERFLOW:
 
2084
        case DB_ZIP_OVERFLOW:
 
2085
                break;
 
2086
        default:
 
2087
                return(optim_err);
 
2088
        }
 
2089
 
 
2090
        /* Do lock checking and undo logging */
 
2091
        err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
 
2092
                                        thr, &roll_ptr);
 
2093
        if (err != DB_SUCCESS) {
 
2094
 
 
2095
                return(err);
 
2096
        }
 
2097
 
 
2098
        if (optim_err == DB_OVERFLOW) {
 
2099
                ulint   reserve_flag;
 
2100
 
 
2101
                /* First reserve enough free space for the file segments
 
2102
                of the index tree, so that the update will not fail because
 
2103
                of lack of space */
 
2104
 
 
2105
                n_extents = cursor->tree_height / 16 + 3;
 
2106
 
 
2107
                if (flags & BTR_NO_UNDO_LOG_FLAG) {
 
2108
                        reserve_flag = FSP_CLEANING;
 
2109
                } else {
 
2110
                        reserve_flag = FSP_NORMAL;
 
2111
                }
 
2112
 
 
2113
                if (!fsp_reserve_free_extents(&n_reserved, index->space,
 
2114
                                              n_extents, reserve_flag, mtr)) {
 
2115
                        return(DB_OUT_OF_FILE_SPACE);
 
2116
                }
 
2117
        }
 
2118
 
 
2119
        if (!*heap) {
 
2120
                *heap = mem_heap_create(1024);
 
2121
        }
 
2122
        offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
 
2123
 
 
2124
        trx = thr_get_trx(thr);
 
2125
 
 
2126
        new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
 
2127
                                           &n_ext, *heap);
 
2128
        /* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
 
2129
        invokes rec_offs_make_valid() to point to the copied record that
 
2130
        the fields of new_entry point to.  We have to undo it here. */
 
2131
        ut_ad(rec_offs_validate(NULL, index, offsets));
 
2132
        rec_offs_make_valid(rec, index, offsets);
 
2133
 
 
2134
        /* The page containing the clustered index record
 
2135
        corresponding to new_entry is latched in mtr.  If the
 
2136
        clustered index record is delete-marked, then its externally
 
2137
        stored fields cannot have been purged yet, because then the
 
2138
        purge would also have removed the clustered index record
 
2139
        itself.  Thus the following call is safe. */
 
2140
        row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
 
2141
                                                     FALSE, *heap);
 
2142
        if (!(flags & BTR_KEEP_SYS_FLAG)) {
 
2143
                row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
 
2144
                                              roll_ptr);
 
2145
                row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
 
2146
                                              trx->id);
 
2147
        }
 
2148
 
 
2149
        if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
 
2150
                /* We are in a transaction rollback undoing a row
 
2151
                update: we must free possible externally stored fields
 
2152
                which got new values in the update, if they are not
 
2153
                inherited values. They can be inherited if we have
 
2154
                updated the primary key to another value, and then
 
2155
                update it back again. */
 
2156
 
 
2157
                ut_ad(big_rec_vec == NULL);
 
2158
 
 
2159
                btr_rec_free_updated_extern_fields(
 
2160
                        index, rec, page_zip, offsets, update,
 
2161
                        trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
 
2162
        }
 
2163
 
 
2164
        /* We have to set appropriate extern storage bits in the new
 
2165
        record to be inserted: we have to remember which fields were such */
 
2166
 
 
2167
        ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
 
2168
        offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
 
2169
        n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
 
2170
 
 
2171
        if (UNIV_LIKELY_NULL(page_zip)) {
 
2172
                ut_ad(page_is_comp(page));
 
2173
                if (page_zip_rec_needs_ext(
 
2174
                            rec_get_converted_size(index, new_entry, n_ext),
 
2175
                            TRUE,
 
2176
                            dict_index_get_n_fields(index),
 
2177
                            page_zip_get_size(page_zip))) {
 
2178
 
 
2179
                        goto make_external;
 
2180
                }
 
2181
        } else if (page_zip_rec_needs_ext(
 
2182
                           rec_get_converted_size(index, new_entry, n_ext),
 
2183
                           page_is_comp(page), 0, 0)) {
 
2184
make_external:
 
2185
                big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
 
2186
                if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
 
2187
 
 
2188
                        err = DB_TOO_BIG_RECORD;
 
2189
                        goto return_after_reservations;
 
2190
                }
 
2191
        }
 
2192
 
 
2193
        /* Store state of explicit locks on rec on the page infimum record,
 
2194
        before deleting rec. The page infimum acts as a dummy carrier of the
 
2195
        locks, taking care also of lock releases, before we can move the locks
 
2196
        back on the actual record. There is a special case: if we are
 
2197
        inserting on the root page and the insert causes a call of
 
2198
        btr_root_raise_and_insert. Therefore we cannot in the lock system
 
2199
        delete the lock structs set on the root page even if the root
 
2200
        page carries just node pointers. */
 
2201
 
 
2202
        lock_rec_store_on_page_infimum(block, rec);
 
2203
 
 
2204
        btr_search_update_hash_on_delete(cursor);
 
2205
 
 
2206
#ifdef UNIV_ZIP_DEBUG
 
2207
        ut_a(!page_zip || page_zip_validate(page_zip, page));
 
2208
#endif /* UNIV_ZIP_DEBUG */
 
2209
        page_cursor = btr_cur_get_page_cur(cursor);
 
2210
 
 
2211
        page_cur_delete_rec(page_cursor, index, offsets, mtr);
 
2212
 
 
2213
        page_cur_move_to_prev(page_cursor);
 
2214
 
 
2215
        rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
 
2216
 
 
2217
        if (rec) {
 
2218
                lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
 
2219
                                                   rec, block);
 
2220
 
 
2221
                offsets = rec_get_offsets(rec, index, offsets,
 
2222
                                          ULINT_UNDEFINED, heap);
 
2223
 
 
2224
                if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
 
2225
                        /* The new inserted record owns its possible externally
 
2226
                        stored fields */
 
2227
                        btr_cur_unmark_extern_fields(page_zip,
 
2228
                                                     rec, index, offsets, mtr);
 
2229
                }
 
2230
 
 
2231
                btr_cur_compress_if_useful(cursor, mtr);
 
2232
 
 
2233
                if (page_zip && !dict_index_is_clust(index)
 
2234
                    && page_is_leaf(page)) {
 
2235
                        /* Update the free bits in the insert buffer. */
 
2236
                        ibuf_update_free_bits_zip(block, mtr);
 
2237
                }
 
2238
 
 
2239
                err = DB_SUCCESS;
 
2240
                goto return_after_reservations;
 
2241
        } else {
 
2242
                ut_a(optim_err != DB_UNDERFLOW);
 
2243
 
 
2244
                /* Out of space: reset the free bits. */
 
2245
                if (!dict_index_is_clust(index)
 
2246
                    && page_is_leaf(page)) {
 
2247
                        ibuf_reset_free_bits(block);
 
2248
                }
 
2249
        }
 
2250
 
 
2251
        /* Was the record to be updated positioned as the first user
 
2252
        record on its page? */
 
2253
        was_first = page_cur_is_before_first(page_cursor);
 
2254
 
 
2255
        /* The first parameter means that no lock checking and undo logging
 
2256
        is made in the insert */
 
2257
 
 
2258
        err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
 
2259
                                         | BTR_NO_LOCKING_FLAG
 
2260
                                         | BTR_KEEP_SYS_FLAG,
 
2261
                                         cursor, new_entry, &rec,
 
2262
                                         &dummy_big_rec, n_ext, NULL, mtr);
 
2263
        ut_a(rec);
 
2264
        ut_a(err == DB_SUCCESS);
 
2265
        ut_a(dummy_big_rec == NULL);
 
2266
 
 
2267
        if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
 
2268
                /* The new inserted record owns its possible externally
 
2269
                stored fields */
 
2270
                buf_block_t*    rec_block = btr_cur_get_block(cursor);
 
2271
 
 
2272
#ifdef UNIV_ZIP_DEBUG
 
2273
                ut_a(!page_zip || page_zip_validate(page_zip, page));
 
2274
                page = buf_block_get_frame(rec_block);
 
2275
#endif /* UNIV_ZIP_DEBUG */
 
2276
                page_zip = buf_block_get_page_zip(rec_block);
 
2277
 
 
2278
                offsets = rec_get_offsets(rec, index, offsets,
 
2279
                                          ULINT_UNDEFINED, heap);
 
2280
                btr_cur_unmark_extern_fields(page_zip,
 
2281
                                             rec, index, offsets, mtr);
 
2282
        }
 
2283
 
 
2284
        lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
 
2285
                                           rec, block);
 
2286
 
 
2287
        /* If necessary, restore also the correct lock state for a new,
 
2288
        preceding supremum record created in a page split. While the old
 
2289
        record was nonexistent, the supremum might have inherited its locks
 
2290
        from a wrong record. */
 
2291
 
 
2292
        if (!was_first) {
 
2293
                btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
 
2294
                                                  rec, mtr);
 
2295
        }
 
2296
 
 
2297
return_after_reservations:
 
2298
#ifdef UNIV_ZIP_DEBUG
 
2299
        ut_a(!page_zip || page_zip_validate(page_zip, page));
 
2300
#endif /* UNIV_ZIP_DEBUG */
 
2301
 
 
2302
        if (n_extents > 0) {
 
2303
                fil_space_release_free_extents(index->space, n_reserved);
 
2304
        }
 
2305
 
 
2306
        *big_rec = big_rec_vec;
 
2307
 
 
2308
        return(err);
 
2309
}
 
2310
 
 
2311
/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
 
2312
 
 
2313
/********************************************************************
Writes the redo log record for delete marking or unmarking of an index
record.  The record is parsed back at recovery time by
btr_cur_parse_del_mark_set_clust_rec(); the byte layout here and the
parser must stay in sync. */
UNIV_INLINE
void
btr_cur_del_mark_set_clust_rec_log(
/*===============================*/
	ulint		flags,	/* in: undo logging and locking flags;
				must fit in one byte */
	rec_t*		rec,	/* in: record */
	dict_index_t*	index,	/* in: index of the record */
	ibool		val,	/* in: value to set (TRUE = delete mark) */
	trx_t*		trx,	/* in: deleting transaction */
	dulint		roll_ptr,/* in: roll ptr to the undo log record */
	mtr_t*		mtr)	/* in: mtr */
{
	byte*	log_ptr;
	ut_ad(flags < 256);
	ut_ad(val <= 1);

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

	/* Reserve log space for: 1 byte of flags, 1 byte of val, the
	system fields written by row_upd_write_sys_vals_to_log (at most
	DATA_ROLL_PTR_LEN + 14 bytes, per the expression below), and
	the 2-byte page offset of rec.  The record type depends on the
	page format (compact vs. redundant). */
	log_ptr = mlog_open_and_write_index(mtr, rec, index,
					    page_rec_is_comp(rec)
					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
					    : MLOG_REC_CLUST_DELETE_MARK,
					    1 + 1 + DATA_ROLL_PTR_LEN
					    + 14 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	mach_write_to_1(log_ptr, flags);
	log_ptr++;
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	/* Write trx id and roll ptr; the helper advances log_ptr past
	what it wrote. */
	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
						mtr);
	/* Finally the offset of the record within its page. */
	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
 
2358
 
 
2359
/********************************************************************
Parses the redo log record for delete marking or unmarking of a clustered
index record, as written by btr_cur_del_mark_set_clust_rec_log().
Returns NULL when the log record is truncated (not fully contained in
[ptr, end_ptr)); in that case the caller must retry with more data. */
UNIV_INTERN
byte*
btr_cur_parse_del_mark_set_clust_rec(
/*=================================*/
				/* out: end of log record or NULL */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr,/* in: buffer end */
	page_t*		page,	/* in/out: page or NULL; when NULL the
				record is only parsed, not applied */
	page_zip_des_t*	page_zip,/* in/out: compressed page, or NULL */
	dict_index_t*	index)	/* in: index corresponding to page */
{
	ulint	flags;
	ulint	val;
	ulint	pos;
	dulint	trx_id;
	dulint	roll_ptr;
	ulint	offset;
	rec_t*	rec;

	ut_ad(!page
	      || !!page_is_comp(page) == dict_table_is_comp(index->table));

	/* Need at least the flags byte and the val byte. */
	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;
	val = mach_read_from_1(ptr);
	ptr++;

	/* Parse the system fields (position, trx id, roll ptr). */
	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	/* Need the 2-byte record offset. */
	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	if (page) {
		rec = page + offset;

		/* We do not need to reserve btr_search_latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. */

		btr_rec_set_deleted_flag(rec, page_zip, val);

		if (!(flags & BTR_KEEP_SYS_FLAG)) {
			/* Restore the trx id and roll ptr fields of the
			record to the logged values. */
			mem_heap_t*	heap		= NULL;
			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
			rec_offs_init(offsets_);

			row_upd_rec_sys_fields_in_recovery(
				rec, page_zip,
				rec_get_offsets(rec, index, offsets_,
						ULINT_UNDEFINED, &heap),
				pos, trx_id, roll_ptr);
			if (UNIV_LIKELY_NULL(heap)) {
				/* rec_get_offsets() allocates a heap only
				when offsets_ was too small. */
				mem_heap_free(heap);
			}
		}
	}

	return(ptr);
}
 
2438
 
 
2439
/***************************************************************
Marks a clustered index record deleted. Writes an undo log record to
undo log on this delete marking. Writes in the trx id field the id
of the deleting transaction, and in the roll ptr field pointer to the
undo log record created. */
UNIV_INTERN
ulint
btr_cur_del_mark_set_clust_rec(
/*===========================*/
				/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
				number */
	ulint		flags,	/* in: undo logging and locking flags */
	btr_cur_t*	cursor,	/* in: cursor */
	ibool		val,	/* in: value to set */
	que_thr_t*	thr,	/* in: query thread */
	mtr_t*		mtr)	/* in: mtr */
{
	dict_index_t*	index;
	buf_block_t*	block;
	dulint		roll_ptr;
	ulint		err;
	rec_t*		rec;
	page_zip_des_t*	page_zip;
	trx_t*		trx;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
		rec_print_new(stderr, rec, offsets);
	}
#endif /* UNIV_DEBUG */

	/* Only clustered index records that are not yet delete-marked
	may be passed here. */
	ut_ad(dict_index_is_clust(index));
	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));

	/* First do the lock check: may return DB_LOCK_WAIT. */
	err = lock_clust_rec_modify_check_and_lock(flags,
						   btr_cur_get_block(cursor),
						   rec, index, offsets, thr);

	if (err != DB_SUCCESS) {

		goto func_exit;
	}

	/* Write the undo log record for this modify operation; the
	roll ptr of the created undo record is returned in roll_ptr. */
	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
					    index, NULL, NULL, 0, rec,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		goto func_exit;
	}

	block = btr_cur_get_block(cursor);

	/* If an adaptive hash index may point into this page, take the
	search latch in exclusive mode while mutating the record. */
	if (block->is_hashed) {
		rw_lock_x_lock(&btr_search_latch);
	}

	page_zip = buf_block_get_page_zip(block);

	btr_rec_set_deleted_flag(rec, page_zip, val);

	trx = thr_get_trx(thr);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		/* Stamp the record with the deleting trx id and the
		roll ptr of the undo record written above. */
		row_upd_rec_sys_fields(rec, page_zip,
				       index, offsets, trx, roll_ptr);
	}

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	/* Write the redo log record for this delete mark change. */
	btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val, trx,
					   roll_ptr, mtr);

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(err);
}
 
2530
 
 
2531
/********************************************************************
Writes the redo log record for a delete mark setting of a secondary
index record.  Parsed back at recovery by
btr_cur_parse_del_mark_set_sec_rec(); the byte layout must match. */
UNIV_INLINE
void
btr_cur_del_mark_set_sec_rec_log(
/*=============================*/
	rec_t*		rec,	/* in: record */
	ibool		val,	/* in: value to set (TRUE = delete mark) */
	mtr_t*		mtr)	/* in: mtr */
{
	byte*	log_ptr;
	ut_ad(val <= 1);

	/* Reserve space for: initial log record header (up to 11
	bytes), 1 byte of val and the 2-byte page offset of rec. */
	log_ptr = mlog_open(mtr, 11 + 1 + 2);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery:
		in that case mlog_open returns NULL */
		return;
	}

	log_ptr = mlog_write_initial_log_record_fast(
		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
	mach_write_to_1(log_ptr, val);
	log_ptr++;

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	mlog_close(mtr, log_ptr);
}
 
2563
 
 
2564
/********************************************************************
Parses the redo log record for delete marking or unmarking of a secondary
index record, as written by btr_cur_del_mark_set_sec_rec_log().
Returns NULL when the record is truncated in [ptr, end_ptr). */
UNIV_INTERN
byte*
btr_cur_parse_del_mark_set_sec_rec(
/*===============================*/
				/* out: end of log record or NULL */
	byte*		ptr,	/* in: buffer */
	byte*		end_ptr,/* in: buffer end */
	page_t*		page,	/* in/out: page or NULL; when NULL the
				record is only parsed, not applied */
	page_zip_des_t*	page_zip)/* in/out: compressed page, or NULL */
{
	ulint	val;
	ulint	offset;
	rec_t*	rec;

	/* Need the 1-byte val and the 2-byte record offset. */
	if (end_ptr < ptr + 3) {

		return(NULL);
	}

	val = mach_read_from_1(ptr);
	ptr++;

	offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(offset <= UNIV_PAGE_SIZE);

	if (page) {
		rec = page + offset;

		/* We do not need to reserve btr_search_latch, as the page
		is only being recovered, and there cannot be a hash index to
		it. */

		btr_rec_set_deleted_flag(rec, page_zip, val);
	}

	return(ptr);
}
 
2606
 
 
2607
/***************************************************************
Sets a secondary index record delete mark to TRUE or FALSE. */
UNIV_INTERN
ulint
btr_cur_del_mark_set_sec_rec(
/*=========================*/
				/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
				number */
	ulint		flags,	/* in: locking flag */
	btr_cur_t*	cursor,	/* in: cursor */
	ibool		val,	/* in: value to set */
	que_thr_t*	thr,	/* in: query thread */
	mtr_t*		mtr)	/* in: mtr */
{
	buf_block_t*	block;
	rec_t*		rec;
	ulint		err;

	block = btr_cur_get_block(cursor);
	rec = btr_cur_get_rec(cursor);

#ifdef UNIV_DEBUG
	if (btr_cur_print_record_ops && thr) {
		btr_cur_trx_report(thr_get_trx(thr), cursor->index,
				   "del mark ");
		rec_print(stderr, rec, cursor->index);
	}
#endif /* UNIV_DEBUG */

	/* Lock check first: may return DB_LOCK_WAIT.  Note that unlike
	the clustered index variant, no undo log record is written for a
	secondary index delete mark. */
	err = lock_sec_rec_modify_check_and_lock(flags,
						 btr_cur_get_block(cursor),
						 rec, cursor->index, thr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	ut_ad(!!page_rec_is_comp(rec)
	      == dict_table_is_comp(cursor->index->table));

	/* Protect a possible adaptive hash index on this page while
	the record is mutated. */
	if (block->is_hashed) {
		rw_lock_x_lock(&btr_search_latch);
	}

	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);

	if (block->is_hashed) {
		rw_lock_x_unlock(&btr_search_latch);
	}

	/* Write the redo log record for this change. */
	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);

	return(DB_SUCCESS);
}
 
2661
 
 
2662
/***************************************************************
Clear a secondary index record's delete mark.  This function is only
used by the insert buffer insert merge mechanism. */
UNIV_INTERN
void
btr_cur_del_unmark_for_ibuf(
/*========================*/
	rec_t*		rec,		/* in/out: record to delete unmark */
	page_zip_des_t*	page_zip,	/* in/out: compressed page
					corresponding to rec, or NULL
					when the tablespace is
					uncompressed */
	mtr_t*		mtr)		/* in: mtr */
{
	/* We do not need to reserve btr_search_latch, as the page has just
	been read to the buffer pool and there cannot be a hash index to it. */

	/* Clear the flag on the page, then write the corresponding redo
	log record so the change is recoverable. */
	btr_rec_set_deleted_flag(rec, page_zip, FALSE);

	btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
}
 
2683
 
 
2684
/*==================== B-TREE RECORD REMOVE =========================*/
 
2685
 
 
2686
/*****************************************************************
 
2687
Tries to compress a page of the tree if it seems useful. It is assumed
 
2688
that mtr holds an x-latch on the tree and on the cursor page. To avoid
 
2689
deadlocks, mtr must also own x-latches to brothers of page, if those
 
2690
brothers exist. NOTE: it is assumed that the caller has reserved enough
 
2691
free extents so that the compression will always succeed if done! */
 
2692
UNIV_INTERN
 
2693
ibool
 
2694
btr_cur_compress_if_useful(
 
2695
/*=======================*/
 
2696
                                /* out: TRUE if compression occurred */
 
2697
        btr_cur_t*      cursor, /* in: cursor on the page to compress;
 
2698
                                cursor does not stay valid if compression
 
2699
                                occurs */
 
2700
        mtr_t*          mtr)    /* in: mtr */
 
2701
{
 
2702
        ut_ad(mtr_memo_contains(mtr,
 
2703
                                dict_index_get_lock(btr_cur_get_index(cursor)),
 
2704
                                MTR_MEMO_X_LOCK));
 
2705
        ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
 
2706
                                MTR_MEMO_PAGE_X_FIX));
 
2707
 
 
2708
        return(btr_cur_compress_recommendation(cursor, mtr)
 
2709
               && btr_compress(cursor, mtr));
 
2710
}
 
2711
 
 
2712
/***********************************************************
Removes the record on which the tree cursor is positioned on a leaf page.
It is assumed that the mtr has an x-latch on the page where the cursor is
positioned, but no latch on the whole tree. */
UNIV_INTERN
ibool
btr_cur_optimistic_delete(
/*======================*/
				/* out: TRUE if success, i.e., the page
				did not become too empty */
	btr_cur_t*	cursor,	/* in: cursor on leaf page, on the record to
				delete; cursor stays valid: if deletion
				succeeds, on function exit it points to the
				successor of the deleted record */
	mtr_t*		mtr)	/* in: mtr */
{
	buf_block_t*	block;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ibool		no_compress_needed;
	rec_offs_init(offsets_);

	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
				MTR_MEMO_PAGE_X_FIX));
	/* This is intended only for leaf page deletions */

	block = btr_cur_get_block(cursor);

	ut_ad(page_is_leaf(buf_block_get_frame(block)));

	rec = btr_cur_get_rec(cursor);
	offsets = rec_get_offsets(rec, cursor->index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* The optimistic path applies only when the record has no
	externally stored fields and removing it will not make the page
	so empty that a tree compression would be needed. */
	no_compress_needed = !rec_offs_any_extern(offsets)
		&& btr_cur_can_delete_without_compress(
			cursor, rec_offs_size(offsets), mtr);

	if (no_compress_needed) {

		page_t*		page	= buf_block_get_frame(block);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
		ulint		max_ins	= 0;

		lock_update_delete(block, rec);

		btr_search_update_hash_on_delete(cursor);

		if (!page_zip) {
			/* For uncompressed pages, compute the max insert
			size after reorganization before the delete, for
			the insert buffer free-bit update below. */
			max_ins = page_get_max_insert_size_after_reorganize(
				page, 1);
		}
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
				    cursor->index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

		if (dict_index_is_clust(cursor->index)
		    || !page_is_leaf(page)) {
			/* The insert buffer does not handle
			inserts to clustered indexes or to non-leaf
			pages of secondary index B-trees. */
		} else if (page_zip) {
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			ibuf_update_free_bits_low(block, max_ins, mtr);
		}
	}

	if (UNIV_LIKELY_NULL(heap)) {
		/* rec_get_offsets() allocates a heap only when the
		stack-based offsets_ array was too small. */
		mem_heap_free(heap);
	}

	return(no_compress_needed);
}
 
2793
 
 
2794
/*****************************************************************
Removes the record on which the tree cursor is positioned. Tries
to compress the page if its fillfactor drops below a threshold
or if it is the only page on the level. It is assumed that mtr holds
an x-latch on the tree and on the cursor page. To avoid deadlocks,
mtr must also own x-latches to brothers of page, if those brothers
exist. */
UNIV_INTERN
ibool
btr_cur_pessimistic_delete(
/*=======================*/
				/* out: TRUE if compression occurred */
	ulint*		err,	/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
				the latter may occur because we may have
				to update node pointers on upper levels,
				and in the case of variable length keys
				these may actually grow in size */
	ibool		has_reserved_extents, /* in: TRUE if the
				caller has already reserved enough free
				extents so that he knows that the operation
				will succeed */
	btr_cur_t*	cursor,	/* in: cursor on the record to delete;
				if compression does not occur, the cursor
				stays valid: it points to successor of
				deleted record on function exit */
	enum trx_rb_ctx	rb_ctx,	/* in: rollback context */
	mtr_t*		mtr)	/* in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	dict_index_t*	index;
	rec_t*		rec;
	dtuple_t*	node_ptr;
	ulint		n_extents	= 0;
	ulint		n_reserved;
	ibool		success;
	ibool		ret		= FALSE;
	ulint		level;
	mem_heap_t*	heap;
	ulint*		offsets;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = btr_cur_get_index(cursor);

	/* The caller must hold an x-latch on the whole index tree and
	an x-fix on the cursor page (see the function comment above). */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	if (!has_reserved_extents) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the node pointer updates will
		not fail because of lack of space */

		n_extents = cursor->tree_height / 32 + 1;

		success = fsp_reserve_free_extents(&n_reserved,
						   index->space,
						   n_extents,
						   FSP_CLEANING, mtr);
		if (!success) {
			*err = DB_OUT_OF_FILE_SPACE;

			return(FALSE);
		}
	}

	/* Scratch heap for the record offsets and a possible node
	pointer tuple; freed at return_after_reservations. */
	heap = mem_heap_create(1024);
	rec = btr_cur_get_rec(cursor);
	page_zip = buf_block_get_page_zip(block);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	/* Free any externally stored (BLOB) parts of the record before
	the record itself disappears from the page. */
	if (rec_offs_any_extern(offsets)) {
		btr_rec_free_externally_stored_fields(index,
						      rec, offsets, page_zip,
						      rb_ctx, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
	}

	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
	    && UNIV_UNLIKELY(dict_index_get_page(index)
			     != buf_block_get_page_no(block))) {

		/* If there is only one record, drop the whole page in
		btr_discard_page, if this is not the root page */

		btr_discard_page(cursor, mtr);

		*err = DB_SUCCESS;
		ret = TRUE;

		goto return_after_reservations;
	}

	lock_update_delete(block, rec);
	level = btr_page_get_level(page, mtr);

	/* Special handling when deleting the first user record of a
	non-leaf page: the node pointer in the father must stay in sync. */
	if (level > 0
	    && UNIV_UNLIKELY(rec == page_rec_get_next(
				     page_get_infimum_rec(page)))) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (btr_page_get_prev(page, mtr) == FIL_NULL) {

			/* If we delete the leftmost node pointer on a
			non-leaf level, we must mark the new leftmost node
			pointer as the predefined minimum record */

			/* This will make page_zip_validate() fail until
			page_cur_delete_rec() completes.  This is harmless,
			because everything will take place within a single
			mini-transaction and because writing to the redo log
			is an atomic operation (performed by mtr_commit()). */
			btr_set_min_rec_mark(next_rec, mtr);
		} else {
			/* Otherwise, if we delete the leftmost node pointer
			on a page, we have to change the father node pointer
			so that it is equal to the new leftmost node pointer
			on the page */

			btr_node_ptr_delete(index, block, mtr);

			node_ptr = dict_index_build_node_ptr(
				index, next_rec, buf_block_get_page_no(block),
				heap, level);

			btr_insert_on_non_leaf_level(index,
						     level + 1, node_ptr, mtr);
		}
	}

	/* Drop the adaptive hash index entry for the record, then
	physically remove it from the page. */
	btr_search_update_hash_on_delete(cursor);

	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */

	ut_ad(btr_check_node_ptr(index, block, mtr));

	*err = DB_SUCCESS;

return_after_reservations:
	mem_heap_free(heap);

	/* If the whole page was not discarded above, try to merge or
	compress the page now that it has one record less. */
	if (ret == FALSE) {
		ret = btr_cur_compress_if_useful(cursor, mtr);
	}

	/* Release the extents reserved at the top of this function;
	n_extents is 0 when the caller had already reserved them. */
	if (n_extents > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	return(ret);
}
 
2956
 
 
2957
/***********************************************************************
 
2958
Adds path information to the cursor for the current page, for which
 
2959
the binary search has been performed. */
 
2960
static
 
2961
void
 
2962
btr_cur_add_path_info(
 
2963
/*==================*/
 
2964
        btr_cur_t*      cursor,         /* in: cursor positioned on a page */
 
2965
        ulint           height,         /* in: height of the page in tree;
 
2966
                                        0 means leaf node */
 
2967
        ulint           root_height)    /* in: root node height in tree */
 
2968
{
 
2969
        btr_path_t*     slot;
 
2970
        rec_t*          rec;
 
2971
 
 
2972
        ut_a(cursor->path_arr);
 
2973
 
 
2974
        if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
 
2975
                /* Do nothing; return empty path */
 
2976
 
 
2977
                slot = cursor->path_arr;
 
2978
                slot->nth_rec = ULINT_UNDEFINED;
 
2979
 
 
2980
                return;
 
2981
        }
 
2982
 
 
2983
        if (height == 0) {
 
2984
                /* Mark end of slots for path */
 
2985
                slot = cursor->path_arr + root_height + 1;
 
2986
                slot->nth_rec = ULINT_UNDEFINED;
 
2987
        }
 
2988
 
 
2989
        rec = btr_cur_get_rec(cursor);
 
2990
 
 
2991
        slot = cursor->path_arr + (root_height - height);
 
2992
 
 
2993
        slot->nth_rec = page_rec_get_n_recs_before(rec);
 
2994
        slot->n_recs = page_get_n_recs(page_align(rec));
 
2995
}
 
2996
 
 
2997
/***********************************************************************
Estimates the number of rows in a given index range. */
UNIV_INTERN
ib_int64_t
btr_estimate_n_rows_in_range(
/*=========================*/
				/* out: estimated number of rows */
	dict_index_t*	index,	/* in: index */
	const dtuple_t*	tuple1,	/* in: range start, may also be empty tuple */
	ulint		mode1,	/* in: search mode for range start */
	const dtuple_t*	tuple2,	/* in: range end, may also be empty tuple */
	ulint		mode2)	/* in: search mode for range end */
{
	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
	btr_cur_t	cursor;
	btr_path_t*	slot1;
	btr_path_t*	slot2;
	ibool		diverged;
	ibool		diverged_lot;
	ulint		divergence_level;
	ib_int64_t	n_rows;
	ulint		i;
	mtr_t		mtr;

	/* Descend to the start of the range, recording the search path
	into path1 (BTR_ESTIMATE makes the search fill in the path).
	An empty tuple1 means the range is open at the left end. */
	mtr_start(&mtr);

	cursor.path_arr = path1;

	if (dtuple_get_n_fields(tuple1) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0, &mtr);
	} else {
		btr_cur_open_at_index_side(TRUE, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, &mtr);
	}

	mtr_commit(&mtr);

	/* Likewise descend to the end of the range, recording path2. */
	mtr_start(&mtr);

	cursor.path_arr = path2;

	if (dtuple_get_n_fields(tuple2) > 0) {

		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
					    &cursor, 0, &mtr);
	} else {
		btr_cur_open_at_index_side(FALSE, index,
					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
					   &cursor, &mtr);
	}

	mtr_commit(&mtr);

	/* We have the path information for the range in path1 and path2 */

	/* Walk the two paths level by level from the root downwards,
	widening the estimate as the paths diverge.  NOTE: the two
	searches above ran in separate mini-transactions, so the tree may
	have changed in between; the algorithm defends against the
	resulting inconsistencies below. */

	n_rows = 1;
	diverged = FALSE;	    /* This becomes true when the path is not
				    the same any more */
	diverged_lot = FALSE;	    /* This becomes true when the paths are
				    not the same or adjacent any more */
	divergence_level = 1000000; /* This is the level where paths diverged
				    a lot */
	for (i = 0; ; i++) {
		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);

		slot1 = path1 + i;
		slot2 = path2 + i;

		/* ULINT_UNDEFINED is the sentinel written by
		btr_cur_add_path_info() one past the leaf slot: we have
		consumed both paths and can finalize the estimate. */
		if (slot1->nth_rec == ULINT_UNDEFINED
		    || slot2->nth_rec == ULINT_UNDEFINED) {

			if (i > divergence_level + 1) {
				/* In trees whose height is > 1 our algorithm
				tends to underestimate: multiply the estimate
				by 2: */

				n_rows = n_rows * 2;
			}

			/* Do not estimate the number of rows in the range
			to over 1 / 2 of the estimated rows in the whole
			table */

			if (n_rows > index->table->stat_n_rows / 2) {
				n_rows = index->table->stat_n_rows / 2;

				/* If there are just 0 or 1 rows in the table,
				then we estimate all rows are in the range */

				if (n_rows == 0) {
					n_rows = index->table->stat_n_rows;
				}
			}

			return(n_rows);
		}

		if (!diverged && slot1->nth_rec != slot2->nth_rec) {

			/* The paths part ways on this level for the
			first time. */
			diverged = TRUE;

			if (slot1->nth_rec < slot2->nth_rec) {
				n_rows = slot2->nth_rec - slot1->nth_rec;

				if (n_rows > 1) {
					diverged_lot = TRUE;
					divergence_level = i;
				}
			} else {
				/* Maybe the tree has changed between
				searches */

				return(10);
			}

		} else if (diverged && !diverged_lot) {

			/* The paths diverged one level up but were on
			adjacent positions; check whether they stay
			adjacent here too. */
			if (slot1->nth_rec < slot1->n_recs
			    || slot2->nth_rec > 1) {

				diverged_lot = TRUE;
				divergence_level = i;

				n_rows = 0;

				if (slot1->nth_rec < slot1->n_recs) {
					n_rows += slot1->n_recs
						- slot1->nth_rec;
				}

				if (slot2->nth_rec > 1) {
					n_rows += slot2->nth_rec - 1;
				}
			}
		} else if (diverged_lot) {

			/* Once the paths are far apart, each further level
			multiplies the estimate by the average fanout of
			the two pages on this level. */
			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
				/ 2;
		}
	}
}
 
3144
 
 
3145
/***********************************************************************
Estimates the number of different key values in a given index, for
each n-column prefix of the index where n <= dict_index_get_n_unique(index).
The estimates are stored in the array index->stat_n_diff_key_vals. */
UNIV_INTERN
void
btr_estimate_number_of_different_key_vals(
/*======================================*/
	dict_index_t*	index)	/* in: index */
{
	btr_cur_t	cursor;
	page_t*		page;
	rec_t*		rec;
	ulint		n_cols;
	ulint		matched_fields;
	ulint		matched_bytes;
	ib_int64_t*	n_diff;		/* n_diff[j]: borders seen between
					distinct j-column prefixes */
	ullint		n_sample_pages; /* number of pages to sample */
	ulint		not_empty_flag	= 0;
	ulint		total_external_size = 0;
	ulint		i;
	ulint		j;
	ullint		add_on;
	mtr_t		mtr;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_rec_[REC_OFFS_NORMAL_SIZE];
	ulint		offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets_rec	= offsets_rec_;
	ulint*		offsets_next_rec= offsets_next_rec_;
	rec_offs_init(offsets_rec_);
	rec_offs_init(offsets_next_rec_);

	n_cols = dict_index_get_n_unique(index);

	/* One slot per prefix length 1..n_cols; slot 0 is unused but
	kept so that n_diff[j] indexes by prefix length directly. */
	n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));

	/* It makes no sense to test more pages than are contained
	in the index, thus we lower the number if it is too high */
	if (srv_stats_sample_pages > index->stat_index_size) {
		if (index->stat_index_size > 0) {
			n_sample_pages = index->stat_index_size;
		} else {
			n_sample_pages = 1;
		}
	} else {
		n_sample_pages = srv_stats_sample_pages;
	}

	/* We sample some pages in the index to get an estimate */

	for (i = 0; i < n_sample_pages; i++) {
		rec_t*	supremum;
		mtr_start(&mtr);

		btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF, &cursor, &mtr);

		/* Count the number of different key values for each prefix of
		the key on this index page. If the prefix does not determine
		the index record uniquely in the B-tree, then we subtract one
		because otherwise our algorithm would give a wrong estimate
		for an index where there is just one key value. */

		page = btr_cur_get_page(&cursor);

		supremum = page_get_supremum_rec(page);
		rec = page_rec_get_next(page_get_infimum_rec(page));

		if (rec != supremum) {
			not_empty_flag = 1;
			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
						      ULINT_UNDEFINED, &heap);
		}

		/* Compare each record to its successor on the page; a
		mismatch at field k means a new distinct value for every
		prefix longer than k. */
		while (rec != supremum) {
			rec_t*	next_rec = page_rec_get_next(rec);
			if (next_rec == supremum) {
				break;
			}

			matched_fields = 0;
			matched_bytes = 0;
			/* Only the first n_cols fields matter for the
			comparison, hence the n_cols limit here. */
			offsets_next_rec = rec_get_offsets(next_rec, index,
							   offsets_next_rec,
							   n_cols, &heap);

			cmp_rec_rec_with_match(rec, next_rec,
					       offsets_rec, offsets_next_rec,
					       index, &matched_fields,
					       &matched_bytes);

			for (j = matched_fields + 1; j <= n_cols; j++) {
				/* We add one if this index record has
				a different prefix from the previous */

				n_diff[j]++;
			}

			total_external_size
				+= btr_rec_get_externally_stored_len(
					rec, offsets_rec);

			rec = next_rec;
			/* Initialize offsets_rec for the next round
			and assign the old offsets_rec buffer to
			offsets_next_rec. */
			{
				ulint*	offsets_tmp = offsets_rec;
				offsets_rec = offsets_next_rec;
				offsets_next_rec = offsets_tmp;
			}
		}


		if (n_cols == dict_index_get_n_unique_in_tree(index)) {

			/* If there is more than one leaf page in the tree,
			we add one because we know that the first record
			on the page certainly had a different prefix than the
			last record on the previous index page in the
			alphabetical order. Before this fix, if there was
			just one big record on each clustered index page, the
			algorithm grossly underestimated the number of rows
			in the table. */

			if (btr_page_get_prev(page, &mtr) != FIL_NULL
			    || btr_page_get_next(page, &mtr) != FIL_NULL) {

				n_diff[n_cols]++;
			}
		}

		/* Account for the external storage of the last record on
		the page as well (the loop above stopped before it). */
		offsets_rec = rec_get_offsets(rec, index, offsets_rec,
					      ULINT_UNDEFINED, &heap);
		total_external_size += btr_rec_get_externally_stored_len(
			rec, offsets_rec);
		mtr_commit(&mtr);
	}

	/* If we saw k borders between different key values on
	n_sample_pages leaf pages, we can estimate how many
	there will be in index->stat_n_leaf_pages */

	/* We must take into account that our sample actually represents
	also the pages used for external storage of fields (those pages are
	included in index->stat_n_leaf_pages) */

	for (j = 0; j <= n_cols; j++) {
		index->stat_n_diff_key_vals[j]
			= ((n_diff[j]
			    * (ib_int64_t)index->stat_n_leaf_pages
			    + n_sample_pages - 1
			    + total_external_size
			    + not_empty_flag)
			   / (n_sample_pages
			      + total_external_size));

		/* If the tree is small, smaller than
		10 * n_sample_pages + total_external_size, then
		the above estimate is ok. For bigger trees it is common that we
		do not see any borders between key values in the few pages
		we pick. But still there may be n_sample_pages
		different key values, or even more. Let us try to approximate
		that: */

		add_on = index->stat_n_leaf_pages
			/ (10 * (n_sample_pages
				 + total_external_size));

		if (add_on > n_sample_pages) {
			add_on = n_sample_pages;
		}

		index->stat_n_diff_key_vals[j] += add_on;
	}

	mem_free(n_diff);
	/* heap is lazily created by rec_get_offsets() only when the
	stack-based offset buffers were too small. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
 
3325
 
 
3326
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
 
3327
 
 
3328
/***************************************************************
 
3329
Gets the externally stored size of a record, in units of a database page. */
 
3330
static
 
3331
ulint
 
3332
btr_rec_get_externally_stored_len(
 
3333
/*==============================*/
 
3334
                                /* out: externally stored part,
 
3335
                                in units of a database page */
 
3336
        rec_t*          rec,    /* in: record */
 
3337
        const ulint*    offsets)/* in: array returned by rec_get_offsets() */
 
3338
{
 
3339
        ulint   n_fields;
 
3340
        byte*   data;
 
3341
        ulint   local_len;
 
3342
        ulint   extern_len;
 
3343
        ulint   total_extern_len = 0;
 
3344
        ulint   i;
 
3345
 
 
3346
        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
 
3347
        n_fields = rec_offs_n_fields(offsets);
 
3348
 
 
3349
        for (i = 0; i < n_fields; i++) {
 
3350
                if (rec_offs_nth_extern(offsets, i)) {
 
3351
 
 
3352
                        data = rec_get_nth_field(rec, offsets, i, &local_len);
 
3353
 
 
3354
                        local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
3355
 
 
3356
                        extern_len = mach_read_from_4(data + local_len
 
3357
                                                      + BTR_EXTERN_LEN + 4);
 
3358
 
 
3359
                        total_extern_len += ut_calc_align(extern_len,
 
3360
                                                          UNIV_PAGE_SIZE);
 
3361
                }
 
3362
        }
 
3363
 
 
3364
        return(total_extern_len / UNIV_PAGE_SIZE);
 
3365
}
 
3366
 
 
3367
/***********************************************************************
 
3368
Sets the ownership bit of an externally stored field in a record. */
 
3369
static
 
3370
void
 
3371
btr_cur_set_ownership_of_extern_field(
 
3372
/*==================================*/
 
3373
        page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
 
3374
                                part will be updated, or NULL */
 
3375
        rec_t*          rec,    /* in/out: clustered index record */
 
3376
        dict_index_t*   index,  /* in: index of the page */
 
3377
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
 
3378
        ulint           i,      /* in: field number */
 
3379
        ibool           val,    /* in: value to set */
 
3380
        mtr_t*          mtr)    /* in: mtr, or NULL if not logged */
 
3381
{
 
3382
        byte*   data;
 
3383
        ulint   local_len;
 
3384
        ulint   byte_val;
 
3385
 
 
3386
        data = rec_get_nth_field(rec, offsets, i, &local_len);
 
3387
 
 
3388
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
3389
 
 
3390
        local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
3391
 
 
3392
        byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
 
3393
 
 
3394
        if (val) {
 
3395
                byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
 
3396
        } else {
 
3397
                byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
 
3398
        }
 
3399
 
 
3400
        if (UNIV_LIKELY_NULL(page_zip)) {
 
3401
                mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
 
3402
                page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
 
3403
        } else if (UNIV_LIKELY(mtr != NULL)) {
 
3404
 
 
3405
                mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
 
3406
                                 MLOG_1BYTE, mtr);
 
3407
        } else {
 
3408
                mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
 
3409
        }
 
3410
}
 
3411
 
 
3412
/***********************************************************************
 
3413
Marks not updated extern fields as not-owned by this record. The ownership
 
3414
is transferred to the updated record which is inserted elsewhere in the
 
3415
index tree. In purge only the owner of externally stored field is allowed
 
3416
to free the field. */
 
3417
UNIV_INTERN
 
3418
void
 
3419
btr_cur_mark_extern_inherited_fields(
 
3420
/*=================================*/
 
3421
        page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
 
3422
                                part will be updated, or NULL */
 
3423
        rec_t*          rec,    /* in/out: record in a clustered index */
 
3424
        dict_index_t*   index,  /* in: index of the page */
 
3425
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
 
3426
        const upd_t*    update, /* in: update vector */
 
3427
        mtr_t*          mtr)    /* in: mtr, or NULL if not logged */
 
3428
{
 
3429
        ulint   n;
 
3430
        ulint   j;
 
3431
        ulint   i;
 
3432
 
 
3433
        ut_ad(rec_offs_validate(rec, NULL, offsets));
 
3434
        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
 
3435
 
 
3436
        if (!rec_offs_any_extern(offsets)) {
 
3437
 
 
3438
                return;
 
3439
        }
 
3440
 
 
3441
        n = rec_offs_n_fields(offsets);
 
3442
 
 
3443
        for (i = 0; i < n; i++) {
 
3444
                if (rec_offs_nth_extern(offsets, i)) {
 
3445
 
 
3446
                        /* Check it is not in updated fields */
 
3447
 
 
3448
                        if (update) {
 
3449
                                for (j = 0; j < upd_get_n_fields(update);
 
3450
                                     j++) {
 
3451
                                        if (upd_get_nth_field(update, j)
 
3452
                                            ->field_no == i) {
 
3453
 
 
3454
                                                goto updated;
 
3455
                                        }
 
3456
                                }
 
3457
                        }
 
3458
 
 
3459
                        btr_cur_set_ownership_of_extern_field(
 
3460
                                page_zip, rec, index, offsets, i, FALSE, mtr);
 
3461
updated:
 
3462
                        ;
 
3463
                }
 
3464
        }
 
3465
}
 
3466
 
 
3467
/***********************************************************************
 
3468
The complement of the previous function: in an update entry may inherit
 
3469
some externally stored fields from a record. We must mark them as inherited
 
3470
in entry, so that they are not freed in a rollback. */
 
3471
UNIV_INTERN
 
3472
void
 
3473
btr_cur_mark_dtuple_inherited_extern(
 
3474
/*=================================*/
 
3475
        dtuple_t*       entry,          /* in/out: updated entry to be
 
3476
                                        inserted to clustered index */
 
3477
        const upd_t*    update)         /* in: update vector */
 
3478
{
 
3479
        ulint           i;
 
3480
 
 
3481
        for (i = 0; i < dtuple_get_n_fields(entry); i++) {
 
3482
 
 
3483
                dfield_t*       dfield = dtuple_get_nth_field(entry, i);
 
3484
                byte*           data;
 
3485
                ulint           len;
 
3486
                ulint           j;
 
3487
 
 
3488
                if (!dfield_is_ext(dfield)) {
 
3489
                        continue;
 
3490
                }
 
3491
 
 
3492
                /* Check if it is in updated fields */
 
3493
 
 
3494
                for (j = 0; j < upd_get_n_fields(update); j++) {
 
3495
                        if (upd_get_nth_field(update, j)->field_no == i) {
 
3496
 
 
3497
                                goto is_updated;
 
3498
                        }
 
3499
                }
 
3500
 
 
3501
                data = dfield_get_data(dfield);
 
3502
                len = dfield_get_len(dfield);
 
3503
                data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
 
3504
                        |= BTR_EXTERN_INHERITED_FLAG;
 
3505
 
 
3506
is_updated:
 
3507
                ;
 
3508
        }
 
3509
}
 
3510
 
 
3511
/***********************************************************************
 
3512
Marks all extern fields in a record as owned by the record. This function
 
3513
should be called if the delete mark of a record is removed: a not delete
 
3514
marked record always owns all its extern fields. */
 
3515
static
 
3516
void
 
3517
btr_cur_unmark_extern_fields(
 
3518
/*=========================*/
 
3519
        page_zip_des_t* page_zip,/* in/out: compressed page whose uncompressed
 
3520
                                part will be updated, or NULL */
 
3521
        rec_t*          rec,    /* in/out: record in a clustered index */
 
3522
        dict_index_t*   index,  /* in: index of the page */
 
3523
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
 
3524
        mtr_t*          mtr)    /* in: mtr, or NULL if not logged */
 
3525
{
 
3526
        ulint   n;
 
3527
        ulint   i;
 
3528
 
 
3529
        ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
 
3530
        n = rec_offs_n_fields(offsets);
 
3531
 
 
3532
        if (!rec_offs_any_extern(offsets)) {
 
3533
 
 
3534
                return;
 
3535
        }
 
3536
 
 
3537
        for (i = 0; i < n; i++) {
 
3538
                if (rec_offs_nth_extern(offsets, i)) {
 
3539
 
 
3540
                        btr_cur_set_ownership_of_extern_field(
 
3541
                                page_zip, rec, index, offsets, i, TRUE, mtr);
 
3542
                }
 
3543
        }
 
3544
}
 
3545
 
 
3546
/***********************************************************************
 
3547
Marks all extern fields in a dtuple as owned by the record. */
 
3548
UNIV_INTERN
 
3549
void
 
3550
btr_cur_unmark_dtuple_extern_fields(
 
3551
/*================================*/
 
3552
        dtuple_t*       entry)          /* in/out: clustered index entry */
 
3553
{
 
3554
        ulint   i;
 
3555
 
 
3556
        for (i = 0; i < dtuple_get_n_fields(entry); i++) {
 
3557
                dfield_t* dfield = dtuple_get_nth_field(entry, i);
 
3558
 
 
3559
                if (dfield_is_ext(dfield)) {
 
3560
                        byte*   data = dfield_get_data(dfield);
 
3561
                        ulint   len = dfield_get_len(dfield);
 
3562
 
 
3563
                        data[len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN]
 
3564
                                &= ~BTR_EXTERN_OWNER_FLAG;
 
3565
                }
 
3566
        }
 
3567
}
 
3568
 
 
3569
/***********************************************************************
Flags the data tuple fields that are marked as extern storage in the
update vector.  We use this function to remember which fields we must
mark as extern storage in a record inserted for an update. */
UNIV_INTERN
ulint
btr_push_update_extern_fields(
/*==========================*/
				/* out: number of flagged external columns */
	dtuple_t*	tuple,	/* in/out: data tuple */
	const upd_t*	update,	/* in: update vector */
	mem_heap_t*	heap)	/* in: memory heap */
{
	ulint			n_pushed	= 0;
	ulint			n;
	const upd_field_t*	uf;

	ut_ad(tuple);
	ut_ad(update);

	uf = update->fields;
	n = upd_get_n_fields(update);

	/* Walk every update-vector field; each one whose new value is
	externally stored must be flagged as external in the tuple too. */
	for (; n--; uf++) {
		if (dfield_is_ext(&uf->new_val)) {
			dfield_t*	field
				= dtuple_get_nth_field(tuple, uf->field_no);

			if (!dfield_is_ext(field)) {
				/* Only count fields that were not
				already flagged as external. */
				dfield_set_ext(field);
				n_pushed++;
			}

			/* uf->orig_len is the length of the locally
			stored part of the column before the update. */
			switch (uf->orig_len) {
				byte*	data;
				ulint	len;
				byte*	buf;
			case 0:
				/* No locally stored prefix to restore. */
				break;
			case BTR_EXTERN_FIELD_REF_SIZE:
				/* Restore the original locally stored
				part of the column.  In the undo log,
				InnoDB writes a longer prefix of externally
				stored columns, so that column prefixes
				in secondary indexes can be reconstructed. */
				dfield_set_data(field, (byte*) dfield_get_data(field)
						+ dfield_get_len(field)
						- BTR_EXTERN_FIELD_REF_SIZE,
						BTR_EXTERN_FIELD_REF_SIZE);
				dfield_set_ext(field);
				break;
			default:
				/* Reconstruct the original locally
				stored part of the column.  The data
				will have to be copied. */
				ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);

				data = dfield_get_data(field);
				len = dfield_get_len(field);

				buf = mem_heap_alloc(heap, uf->orig_len);
				/* Copy the locally stored prefix. */
				memcpy(buf, data,
				       uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE);
				/* Copy the BLOB pointer. */
				memcpy(buf + uf->orig_len
				       - BTR_EXTERN_FIELD_REF_SIZE,
				       data + len - BTR_EXTERN_FIELD_REF_SIZE,
				       BTR_EXTERN_FIELD_REF_SIZE);

				dfield_set_data(field, buf, uf->orig_len);
				dfield_set_ext(field);
			}
		}
	}

	return(n_pushed);
}
 
3648
 
 
3649
/***********************************************************************
 
3650
Returns the length of a BLOB part stored on the header page. */
 
3651
static
 
3652
ulint
 
3653
btr_blob_get_part_len(
 
3654
/*==================*/
 
3655
                                        /* out: part length */
 
3656
        const byte*     blob_header)    /* in: blob header */
 
3657
{
 
3658
        return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
 
3659
}
 
3660
 
 
3661
/***********************************************************************
 
3662
Returns the page number where the next BLOB part is stored. */
 
3663
static
 
3664
ulint
 
3665
btr_blob_get_next_page_no(
 
3666
/*======================*/
 
3667
                                        /* out: page number or FIL_NULL if
 
3668
                                        no more pages */
 
3669
        const byte*     blob_header)    /* in: blob header */
 
3670
{
 
3671
        return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
 
3672
}
 
3673
 
 
3674
/***********************************************************************
Deallocate a buffer block that was reserved for a BLOB part. */
static
void
btr_blob_free(
/*==========*/
	buf_block_t*	block,	/* in: buffer block */
	ibool		all,	/* in: TRUE=remove also the compressed page
				if there is one */
	mtr_t*		mtr)	/* in: mini-transaction to commit */
{
	/* Remember the page identity before committing the mtr: after
	the commit the block may be evicted and reused for another page. */
	ulint	space	= buf_block_get_space(block);
	ulint	page_no	= buf_block_get_page_no(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	mtr_commit(mtr);

	buf_pool_mutex_enter();
	mutex_enter(&block->mutex);

	/* Only free the block if it is still allocated to
	the same file page. */

	if (buf_block_get_state(block)
	    == BUF_BLOCK_FILE_PAGE
	    && buf_block_get_space(block) == space
	    && buf_block_get_page_no(block) == page_no) {

		if (buf_LRU_free_block(&block->page, all, NULL)
		    != BUF_LRU_FREED
		    && all && block->page.zip.data) {
			/* Attempt to deallocate the uncompressed page
			if the whole block cannot be deallocated. */

			buf_LRU_free_block(&block->page, FALSE, NULL);
		}
	}

	buf_pool_mutex_exit();
	mutex_exit(&block->mutex);
}
 
3716
 
 
3717
/***********************************************************************
Stores the fields in big_rec_vec to the tablespace and puts pointers to
them in rec.  The extern flags in rec will have to be set beforehand.
The fields are stored on pages allocated from leaf node
file segment of the index tree. */
UNIV_INTERN
ulint
btr_store_big_rec_extern_fields(
/*============================*/
					/* out: DB_SUCCESS or error */
	dict_index_t*	index,		/* in: index of rec; the index tree
					MUST be X-latched */
	buf_block_t*	rec_block,	/* in/out: block containing rec */
	rec_t*		rec,		/* in/out: record */
	const ulint*	offsets,	/* in: rec_get_offsets(rec, index);
					the "external storage" flags in offsets
					will not correspond to rec when
					this function returns */
	big_rec_t*	big_rec_vec,	/* in: vector containing fields
					to be stored externally */
	mtr_t*		local_mtr __attribute__((unused))) /* in: mtr
					containing the latch to rec and to the
					tree */
{
	ulint	rec_page_no;
	byte*	field_ref;	/* BLOB pointer inside rec for the
				current field */
	ulint	extern_len;	/* bytes of the field still to store */
	ulint	store_len;	/* bytes stored on the current page */
	ulint	page_no;
	ulint	space_id;
	ulint	zip_size;
	ulint	prev_page_no;	/* FIL_NULL while on the first BLOB page
				of a field */
	ulint	hint_page_no;
	ulint	i;
	mtr_t	mtr;		/* each BLOB page is written in its own
				mini-transaction */
	mem_heap_t* heap = NULL;
	page_zip_des_t* page_zip;
	z_stream c_stream;	/* zlib stream; used only when the index
				page is compressed */

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
	ut_a(dict_index_is_clust(index));

	page_zip = buf_block_get_page_zip(rec_block);
	ut_a(dict_table_zip_size(index->table)
	     == buf_block_get_zip_size(rec_block));

	space_id = buf_block_get_space(rec_block);
	zip_size = buf_block_get_zip_size(rec_block);
	rec_page_no = buf_block_get_page_no(rec_block);
	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);

	if (UNIV_LIKELY_NULL(page_zip)) {
		int	err;

		/* Zlib deflate needs 128 kilobytes for the default
		window size, plus 512 << memLevel, plus a few
		kilobytes for small objects.  We use reduced memLevel
		to limit the memory consumption, and preallocate the
		heap, hoping to avoid memory fragmentation. */
		heap = mem_heap_create(250000);
		page_zip_set_alloc(&c_stream, heap);

		err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
		ut_a(err == Z_OK);
	}

	/* We have to create a file segment to the tablespace
	for each field and put the pointer to the field in rec */

	for (i = 0; i < big_rec_vec->n_fields; i++) {
		ut_ad(rec_offs_nth_extern(offsets,
					  big_rec_vec->fields[i].field_no));
		{
			/* Locate the 20-byte BLOB pointer at the end of
			the locally stored part of the field. */
			ulint	local_len;
			field_ref = rec_get_nth_field(
				rec, offsets, big_rec_vec->fields[i].field_no,
				&local_len);
			ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
			local_len -= BTR_EXTERN_FIELD_REF_SIZE;
			field_ref += local_len;
		}
		extern_len = big_rec_vec->fields[i].len;

		ut_a(extern_len > 0);

		prev_page_no = FIL_NULL;

		if (UNIV_LIKELY_NULL(page_zip)) {
			/* Feed the whole field into a fresh zlib stream;
			the output is consumed page by page below. */
			int	err = deflateReset(&c_stream);
			ut_a(err == Z_OK);

			c_stream.next_in = (void*) big_rec_vec->fields[i].data;
			c_stream.avail_in = extern_len;
		}

		/* Allocate BLOB pages for this field, one mtr each,
		until the whole field has been stored. */
		for (;;) {
			buf_block_t*	block;
			page_t*		page;

			mtr_start(&mtr);

			if (prev_page_no == FIL_NULL) {
				hint_page_no = 1 + rec_page_no;
			} else {
				hint_page_no = prev_page_no + 1;
			}

			block = btr_page_alloc(index, hint_page_no,
					       FSP_NO_DIR, 0, &mtr);
			if (UNIV_UNLIKELY(block == NULL)) {

				mtr_commit(&mtr);

				if (UNIV_LIKELY_NULL(page_zip)) {
					deflateEnd(&c_stream);
					mem_heap_free(heap);
				}

				return(DB_OUT_OF_FILE_SPACE);
			}

			page_no = buf_block_get_page_no(block);
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				/* Link the previous BLOB page of this
				field to the newly allocated one. */
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(space_id, zip_size,
							  prev_page_no,
							  RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (UNIV_LIKELY_NULL(page_zip)) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}

			}

			if (UNIV_LIKELY_NULL(page_zip)) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				mach_write_to_2(page + FIL_PAGE_TYPE,
						prev_page_no == FIL_NULL
						? FIL_PAGE_TYPE_ZBLOB
						: FIL_PAGE_TYPE_ZBLOB2);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out
					= page_zip_get_size(page_zip)
					- FIL_PAGE_DATA;

				/* Z_STREAM_END means the whole field has
				been compressed; Z_OK with no output space
				left means another page is needed. */
				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area.  This
				information could be useful in
				debugging.  Later, we might want to
				implement the possibility to relocate
				BLOB pages.  Then, we would need to be
				able to adjust the BLOB pointer in the
				record.  We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize().  However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged. */
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page + FIL_PAGE_TYPE,
						page_zip_get_size(page_zip)
						- FIL_PAGE_TYPE,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {
					/* More data to compress and the BLOB
					pointer was already initialized on the
					first page: skip updating it. */
					goto next_zip_page;
				}

				rec_block = buf_page_get(space_id, zip_size,
							 rec_page_no,
							 RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(rec_block,
							SYNC_NO_ORDER_CHECK);

				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					/* First page of the field: record
					where the BLOB chain starts. */
					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				page_zip_write_blob_ptr(
					page_zip, rec, index, offsets,
					big_rec_vec->fields[i].field_no, &mtr);

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				/* Store at most one page's worth of the
				remaining data on this BLOB page. */
				if (extern_len > (UNIV_PAGE_SIZE
						  - FIL_PAGE_DATA
						  - BTR_BLOB_HDR_SIZE
						  - FIL_PAGE_DATA_END)) {
					store_len = UNIV_PAGE_SIZE
						- FIL_PAGE_DATA
						- BTR_BLOB_HDR_SIZE
						- FIL_PAGE_DATA_END;
				} else {
					store_len = extern_len;
				}

				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				rec_block = buf_page_get(space_id, zip_size,
							 rec_page_no,
							 RW_X_LATCH, &mtr);
				buf_block_dbg_add_level(rec_block,
							SYNC_NO_ORDER_CHECK);

				/* Update the stored length of the external
				part in the BLOB pointer. */
				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, &mtr);

				if (prev_page_no == FIL_NULL) {
					/* First page of the field: record
					where the BLOB chain starts. */
					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id,
							 MLOG_4BYTES, &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no,
							 MLOG_4BYTES, &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES, &mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}
	}

	if (UNIV_LIKELY_NULL(page_zip)) {
		deflateEnd(&c_stream);
		mem_heap_free(heap);
	}

	return(DB_SUCCESS);
}
 
4073
 
 
4074
/***********************************************************************
 
4075
Frees the space in an externally stored field to the file space
 
4076
management if the field in data is owned by the externally stored field,
 
4077
in a rollback we may have the additional condition that the field must
 
4078
not be inherited. */
 
4079
UNIV_INTERN
 
4080
void
 
4081
btr_free_externally_stored_field(
 
4082
/*=============================*/
 
4083
        dict_index_t*   index,          /* in: index of the data, the index
 
4084
                                        tree MUST be X-latched; if the tree
 
4085
                                        height is 1, then also the root page
 
4086
                                        must be X-latched! (this is relevant
 
4087
                                        in the case this function is called
 
4088
                                        from purge where 'data' is located on
 
4089
                                        an undo log page, not an index
 
4090
                                        page) */
 
4091
        byte*           field_ref,      /* in/out: field reference */
 
4092
        const rec_t*    rec,            /* in: record containing field_ref, for
 
4093
                                        page_zip_write_blob_ptr(), or NULL */
 
4094
        const ulint*    offsets,        /* in: rec_get_offsets(rec, index),
 
4095
                                        or NULL */
 
4096
        page_zip_des_t* page_zip,       /* in: compressed page corresponding
 
4097
                                        to rec, or NULL if rec == NULL */
 
4098
        ulint           i,              /* in: field number of field_ref;
 
4099
                                        ignored if rec == NULL */
 
4100
        enum trx_rb_ctx rb_ctx,         /* in: rollback context */
 
4101
        mtr_t*          local_mtr __attribute__((unused))) /* in: mtr
 
4102
                                        containing the latch to data an an
 
4103
                                        X-latch to the index tree */
 
4104
{
 
4105
        page_t*         page;
 
4106
        ulint           space_id;
 
4107
        ulint           rec_zip_size = dict_table_zip_size(index->table);
 
4108
        ulint           ext_zip_size;
 
4109
        ulint           page_no;
 
4110
        ulint           next_page_no;
 
4111
        mtr_t           mtr;
 
4112
#ifdef UNIV_DEBUG
 
4113
        ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
 
4114
                                MTR_MEMO_X_LOCK));
 
4115
        ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
 
4116
                                     MTR_MEMO_PAGE_X_FIX));
 
4117
        ut_ad(!rec || rec_offs_validate(rec, index, offsets));
 
4118
 
 
4119
        if (rec) {
 
4120
                ulint   local_len;
 
4121
                const byte*     f = rec_get_nth_field(rec, offsets,
 
4122
                                                      i, &local_len);
 
4123
                ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
4124
                local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
4125
                f += local_len;
 
4126
                ut_ad(f == field_ref);
 
4127
        }
 
4128
#endif /* UNIV_DEBUG */
 
4129
 
 
4130
        if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
 
4131
                                  BTR_EXTERN_FIELD_REF_SIZE))) {
 
4132
                /* In the rollback of uncommitted transactions, we may
 
4133
                encounter a clustered index record whose BLOBs have
 
4134
                not been written.  There is nothing to free then. */
 
4135
                ut_a(rb_ctx == RB_RECOVERY);
 
4136
                return;
 
4137
        }
 
4138
 
 
4139
        space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
 
4140
 
 
4141
        if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
 
4142
                ext_zip_size = fil_space_get_zip_size(space_id);
 
4143
                /* This must be an undo log record in the system tablespace,
 
4144
                that is, in row_purge_upd_exist_or_extern().
 
4145
                Currently, externally stored records are stored in the
 
4146
                same tablespace as the referring records. */
 
4147
                ut_ad(!page_get_space_id(page_align(field_ref)));
 
4148
                ut_ad(!rec);
 
4149
                ut_ad(!page_zip);
 
4150
        } else {
 
4151
                ext_zip_size = rec_zip_size;
 
4152
        }
 
4153
 
 
4154
        if (!rec) {
 
4155
                /* This is a call from row_purge_upd_exist_or_extern(). */
 
4156
                ut_ad(!page_zip);
 
4157
                rec_zip_size = 0;
 
4158
        }
 
4159
 
 
4160
        for (;;) {
 
4161
                buf_block_t*    rec_block;
 
4162
                buf_block_t*    ext_block;
 
4163
 
 
4164
                mtr_start(&mtr);
 
4165
 
 
4166
                rec_block = buf_page_get(page_get_space_id(
 
4167
                                                 page_align(field_ref)),
 
4168
                                         rec_zip_size,
 
4169
                                         page_get_page_no(
 
4170
                                                 page_align(field_ref)),
 
4171
                                         RW_X_LATCH, &mtr);
 
4172
                buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
 
4173
                page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
 
4174
 
 
4175
                if (/* There is no external storage data */
 
4176
                    page_no == FIL_NULL
 
4177
                    /* This field does not own the externally stored field */
 
4178
                    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
 
4179
                        & BTR_EXTERN_OWNER_FLAG)
 
4180
                    /* Rollback and inherited field */
 
4181
                    || (rb_ctx != RB_NONE
 
4182
                        && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
 
4183
                            & BTR_EXTERN_INHERITED_FLAG))) {
 
4184
 
 
4185
                        /* Do not free */
 
4186
                        mtr_commit(&mtr);
 
4187
 
 
4188
                        return;
 
4189
                }
 
4190
 
 
4191
                ext_block = buf_page_get(space_id, ext_zip_size, page_no,
 
4192
                                         RW_X_LATCH, &mtr);
 
4193
                buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
 
4194
                page = buf_block_get_frame(ext_block);
 
4195
 
 
4196
                if (ext_zip_size) {
 
4197
                        /* Note that page_zip will be NULL
 
4198
                        in row_purge_upd_exist_or_extern(). */
 
4199
                        switch (fil_page_get_type(page)) {
 
4200
                        case FIL_PAGE_TYPE_ZBLOB:
 
4201
                        case FIL_PAGE_TYPE_ZBLOB2:
 
4202
                                break;
 
4203
                        default:
 
4204
                                ut_error;
 
4205
                        }
 
4206
                        next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
 
4207
 
 
4208
                        btr_page_free_low(index, ext_block, 0, &mtr);
 
4209
 
 
4210
                        if (UNIV_LIKELY(page_zip != NULL)) {
 
4211
                                mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
 
4212
                                                next_page_no);
 
4213
                                mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
 
4214
                                                0);
 
4215
                                page_zip_write_blob_ptr(page_zip, rec, index,
 
4216
                                                        offsets, i, &mtr);
 
4217
                        } else {
 
4218
                                mlog_write_ulint(field_ref
 
4219
                                                 + BTR_EXTERN_PAGE_NO,
 
4220
                                                 next_page_no,
 
4221
                                                 MLOG_4BYTES, &mtr);
 
4222
                                mlog_write_ulint(field_ref
 
4223
                                                 + BTR_EXTERN_LEN + 4, 0,
 
4224
                                                 MLOG_4BYTES, &mtr);
 
4225
                        }
 
4226
                } else {
 
4227
                        ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_BLOB);
 
4228
                        ut_a(!page_zip);
 
4229
 
 
4230
                        next_page_no = mach_read_from_4(
 
4231
                                page + FIL_PAGE_DATA
 
4232
                                + BTR_BLOB_HDR_NEXT_PAGE_NO);
 
4233
 
 
4234
                        /* We must supply the page level (= 0) as an argument
 
4235
                        because we did not store it on the page (we save the
 
4236
                        space overhead from an index page header. */
 
4237
 
 
4238
                        ut_a(space_id == page_get_space_id(page));
 
4239
                        ut_a(page_no == page_get_page_no(page));
 
4240
 
 
4241
                        btr_page_free_low(index, ext_block, 0, &mtr);
 
4242
 
 
4243
                        mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
 
4244
                                         next_page_no,
 
4245
                                         MLOG_4BYTES, &mtr);
 
4246
                        /* Zero out the BLOB length.  If the server
 
4247
                        crashes during the execution of this function,
 
4248
                        trx_rollback_or_clean_all_recovered() could
 
4249
                        dereference the half-deleted BLOB, fetching a
 
4250
                        wrong prefix for the BLOB. */
 
4251
                        mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
 
4252
                                         0,
 
4253
                                         MLOG_4BYTES, &mtr);
 
4254
                }
 
4255
 
 
4256
                /* Commit mtr and release the BLOB block to save memory. */
 
4257
                btr_blob_free(ext_block, TRUE, &mtr);
 
4258
        }
 
4259
}
 
4260
 
 
4261
/***************************************************************
 
4262
Frees the externally stored fields for a record. */
 
4263
static
 
4264
void
 
4265
btr_rec_free_externally_stored_fields(
 
4266
/*==================================*/
 
4267
        dict_index_t*   index,  /* in: index of the data, the index
 
4268
                                tree MUST be X-latched */
 
4269
        rec_t*          rec,    /* in/out: record */
 
4270
        const ulint*    offsets,/* in: rec_get_offsets(rec, index) */
 
4271
        page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
 
4272
                                part will be updated, or NULL */
 
4273
        enum trx_rb_ctx rb_ctx, /* in: rollback context */
 
4274
        mtr_t*          mtr)    /* in: mini-transaction handle which contains
 
4275
                                an X-latch to record page and to the index
 
4276
                                tree */
 
4277
{
 
4278
        ulint   n_fields;
 
4279
        ulint   i;
 
4280
 
 
4281
        ut_ad(rec_offs_validate(rec, index, offsets));
 
4282
        ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
 
4283
        /* Free possible externally stored fields in the record */
 
4284
 
 
4285
        ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
 
4286
        n_fields = rec_offs_n_fields(offsets);
 
4287
 
 
4288
        for (i = 0; i < n_fields; i++) {
 
4289
                if (rec_offs_nth_extern(offsets, i)) {
 
4290
                        ulint   len;
 
4291
                        byte*   data
 
4292
                                = rec_get_nth_field(rec, offsets, i, &len);
 
4293
                        ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
 
4294
 
 
4295
                        btr_free_externally_stored_field(
 
4296
                                index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
 
4297
                                rec, offsets, page_zip, i, rb_ctx, mtr);
 
4298
                }
 
4299
        }
 
4300
}
 
4301
 
 
4302
/***************************************************************
 
4303
Frees the externally stored fields for a record, if the field is mentioned
 
4304
in the update vector. */
 
4305
static
 
4306
void
 
4307
btr_rec_free_updated_extern_fields(
 
4308
/*===============================*/
 
4309
        dict_index_t*   index,  /* in: index of rec; the index tree MUST be
 
4310
                                X-latched */
 
4311
        rec_t*          rec,    /* in/out: record */
 
4312
        page_zip_des_t* page_zip,/* in: compressed page whose uncompressed
 
4313
                                part will be updated, or NULL */
 
4314
        const ulint*    offsets,/* in: rec_get_offsets(rec, index) */
 
4315
        const upd_t*    update, /* in: update vector */
 
4316
        enum trx_rb_ctx rb_ctx, /* in: rollback context */
 
4317
        mtr_t*          mtr)    /* in: mini-transaction handle which contains
 
4318
                                an X-latch to record page and to the tree */
 
4319
{
 
4320
        ulint   n_fields;
 
4321
        ulint   i;
 
4322
 
 
4323
        ut_ad(rec_offs_validate(rec, index, offsets));
 
4324
        ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
 
4325
 
 
4326
        /* Free possible externally stored fields in the record */
 
4327
 
 
4328
        n_fields = upd_get_n_fields(update);
 
4329
 
 
4330
        for (i = 0; i < n_fields; i++) {
 
4331
                const upd_field_t* ufield = upd_get_nth_field(update, i);
 
4332
 
 
4333
                if (rec_offs_nth_extern(offsets, ufield->field_no)) {
 
4334
                        ulint   len;
 
4335
                        byte*   data = rec_get_nth_field(
 
4336
                                rec, offsets, ufield->field_no, &len);
 
4337
                        ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
 
4338
 
 
4339
                        btr_free_externally_stored_field(
 
4340
                                index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
 
4341
                                rec, offsets, page_zip,
 
4342
                                ufield->field_no, rb_ctx, mtr);
 
4343
                }
 
4344
        }
 
4345
}
 
4346
 
 
4347
/***********************************************************************
 
4348
Copies the prefix of an uncompressed BLOB.  The clustered index record
 
4349
that points to this BLOB must be protected by a lock or a page latch. */
 
4350
static
 
4351
ulint
 
4352
btr_copy_blob_prefix(
 
4353
/*=================*/
 
4354
                                /* out: number of bytes written to buf */
 
4355
        byte*           buf,    /* out: the externally stored part of
 
4356
                                the field, or a prefix of it */
 
4357
        ulint           len,    /* in: length of buf, in bytes */
 
4358
        ulint           space_id,/* in: space id of the BLOB pages */
 
4359
        ulint           page_no,/* in: page number of the first BLOB page */
 
4360
        ulint           offset) /* in: offset on the first BLOB page */
 
4361
{
 
4362
        ulint   copied_len      = 0;
 
4363
 
 
4364
        for (;;) {
 
4365
                mtr_t           mtr;
 
4366
                buf_block_t*    block;
 
4367
                const page_t*   page;
 
4368
                const byte*     blob_header;
 
4369
                ulint           part_len;
 
4370
                ulint           copy_len;
 
4371
 
 
4372
                mtr_start(&mtr);
 
4373
 
 
4374
                block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
 
4375
                buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
 
4376
                page = buf_block_get_frame(block);
 
4377
 
 
4378
                /* Unfortunately, FIL_PAGE_TYPE was uninitialized for
 
4379
                many pages until MySQL/InnoDB 5.1.7. */
 
4380
                /* ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_BLOB); */
 
4381
                blob_header = page + offset;
 
4382
                part_len = btr_blob_get_part_len(blob_header);
 
4383
                copy_len = ut_min(part_len, len - copied_len);
 
4384
 
 
4385
                memcpy(buf + copied_len,
 
4386
                       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
 
4387
                copied_len += copy_len;
 
4388
 
 
4389
                page_no = btr_blob_get_next_page_no(blob_header);
 
4390
 
 
4391
                mtr_commit(&mtr);
 
4392
 
 
4393
                if (page_no == FIL_NULL || copy_len != part_len) {
 
4394
                        return(copied_len);
 
4395
                }
 
4396
 
 
4397
                /* On other BLOB pages except the first the BLOB header
 
4398
                always is at the page data start: */
 
4399
 
 
4400
                offset = FIL_PAGE_DATA;
 
4401
 
 
4402
                ut_ad(copied_len <= len);
 
4403
        }
 
4404
}
 
4405
 
 
4406
/***********************************************************************
 
4407
Copies the prefix of a compressed BLOB.  The clustered index record
 
4408
that points to this BLOB must be protected by a lock or a page latch. */
 
4409
static
 
4410
void
 
4411
btr_copy_zblob_prefix(
 
4412
/*==================*/
 
4413
        z_stream*       d_stream,/* in/out: the decompressing stream */
 
4414
        ulint           zip_size,/* in: compressed BLOB page size */
 
4415
        ulint           space_id,/* in: space id of the BLOB pages */
 
4416
        ulint           page_no,/* in: page number of the first BLOB page */
 
4417
        ulint           offset) /* in: offset on the first BLOB page */
 
4418
{
 
4419
        ulint   page_type = FIL_PAGE_TYPE_ZBLOB;
 
4420
 
 
4421
        ut_ad(ut_is_2pow(zip_size));
 
4422
        ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
 
4423
        ut_ad(zip_size <= UNIV_PAGE_SIZE);
 
4424
        ut_ad(space_id);
 
4425
 
 
4426
        for (;;) {
 
4427
                buf_page_t*     bpage;
 
4428
                int             err;
 
4429
                ulint           next_page_no;
 
4430
 
 
4431
                /* There is no latch on bpage directly.  Instead,
 
4432
                bpage is protected by the B-tree page latch that
 
4433
                is being held on the clustered index record, or,
 
4434
                in row_merge_copy_blobs(), by an exclusive table lock. */
 
4435
                bpage = buf_page_get_zip(space_id, zip_size, page_no);
 
4436
 
 
4437
                if (UNIV_UNLIKELY(!bpage)) {
 
4438
                        ut_print_timestamp(stderr);
 
4439
                        fprintf(stderr,
 
4440
                                "  InnoDB: Cannot load"
 
4441
                                " compressed BLOB"
 
4442
                                " page %lu space %lu\n",
 
4443
                                (ulong) page_no, (ulong) space_id);
 
4444
                        return;
 
4445
                }
 
4446
 
 
4447
                if (UNIV_UNLIKELY
 
4448
                    (fil_page_get_type(bpage->zip.data) != page_type)) {
 
4449
                        ut_print_timestamp(stderr);
 
4450
                        fprintf(stderr,
 
4451
                                "  InnoDB: Unexpected type %lu of"
 
4452
                                " compressed BLOB"
 
4453
                                " page %lu space %lu\n",
 
4454
                                (ulong) fil_page_get_type(bpage->zip.data),
 
4455
                                (ulong) page_no, (ulong) space_id);
 
4456
                        goto end_of_blob;
 
4457
                }
 
4458
 
 
4459
                next_page_no = mach_read_from_4(bpage->zip.data + offset);
 
4460
 
 
4461
                if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
 
4462
                        /* When the BLOB begins at page header,
 
4463
                        the compressed data payload does not
 
4464
                        immediately follow the next page pointer. */
 
4465
                        offset = FIL_PAGE_DATA;
 
4466
                } else {
 
4467
                        offset += 4;
 
4468
                }
 
4469
 
 
4470
                d_stream->next_in = bpage->zip.data + offset;
 
4471
                d_stream->avail_in = zip_size - offset;
 
4472
 
 
4473
                err = inflate(d_stream, Z_NO_FLUSH);
 
4474
                switch (err) {
 
4475
                case Z_OK:
 
4476
                        if (!d_stream->avail_out) {
 
4477
                                goto end_of_blob;
 
4478
                        }
 
4479
                        break;
 
4480
                case Z_STREAM_END:
 
4481
                        if (next_page_no == FIL_NULL) {
 
4482
                                goto end_of_blob;
 
4483
                        }
 
4484
                        /* fall through */
 
4485
                default:
 
4486
inflate_error:
 
4487
                        ut_print_timestamp(stderr);
 
4488
                        fprintf(stderr,
 
4489
                                "  InnoDB: inflate() of"
 
4490
                                " compressed BLOB"
 
4491
                                " page %lu space %lu returned %d (%s)\n",
 
4492
                                (ulong) page_no, (ulong) space_id,
 
4493
                                err, d_stream->msg);
 
4494
                case Z_BUF_ERROR:
 
4495
                        goto end_of_blob;
 
4496
                }
 
4497
 
 
4498
                if (next_page_no == FIL_NULL) {
 
4499
                        if (!d_stream->avail_in) {
 
4500
                                ut_print_timestamp(stderr);
 
4501
                                fprintf(stderr,
 
4502
                                        "  InnoDB: unexpected end of"
 
4503
                                        " compressed BLOB"
 
4504
                                        " page %lu space %lu\n",
 
4505
                                        (ulong) page_no,
 
4506
                                        (ulong) space_id);
 
4507
                        } else {
 
4508
                                err = inflate(d_stream, Z_FINISH);
 
4509
                                switch (err) {
 
4510
                                case Z_STREAM_END:
 
4511
                                case Z_BUF_ERROR:
 
4512
                                        break;
 
4513
                                default:
 
4514
                                        goto inflate_error;
 
4515
                                }
 
4516
                        }
 
4517
 
 
4518
end_of_blob:
 
4519
                        buf_page_release_zip(bpage);
 
4520
                        return;
 
4521
                }
 
4522
 
 
4523
                buf_page_release_zip(bpage);
 
4524
 
 
4525
                /* On other BLOB pages except the first
 
4526
                the BLOB header always is at the page header: */
 
4527
 
 
4528
                page_no = next_page_no;
 
4529
                offset = FIL_PAGE_NEXT;
 
4530
                page_type = FIL_PAGE_TYPE_ZBLOB2;
 
4531
        }
 
4532
}
 
4533
 
 
4534
/***********************************************************************
 
4535
Copies the prefix of an externally stored field of a record.  The
 
4536
clustered index record that points to this BLOB must be protected by a
 
4537
lock or a page latch. */
 
4538
static
 
4539
ulint
 
4540
btr_copy_externally_stored_field_prefix_low(
 
4541
/*========================================*/
 
4542
                                /* out: number of bytes written to buf */
 
4543
        byte*           buf,    /* out: the externally stored part of
 
4544
                                the field, or a prefix of it */
 
4545
        ulint           len,    /* in: length of buf, in bytes */
 
4546
        ulint           zip_size,/* in: nonzero=compressed BLOB page size,
 
4547
                                zero for uncompressed BLOBs */
 
4548
        ulint           space_id,/* in: space id of the first BLOB page */
 
4549
        ulint           page_no,/* in: page number of the first BLOB page */
 
4550
        ulint           offset) /* in: offset on the first BLOB page */
 
4551
{
 
4552
        if (UNIV_UNLIKELY(len == 0)) {
 
4553
                return(0);
 
4554
        }
 
4555
 
 
4556
        if (UNIV_UNLIKELY(zip_size)) {
 
4557
                int             err;
 
4558
                z_stream        d_stream;
 
4559
                mem_heap_t*     heap;
 
4560
 
 
4561
                /* Zlib inflate needs 32 kilobytes for the default
 
4562
                window size, plus a few kilobytes for small objects. */
 
4563
                heap = mem_heap_create(40000);
 
4564
                page_zip_set_alloc(&d_stream, heap);
 
4565
 
 
4566
                err = inflateInit(&d_stream);
 
4567
                ut_a(err == Z_OK);
 
4568
 
 
4569
                d_stream.next_out = buf;
 
4570
                d_stream.avail_out = len;
 
4571
                d_stream.avail_in = 0;
 
4572
 
 
4573
                btr_copy_zblob_prefix(&d_stream, zip_size,
 
4574
                                      space_id, page_no, offset);
 
4575
                inflateEnd(&d_stream);
 
4576
                mem_heap_free(heap);
 
4577
                return(d_stream.total_out);
 
4578
        } else {
 
4579
                return(btr_copy_blob_prefix(buf, len, space_id,
 
4580
                                            page_no, offset));
 
4581
        }
 
4582
}
 
4583
 
 
4584
/***********************************************************************
 
4585
Copies the prefix of an externally stored field of a record.  The
 
4586
clustered index record must be protected by a lock or a page latch. */
 
4587
UNIV_INTERN
 
4588
ulint
 
4589
btr_copy_externally_stored_field_prefix(
 
4590
/*====================================*/
 
4591
                                /* out: the length of the copied field,
 
4592
                                or 0 if the column was being or has been
 
4593
                                deleted */
 
4594
        byte*           buf,    /* out: the field, or a prefix of it */
 
4595
        ulint           len,    /* in: length of buf, in bytes */
 
4596
        ulint           zip_size,/* in: nonzero=compressed BLOB page size,
 
4597
                                zero for uncompressed BLOBs */
 
4598
        const byte*     data,   /* in: 'internally' stored part of the
 
4599
                                field containing also the reference to
 
4600
                                the external part; must be protected by
 
4601
                                a lock or a page latch */
 
4602
        ulint           local_len)/* in: length of data, in bytes */
 
4603
{
 
4604
        ulint   space_id;
 
4605
        ulint   page_no;
 
4606
        ulint   offset;
 
4607
 
 
4608
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
4609
 
 
4610
        local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
4611
 
 
4612
        if (UNIV_UNLIKELY(local_len >= len)) {
 
4613
                memcpy(buf, data, len);
 
4614
                return(len);
 
4615
        }
 
4616
 
 
4617
        memcpy(buf, data, local_len);
 
4618
        data += local_len;
 
4619
 
 
4620
        ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
 
4621
 
 
4622
        if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
 
4623
                /* The externally stored part of the column has been
 
4624
                (partially) deleted.  Signal the half-deleted BLOB
 
4625
                to the caller. */
 
4626
 
 
4627
                return(0);
 
4628
        }
 
4629
 
 
4630
        space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
 
4631
 
 
4632
        page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
 
4633
 
 
4634
        offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
 
4635
 
 
4636
        return(local_len
 
4637
               + btr_copy_externally_stored_field_prefix_low(buf + local_len,
 
4638
                                                             len - local_len,
 
4639
                                                             zip_size,
 
4640
                                                             space_id, page_no,
 
4641
                                                             offset));
 
4642
}
 
4643
 
 
4644
/***********************************************************************
 
4645
Copies an externally stored field of a record to mem heap.  The
 
4646
clustered index record must be protected by a lock or a page latch. */
 
4647
static
 
4648
byte*
 
4649
btr_copy_externally_stored_field(
 
4650
/*=============================*/
 
4651
                                /* out: the whole field copied to heap */
 
4652
        ulint*          len,    /* out: length of the whole field */
 
4653
        const byte*     data,   /* in: 'internally' stored part of the
 
4654
                                field containing also the reference to
 
4655
                                the external part; must be protected by
 
4656
                                a lock or a page latch */
 
4657
        ulint           zip_size,/* in: nonzero=compressed BLOB page size,
 
4658
                                zero for uncompressed BLOBs */
 
4659
        ulint           local_len,/* in: length of data */
 
4660
        mem_heap_t*     heap)   /* in: mem heap */
 
4661
{
 
4662
        ulint   space_id;
 
4663
        ulint   page_no;
 
4664
        ulint   offset;
 
4665
        ulint   extern_len;
 
4666
        byte*   buf;
 
4667
 
 
4668
        ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
 
4669
 
 
4670
        local_len -= BTR_EXTERN_FIELD_REF_SIZE;
 
4671
 
 
4672
        space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
 
4673
 
 
4674
        page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
 
4675
 
 
4676
        offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
 
4677
 
 
4678
        /* Currently a BLOB cannot be bigger than 4 GB; we
 
4679
        leave the 4 upper bytes in the length field unused */
 
4680
 
 
4681
        extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
 
4682
 
 
4683
        buf = mem_heap_alloc(heap, local_len + extern_len);
 
4684
 
 
4685
        memcpy(buf, data, local_len);
 
4686
        *len = local_len
 
4687
                + btr_copy_externally_stored_field_prefix_low(buf + local_len,
 
4688
                                                              extern_len,
 
4689
                                                              zip_size,
 
4690
                                                              space_id,
 
4691
                                                              page_no, offset);
 
4692
 
 
4693
        return(buf);
 
4694
}
 
4695
 
 
4696
/***********************************************************************
 
4697
Copies an externally stored field of a record to mem heap. */
 
4698
UNIV_INTERN
 
4699
byte*
 
4700
btr_rec_copy_externally_stored_field(
 
4701
/*=================================*/
 
4702
                                /* out: the field copied to heap */
 
4703
        const rec_t*    rec,    /* in: record in a clustered index;
 
4704
                                must be protected by a lock or a page latch */
 
4705
        const ulint*    offsets,/* in: array returned by rec_get_offsets() */
 
4706
        ulint           zip_size,/* in: nonzero=compressed BLOB page size,
 
4707
                                zero for uncompressed BLOBs */
 
4708
        ulint           no,     /* in: field number */
 
4709
        ulint*          len,    /* out: length of the field */
 
4710
        mem_heap_t*     heap)   /* in: mem heap */
 
4711
{
 
4712
        ulint           local_len;
 
4713
        const byte*     data;
 
4714
 
 
4715
        ut_a(rec_offs_nth_extern(offsets, no));
 
4716
 
 
4717
        /* An externally stored field can contain some initial
 
4718
        data from the field, and in the last 20 bytes it has the
 
4719
        space id, page number, and offset where the rest of the
 
4720
        field data is stored, and the data length in addition to
 
4721
        the data stored locally. We may need to store some data
 
4722
        locally to get the local record length above the 128 byte
 
4723
        limit so that field offsets are stored in two bytes, and
 
4724
        the extern bit is available in those two bytes. */
 
4725
 
 
4726
        data = rec_get_nth_field(rec, offsets, no, &local_len);
 
4727
 
 
4728
        return(btr_copy_externally_stored_field(len, data,
 
4729
                                                zip_size, local_len, heap));
 
4730
}