/*****************************************************************************

Copyright (C) 1997, 2010, Innobase Oy. All Rights Reserved.
Copyright (C) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/***************************************************//**
Created 12/19/1997 Heikki Tuuri
*******************************************************/
39
#include "dict0dict.h"
40
#include "dict0boot.h"
46
#include "mach0data.h"
52
#include "lock0lock.h"
53
#include "eval0eval.h"
55
#include "pars0pars.h"
56
#include "row0mysql.h"
57
#include "read0read.h"
59
#include "ha_prototypes.h"
61
/* Maximum number of rows to prefetch; MySQL interface has another parameter */
62
#define SEL_MAX_N_PREFETCH 16
64
/* Number of rows fetched, after which to start prefetching; MySQL interface
65
has another parameter */
66
#define SEL_PREFETCH_LIMIT 1
68
/* When a select has accessed about this many pages, it returns control back
69
to que_run_threads: this is to allow canceling runaway queries */
71
#define SEL_COST_LIMIT 100
73
/* Flags for search shortcut */
75
#define SEL_EXHAUSTED 1
78
/********************************************************************//**
79
Returns TRUE if the user-defined column in a secondary index record
80
is alphabetically the same as the corresponding BLOB column in the clustered
82
NOTE: the comparison is NOT done as a binary comparison, but character
83
fields are compared with collation!
84
@return TRUE if the columns are equal */
87
row_sel_sec_rec_is_for_blob(
88
/*========================*/
89
ulint mtype, /*!< in: main type */
90
ulint prtype, /*!< in: precise type */
91
ulint mbminmaxlen, /*!< in: minimum and maximum length of
92
a multi-byte character */
93
const byte* clust_field, /*!< in: the locally stored part of
94
the clustered index column, including
95
the BLOB pointer; the clustered
96
index record must be covered by
97
a lock or a page latch to protect it
98
against deletion (rollback or purge) */
99
ulint clust_len, /*!< in: length of clust_field */
100
const byte* sec_field, /*!< in: column in secondary index */
101
ulint sec_len, /*!< in: length of sec_field */
102
ulint zip_size) /*!< in: compressed page size, or 0 */
105
byte buf[DICT_MAX_INDEX_COL_LEN];
107
ut_a(clust_len >= BTR_EXTERN_FIELD_REF_SIZE);
110
(!memcmp(clust_field + clust_len - BTR_EXTERN_FIELD_REF_SIZE,
111
field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
112
/* The externally stored field was not written yet.
113
This record should only be seen by
114
recv_recovery_rollback_active() or any
115
TRX_ISO_READ_UNCOMMITTED transactions. */
119
len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
121
clust_field, clust_len);
123
if (UNIV_UNLIKELY(len == 0)) {
124
/* The BLOB was being deleted as the server crashed.
125
There should not be any secondary index records
126
referring to this clustered index record, because
127
btr_free_externally_stored_field() is called after all
128
secondary index entries of the row have been purged. */
132
len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
133
sec_len, len, (const char*) buf);
135
return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
138
/********************************************************************//**
139
Returns TRUE if the user-defined column values in a secondary index record
140
are alphabetically the same as the corresponding columns in the clustered
142
NOTE: the comparison is NOT done as a binary comparison, but character
143
fields are compared with collation!
144
@return TRUE if the secondary record is equal to the corresponding
145
fields in the clustered record, when compared with collation;
146
FALSE if not equal or if the clustered record has been marked for deletion */
149
row_sel_sec_rec_is_for_clust_rec(
150
/*=============================*/
151
const rec_t* sec_rec, /*!< in: secondary index record */
152
dict_index_t* sec_index, /*!< in: secondary index */
153
const rec_t* clust_rec, /*!< in: clustered index record;
154
must be protected by a lock or
155
a page latch against deletion
156
in rollback or purge */
157
dict_index_t* clust_index) /*!< in: clustered index */
159
const byte* sec_field;
161
const byte* clust_field;
164
mem_heap_t* heap = NULL;
165
ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
166
ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
167
ulint* clust_offs = clust_offsets_;
168
ulint* sec_offs = sec_offsets_;
169
ibool is_equal = TRUE;
171
rec_offs_init(clust_offsets_);
172
rec_offs_init(sec_offsets_);
174
if (rec_get_deleted_flag(clust_rec,
175
dict_table_is_comp(clust_index->table))) {
177
/* The clustered index record is delete-marked;
178
it is not visible in the read view. Besides,
179
if there are any externally stored columns,
180
some of them may have already been purged. */
184
clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
185
ULINT_UNDEFINED, &heap);
186
sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
187
ULINT_UNDEFINED, &heap);
189
n = dict_index_get_n_ordering_defined_by_user(sec_index);
191
for (i = 0; i < n; i++) {
192
const dict_field_t* ifield;
193
const dict_col_t* col;
198
ifield = dict_index_get_nth_field(sec_index, i);
199
col = dict_field_get_col(ifield);
200
clust_pos = dict_col_get_clust_pos(col, clust_index);
202
clust_field = rec_get_nth_field(
203
clust_rec, clust_offs, clust_pos, &clust_len);
204
sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
208
if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
210
if (rec_offs_nth_extern(clust_offs, clust_pos)) {
211
len -= BTR_EXTERN_FIELD_REF_SIZE;
214
len = dtype_get_at_most_n_mbchars(
215
col->prtype, col->mbminmaxlen,
216
ifield->prefix_len, len, (char*) clust_field);
218
if (rec_offs_nth_extern(clust_offs, clust_pos)
220
if (!row_sel_sec_rec_is_for_blob(
221
col->mtype, col->prtype,
223
clust_field, clust_len,
226
clust_index->table))) {
234
if (0 != cmp_data_data(col->mtype, col->prtype,
236
sec_field, sec_len)) {
244
if (UNIV_LIKELY_NULL(heap)) {
250
/*********************************************************************//**
251
Creates a select node struct.
252
@return own: select node struct */
257
mem_heap_t* heap) /*!< in: memory heap where created */
261
node = static_cast<sel_node_t *>(mem_heap_alloc(heap, sizeof(sel_node_t)));
262
node->common.type = QUE_NODE_SELECT;
263
node->state = SEL_NODE_OPEN;
270
/*********************************************************************//**
271
Frees the memory private to a select node when a query graph is freed,
272
does not free the heap where the node was originally created. */
275
sel_node_free_private(
276
/*==================*/
277
sel_node_t* node) /*!< in: select node struct */
282
if (node->plans != NULL) {
283
for (i = 0; i < node->n_tables; i++) {
284
plan = sel_node_get_nth_plan(node, i);
286
btr_pcur_close(&(plan->pcur));
287
btr_pcur_close(&(plan->clust_pcur));
289
if (plan->old_vers_heap) {
290
mem_heap_free(plan->old_vers_heap);
296
/*********************************************************************//**
297
Evaluates the values in a select list. If there are aggregate functions,
298
their argument value is added to the aggregate total. */
301
sel_eval_select_list(
302
/*=================*/
303
sel_node_t* node) /*!< in: select node */
307
exp = node->select_list;
312
exp = que_node_get_next(exp);
316
/*********************************************************************//**
317
Assigns the values in the select list to the possible into-variables in
318
SELECT ... INTO ... */
321
sel_assign_into_var_values(
322
/*=======================*/
323
sym_node_t* var, /*!< in: first variable in a list of variables */
324
sel_node_t* node) /*!< in: select node */
333
exp = node->select_list;
338
eval_node_copy_val(var->alias, exp);
340
exp = que_node_get_next(exp);
341
var = static_cast<sym_node_t *>(que_node_get_next(var));
345
/*********************************************************************//**
346
Resets the aggregate value totals in the select list of an aggregate type
350
sel_reset_aggregate_vals(
351
/*=====================*/
352
sel_node_t* node) /*!< in: select node */
354
func_node_t* func_node;
356
ut_ad(node->is_aggregate);
358
func_node = static_cast<func_node_t *>(node->select_list);
361
eval_node_set_int_val(func_node, 0);
363
func_node = static_cast<func_node_t *>(que_node_get_next(func_node));
366
node->aggregate_already_fetched = FALSE;
369
/*********************************************************************//**
370
Copies the input variable values when an explicit cursor is opened. */
373
row_sel_copy_input_variable_vals(
374
/*=============================*/
375
sel_node_t* node) /*!< in: select node */
379
var = UT_LIST_GET_FIRST(node->copy_variables);
382
eval_node_copy_val(var, var->alias);
384
var->indirection = NULL;
386
var = UT_LIST_GET_NEXT(col_var_list, var);
390
/*********************************************************************//**
391
Fetches the column values from a record. */
394
row_sel_fetch_columns(
395
/*==================*/
396
dict_index_t* index, /*!< in: record index */
397
const rec_t* rec, /*!< in: record in a clustered or non-clustered
398
index; must be protected by a page latch */
399
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
400
sym_node_t* column) /*!< in: first column in a column list, or
409
ut_ad(rec_offs_validate(rec, index, offsets));
411
if (dict_index_is_clust(index)) {
412
index_type = SYM_CLUST_FIELD_NO;
414
index_type = SYM_SEC_FIELD_NO;
418
mem_heap_t* heap = NULL;
421
field_no = column->field_nos[index_type];
423
if (field_no != ULINT_UNDEFINED) {
425
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
428
/* Copy an externally stored field to the
429
temporary heap, if possible. */
431
heap = mem_heap_create(1);
433
data = btr_rec_copy_externally_stored_field(
435
dict_table_zip_size(index->table),
436
field_no, &len, heap);
438
/* data == NULL means that the
439
externally stored field was not
440
written yet. This record
441
should only be seen by
442
recv_recovery_rollback_active() or any
443
TRX_ISO_READ_UNCOMMITTED
444
transactions. The InnoDB SQL parser
445
(the sole caller of this function)
446
does not implement READ UNCOMMITTED,
447
and it is not involved during rollback. */
449
ut_a(len != UNIV_SQL_NULL);
453
data = rec_get_nth_field(rec, offsets,
456
needs_copy = column->copy_val;
460
eval_node_copy_and_alloc_val(column, data,
463
val = que_node_get_val(column);
464
dfield_set_data(val, data, len);
467
if (UNIV_LIKELY_NULL(heap)) {
472
column = UT_LIST_GET_NEXT(col_var_list, column);
476
/*********************************************************************//**
477
Allocates a prefetch buffer for a column when prefetch is first time done. */
480
sel_col_prefetch_buf_alloc(
481
/*=======================*/
482
sym_node_t* column) /*!< in: symbol table node for a column */
487
ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
489
column->prefetch_buf = static_cast<sel_buf_t *>(mem_alloc(SEL_MAX_N_PREFETCH
490
* sizeof(sel_buf_t)));
491
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
492
sel_buf = column->prefetch_buf + i;
494
sel_buf->data = NULL;
496
sel_buf->val_buf_size = 0;
500
/*********************************************************************//**
501
Frees a prefetch buffer for a column, including the dynamically allocated
502
memory for data stored there. */
505
sel_col_prefetch_buf_free(
506
/*======================*/
507
sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
512
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
513
sel_buf = prefetch_buf + i;
515
if (sel_buf->val_buf_size > 0) {
517
mem_free(sel_buf->data);
522
/*********************************************************************//**
523
Pops the column values for a prefetched, cached row from the column prefetch
524
buffers and places them to the val fields in the column nodes. */
527
sel_pop_prefetched_row(
528
/*===================*/
529
plan_t* plan) /*!< in: plan node for a table */
538
ut_ad(plan->n_rows_prefetched > 0);
540
column = UT_LIST_GET_FIRST(plan->columns);
543
val = que_node_get_val(column);
545
if (!column->copy_val) {
546
/* We did not really push any value for the
549
ut_ad(!column->prefetch_buf);
550
ut_ad(que_node_get_val_buf_size(column) == 0);
551
ut_d(dfield_set_null(val));
556
ut_ad(column->prefetch_buf);
557
ut_ad(!dfield_is_ext(val));
559
sel_buf = column->prefetch_buf + plan->first_prefetched;
561
data = sel_buf->data;
563
val_buf_size = sel_buf->val_buf_size;
565
/* We must keep track of the allocated memory for
566
column values to be able to free it later: therefore
567
we swap the values for sel_buf and val */
569
sel_buf->data = static_cast<byte *>(dfield_get_data(val));
570
sel_buf->len = dfield_get_len(val);
571
sel_buf->val_buf_size = que_node_get_val_buf_size(column);
573
dfield_set_data(val, data, len);
574
que_node_set_val_buf_size(column, val_buf_size);
576
column = UT_LIST_GET_NEXT(col_var_list, column);
579
plan->n_rows_prefetched--;
581
plan->first_prefetched++;
584
/*********************************************************************//**
585
Pushes the column values for a prefetched, cached row to the column prefetch
586
buffers from the val fields in the column nodes. */
589
sel_push_prefetched_row(
590
/*====================*/
591
plan_t* plan) /*!< in: plan node for a table */
601
if (plan->n_rows_prefetched == 0) {
603
plan->first_prefetched = 0;
605
pos = plan->n_rows_prefetched;
607
/* We have the convention that pushing new rows starts only
608
after the prefetch stack has been emptied: */
610
ut_ad(plan->first_prefetched == 0);
613
plan->n_rows_prefetched++;
615
ut_ad(pos < SEL_MAX_N_PREFETCH);
617
column = UT_LIST_GET_FIRST(plan->columns);
620
if (!column->copy_val) {
621
/* There is no sense to push pointers to database
622
page fields when we do not keep latch on the page! */
627
if (!column->prefetch_buf) {
628
/* Allocate a new prefetch buffer */
630
sel_col_prefetch_buf_alloc(column);
633
sel_buf = column->prefetch_buf + pos;
635
val = que_node_get_val(column);
637
data = static_cast<byte *>(dfield_get_data(val));
638
len = dfield_get_len(val);
639
val_buf_size = que_node_get_val_buf_size(column);
641
/* We must keep track of the allocated memory for
642
column values to be able to free it later: therefore
643
we swap the values for sel_buf and val */
645
dfield_set_data(val, sel_buf->data, sel_buf->len);
646
que_node_set_val_buf_size(column, sel_buf->val_buf_size);
648
sel_buf->data = data;
650
sel_buf->val_buf_size = val_buf_size;
652
column = UT_LIST_GET_NEXT(col_var_list, column);
656
/*********************************************************************//**
657
Builds a previous version of a clustered index record for a consistent read
658
@return DB_SUCCESS or error code */
661
row_sel_build_prev_vers(
662
/*====================*/
663
read_view_t* read_view, /*!< in: read view */
664
dict_index_t* index, /*!< in: plan node for table */
665
rec_t* rec, /*!< in: record in a clustered index */
666
ulint** offsets, /*!< in/out: offsets returned by
667
rec_get_offsets(rec, plan->index) */
668
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
669
the offsets are allocated */
670
mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
671
rec_t** old_vers, /*!< out: old version, or NULL if the
672
record does not exist in the view:
673
i.e., it was freshly inserted
675
mtr_t* mtr) /*!< in: mtr */
679
if (*old_vers_heap) {
680
mem_heap_empty(*old_vers_heap);
682
*old_vers_heap = mem_heap_create(512);
685
err = row_vers_build_for_consistent_read(
686
rec, mtr, index, offsets, read_view, offset_heap,
687
*old_vers_heap, old_vers);
691
/*********************************************************************//**
692
Builds the last committed version of a clustered index record for a
693
semi-consistent read.
694
@return DB_SUCCESS or error code */
697
row_sel_build_committed_vers_for_mysql(
698
/*===================================*/
699
dict_index_t* clust_index, /*!< in: clustered index */
700
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
701
const rec_t* rec, /*!< in: record in a clustered index */
702
ulint** offsets, /*!< in/out: offsets returned by
703
rec_get_offsets(rec, clust_index) */
704
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
705
the offsets are allocated */
706
const rec_t** old_vers, /*!< out: old version, or NULL if the
707
record does not exist in the view:
708
i.e., it was freshly inserted
710
mtr_t* mtr) /*!< in: mtr */
714
if (prebuilt->old_vers_heap) {
715
mem_heap_empty(prebuilt->old_vers_heap);
717
prebuilt->old_vers_heap = mem_heap_create(200);
720
err = row_vers_build_for_semi_consistent_read(
721
rec, mtr, clust_index, offsets, offset_heap,
722
prebuilt->old_vers_heap, old_vers);
726
/*********************************************************************//**
727
Tests the conditions which determine when the index segment we are searching
728
through has been exhausted.
729
@return TRUE if row passed the tests */
732
row_sel_test_end_conds(
733
/*===================*/
734
plan_t* plan) /*!< in: plan for the table; the column values must
735
already have been retrieved and the right sides of
736
comparisons evaluated */
740
/* All conditions in end_conds are comparisons of a column to an
743
cond = UT_LIST_GET_FIRST(plan->end_conds);
746
/* Evaluate the left side of the comparison, i.e., get the
747
column value if there is an indirection */
749
eval_sym(static_cast<sym_node_t *>(cond->args));
751
/* Do the comparison */
753
if (!eval_cmp(cond)) {
758
cond = UT_LIST_GET_NEXT(cond_list, cond);
764
/*********************************************************************//**
765
Tests the other conditions.
766
@return TRUE if row passed the tests */
769
row_sel_test_other_conds(
770
/*=====================*/
771
plan_t* plan) /*!< in: plan for the table; the column values must
772
already have been retrieved */
776
cond = UT_LIST_GET_FIRST(plan->other_conds);
781
if (!eval_node_get_ibool_val(cond)) {
786
cond = UT_LIST_GET_NEXT(cond_list, cond);
792
/*********************************************************************//**
793
Retrieves the clustered index record corresponding to a record in a
794
non-clustered index. Does the necessary locking.
795
@return DB_SUCCESS or error code */
798
row_sel_get_clust_rec(
799
/*==================*/
800
sel_node_t* node, /*!< in: select_node */
801
plan_t* plan, /*!< in: plan node for table */
802
rec_t* rec, /*!< in: record in a non-clustered index */
803
que_thr_t* thr, /*!< in: query thread */
804
rec_t** out_rec,/*!< out: clustered record or an old version of
805
it, NULL if the old version did not exist
806
in the read view, i.e., it was a fresh
808
mtr_t* mtr) /*!< in: mtr used to get access to the
809
non-clustered record; the same mtr is used to
810
access the clustered index */
816
mem_heap_t* heap = NULL;
817
ulint offsets_[REC_OFFS_NORMAL_SIZE];
818
ulint* offsets = offsets_;
819
rec_offs_init(offsets_);
823
offsets = rec_get_offsets(rec,
824
btr_pcur_get_btr_cur(&plan->pcur)->index,
825
offsets, ULINT_UNDEFINED, &heap);
827
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
829
index = dict_table_get_first_index(plan->table);
831
btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
832
BTR_SEARCH_LEAF, &plan->clust_pcur,
835
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
837
/* Note: only if the search ends up on a non-infimum record is the
838
low_match value the real match to the search tuple */
840
if (!page_rec_is_user_rec(clust_rec)
841
|| btr_pcur_get_low_match(&(plan->clust_pcur))
842
< dict_index_get_n_unique(index)) {
844
ut_a(rec_get_deleted_flag(rec,
845
dict_table_is_comp(plan->table)));
846
ut_a(node->read_view);
848
/* In a rare case it is possible that no clust rec is found
849
for a delete-marked secondary index record: if in row0umod.c
850
in row_undo_mod_remove_clust_low() we have already removed
851
the clust rec, while purge is still cleaning and removing
852
secondary index records associated with earlier versions of
853
the clustered index record. In that case we know that the
854
clustered index record did not exist in the read view of
860
offsets = rec_get_offsets(clust_rec, index, offsets,
861
ULINT_UNDEFINED, &heap);
863
if (!node->read_view) {
864
/* Try to place a lock on the index record */
866
/* If innodb_locks_unsafe_for_binlog option is used
867
or this session is using READ COMMITTED isolation level
868
we lock only the record, i.e., next-key locking is
873
trx = thr_get_trx(thr);
875
if (srv_locks_unsafe_for_binlog
876
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
877
lock_type = LOCK_REC_NOT_GAP;
879
lock_type = LOCK_ORDINARY;
882
err = lock_clust_rec_read_check_and_lock(
883
0, btr_pcur_get_block(&plan->clust_pcur),
884
clust_rec, index, offsets,
885
static_cast<lock_mode>(node->row_lock_mode), lock_type, thr);
889
case DB_SUCCESS_LOCKED_REC:
890
/* Declare the variable uninitialized in Valgrind.
891
It should be set to DB_SUCCESS at func_exit. */
892
UNIV_MEM_INVALID(&err, sizeof err);
898
/* This is a non-locking consistent read: if necessary, fetch
899
a previous version of the record */
903
if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
906
err = row_sel_build_prev_vers(
907
node->read_view, index, clust_rec,
908
&offsets, &heap, &plan->old_vers_heap,
911
if (err != DB_SUCCESS) {
916
clust_rec = old_vers;
918
if (clust_rec == NULL) {
923
/* If we had to go to an earlier version of row or the
924
secondary index record is delete marked, then it may be that
925
the secondary index record corresponding to clust_rec
926
(or old_vers) is not rec; in that case we must ignore
927
such row because in our snapshot rec would not have existed.
928
Remember that from rec we cannot see directly which transaction
929
id corresponds to it: we have to go to the clustered index
930
record. A query where we want to fetch all rows where
931
the secondary index value is in some interval would return
932
a wrong result if we would not drop rows which we come to
933
visit through secondary index records that would not really
934
exist in our snapshot. */
937
|| rec_get_deleted_flag(rec, dict_table_is_comp(
939
&& !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
945
/* Fetch the columns needed in test conditions. The clustered
946
index record is protected by a page latch that was acquired
947
when plan->clust_pcur was positioned. The latch will not be
948
released until mtr_commit(mtr). */
950
ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
951
row_sel_fetch_columns(index, clust_rec, offsets,
952
UT_LIST_GET_FIRST(plan->columns));
953
*out_rec = clust_rec;
957
if (UNIV_LIKELY_NULL(heap)) {
963
/*********************************************************************//**
964
Sets a lock on a record.
965
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
970
const buf_block_t* block, /*!< in: buffer block of rec */
971
const rec_t* rec, /*!< in: record */
972
dict_index_t* index, /*!< in: index */
973
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
974
ulint mode, /*!< in: lock mode */
975
ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
977
que_thr_t* thr) /*!< in: query thread */
982
trx = thr_get_trx(thr);
984
if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
985
if (buf_LRU_buf_pool_running_out()) {
987
return(DB_LOCK_TABLE_FULL);
991
if (dict_index_is_clust(index)) {
992
err = lock_clust_rec_read_check_and_lock(0, block, rec, index,
993
offsets, static_cast<lock_mode>(mode), type, thr);
995
err = lock_sec_rec_read_check_and_lock(0, block, rec, index,
996
offsets, static_cast<lock_mode>(mode), type, thr);
1002
/*********************************************************************//**
1003
Opens a pcur to a table index. */
1008
plan_t* plan, /*!< in: table plan */
1009
ibool search_latch_locked,
1010
/*!< in: TRUE if the thread currently
1011
has the search latch locked in
1013
mtr_t* mtr) /*!< in: mtr */
1015
dict_index_t* index;
1019
ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
1022
if (search_latch_locked) {
1023
has_search_latch = RW_S_LATCH;
1026
index = plan->index;
1028
/* Calculate the value of the search tuple: the exact match columns
1029
get their expressions evaluated when we evaluate the right sides of
1032
cond = UT_LIST_GET_FIRST(plan->end_conds);
1035
eval_exp(que_node_get_next(cond->args));
1037
cond = UT_LIST_GET_NEXT(cond_list, cond);
1041
n_fields = dtuple_get_n_fields(plan->tuple);
1043
if (plan->n_exact_match < n_fields) {
1044
/* There is a non-exact match field which must be
1045
evaluated separately */
1047
eval_exp(plan->tuple_exps[n_fields - 1]);
1050
for (i = 0; i < n_fields; i++) {
1051
exp = plan->tuple_exps[i];
1053
dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1054
que_node_get_val(exp));
1057
/* Open pcur to the index */
1059
btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1060
BTR_SEARCH_LEAF, &plan->pcur,
1061
has_search_latch, mtr);
1063
/* Open the cursor to the start or the end of the index
1066
btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
1067
&(plan->pcur), FALSE, mtr);
1070
ut_ad(plan->n_rows_prefetched == 0);
1071
ut_ad(plan->n_rows_fetched == 0);
1072
ut_ad(plan->cursor_at_end == FALSE);
1074
plan->pcur_is_open = TRUE;
1077
/*********************************************************************//**
1078
Restores a stored pcur position to a table index.
1079
@return TRUE if the cursor should be moved to the next record after we
1080
return from this function (moved to the previous, in the case of a
1081
descending cursor) without processing again the current cursor
1085
row_sel_restore_pcur_pos(
1086
/*=====================*/
1087
plan_t* plan, /*!< in: table plan */
1088
mtr_t* mtr) /*!< in: mtr */
1090
ibool equal_position;
1091
ulint relative_position;
1093
ut_ad(!plan->cursor_at_end);
1095
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1097
equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1098
&(plan->pcur), mtr);
1100
/* If the cursor is traveling upwards, and relative_position is
1102
(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1103
yet on the successor of the page infimum;
1104
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1105
first record GREATER than the predecessor of a page supremum; we have
1106
not yet processed the cursor record: no need to move the cursor to the
1108
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1109
last record LESS or EQUAL to the old stored user record; (a) if
1110
equal_position is FALSE, this means that the cursor is now on a record
1111
less than the old user record, and we must move to the next record;
1112
(b) if equal_position is TRUE, then if
1113
plan->stored_cursor_rec_processed is TRUE, we must move to the next
1114
record, else there is no need to move the cursor. */
1117
if (relative_position == BTR_PCUR_ON) {
1119
if (equal_position) {
1121
return(plan->stored_cursor_rec_processed);
1127
ut_ad(relative_position == BTR_PCUR_AFTER
1128
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1133
/* If the cursor is traveling downwards, and relative_position is
1135
(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1136
the last record LESS than the successor of a page infimum; we have not
1137
processed the cursor record: no need to move the cursor;
1138
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1139
first record GREATER than the predecessor of a page supremum; we have
1140
processed the cursor record: we should move the cursor to the previous
1142
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1143
last record LESS or EQUAL to the old stored user record; (a) if
1144
equal_position is FALSE, this means that the cursor is now on a record
1145
less than the old user record, and we need not move to the previous
1146
record; (b) if equal_position is TRUE, then if
1147
plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1148
record, else there is no need to move the cursor. */
1150
if (relative_position == BTR_PCUR_BEFORE
1151
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1156
if (relative_position == BTR_PCUR_ON) {
1158
if (equal_position) {
1160
return(plan->stored_cursor_rec_processed);
1166
ut_ad(relative_position == BTR_PCUR_AFTER
1167
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1172
/*********************************************************************//**
1173
Resets a plan cursor to a closed state. */
1178
plan_t* plan) /*!< in: plan */
1180
plan->pcur_is_open = FALSE;
1181
plan->cursor_at_end = FALSE;
1182
plan->n_rows_fetched = 0;
1183
plan->n_rows_prefetched = 0;
1186
/*********************************************************************//**
1187
Tries to do a shortcut to fetch a clustered index record with a unique key,
1188
using the hash index if possible (not always).
1189
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1192
row_sel_try_search_shortcut(
1193
/*========================*/
1194
sel_node_t* node, /*!< in: select node for a consistent read */
1195
plan_t* plan, /*!< in: plan for a unique search in clustered
1197
mtr_t* mtr) /*!< in: mtr */
1199
dict_index_t* index;
1201
mem_heap_t* heap = NULL;
1202
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1203
ulint* offsets = offsets_;
1205
rec_offs_init(offsets_);
1207
index = plan->index;
1209
ut_ad(node->read_view);
1210
ut_ad(plan->unique_search);
1211
ut_ad(!plan->must_get_clust);
1212
#ifdef UNIV_SYNC_DEBUG
1213
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1214
#endif /* UNIV_SYNC_DEBUG */
1216
row_sel_open_pcur(plan, TRUE, mtr);
1218
rec = btr_pcur_get_rec(&(plan->pcur));
1220
if (!page_rec_is_user_rec(rec)) {
1225
ut_ad(plan->mode == PAGE_CUR_GE);
1227
/* As the cursor is now placed on a user record after a search with
1228
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1229
fields in the user record matched to the search tuple */
1231
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1233
return(SEL_EXHAUSTED);
1236
/* This is a non-locking consistent read: if necessary, fetch
1237
a previous version of the record */
1239
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1241
if (dict_index_is_clust(index)) {
1242
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1247
} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1253
/* Test the deleted flag. */
1255
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1257
ret = SEL_EXHAUSTED;
1261
/* Fetch the columns needed in test conditions. The index
1262
record is protected by a page latch that was acquired when
1263
plan->pcur was positioned. The latch will not be released
1264
until mtr_commit(mtr). */
1266
row_sel_fetch_columns(index, rec, offsets,
1267
UT_LIST_GET_FIRST(plan->columns));
1269
/* Test the rest of search conditions */
1271
if (!row_sel_test_other_conds(plan)) {
1273
ret = SEL_EXHAUSTED;
1277
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1279
plan->n_rows_fetched++;
1282
if (UNIV_LIKELY_NULL(heap)) {
1283
mem_heap_free(heap);
1288
/*********************************************************************//**
1289
Performs a select step.
1290
@return DB_SUCCESS or error code */
1295
sel_node_t* node, /*!< in: select node */
1296
que_thr_t* thr) /*!< in: query thread */
1298
dict_index_t* index;
1305
ibool search_latch_locked;
1306
ibool consistent_read;
1308
/* The following flag becomes TRUE when we are doing a
1309
consistent read from a non-clustered index and we must look
1310
at the clustered index to find out the previous delete mark
1311
state of the non-clustered record: */
1313
ibool cons_read_requires_clust_rec = FALSE;
1314
ulint cost_counter = 0;
1315
ibool cursor_just_opened;
1316
ibool must_go_to_next;
1317
ibool mtr_has_extra_clust_latch = FALSE;
1318
/* TRUE if the search was made using
1319
a non-clustered index, and we had to
1320
access the clustered record: now &mtr
1321
contains a clustered index latch, and
1322
&mtr must be committed before we move
1323
to the next non-clustered record */
1326
mem_heap_t* heap = NULL;
1327
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1328
ulint* offsets = offsets_;
1329
rec_offs_init(offsets_);
1331
ut_ad(thr->run_node == node);
1333
search_latch_locked = FALSE;
1335
if (node->read_view) {
1336
/* In consistent reads, we try to do with the hash index and
1337
not to use the buffer page get. This is to reduce memory bus
1338
load resulting from semaphore operations. The search latch
1339
will be s-locked when we access an index with a unique search
1340
condition, but not locked when we access an index with a
1341
less selective search condition. */
1343
consistent_read = TRUE;
1345
consistent_read = FALSE;
1351
This is the outer major loop in calculating a join. We come here when
1352
node->fetch_table changes, and after adding a row to aggregate totals
1353
and, of course, when this function is called. */
1355
ut_ad(mtr_has_extra_clust_latch == FALSE);
1357
plan = sel_node_get_nth_plan(node, node->fetch_table);
1358
index = plan->index;
1360
if (plan->n_rows_prefetched > 0) {
1361
sel_pop_prefetched_row(plan);
1363
goto next_table_no_mtr;
1366
if (plan->cursor_at_end) {
1367
/* The cursor has already reached the result set end: no more
1368
rows to process for this table cursor, as also the prefetch
1371
ut_ad(plan->pcur_is_open);
1373
goto table_exhausted_no_mtr;
1376
/* Open a cursor to index, or restore an open cursor position */
1380
if (consistent_read && plan->unique_search && !plan->pcur_is_open
1381
&& !plan->must_get_clust
1382
&& !plan->table->big_rows) {
1383
if (!search_latch_locked) {
1384
rw_lock_s_lock(&btr_search_latch);
1386
search_latch_locked = TRUE;
1387
} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
1389
/* There is an x-latch request waiting: release the
1390
s-latch for a moment; as an s-latch here is often
1391
kept for some 10 searches before being released,
1392
a waiting x-latch request would block other threads
1393
from acquiring an s-latch for a long time, lowering
1394
performance significantly in multiprocessors. */
1396
rw_lock_s_unlock(&btr_search_latch);
1397
rw_lock_s_lock(&btr_search_latch);
1400
found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
1402
if (found_flag == SEL_FOUND) {
1406
} else if (found_flag == SEL_EXHAUSTED) {
1408
goto table_exhausted;
1411
ut_ad(found_flag == SEL_RETRY);
1413
plan_reset_cursor(plan);
1419
if (search_latch_locked) {
1420
rw_lock_s_unlock(&btr_search_latch);
1422
search_latch_locked = FALSE;
1425
if (!plan->pcur_is_open) {
1426
/* Evaluate the expressions to build the search tuple and
1429
row_sel_open_pcur(plan, search_latch_locked, &mtr);
1431
cursor_just_opened = TRUE;
1433
/* A new search was made: increment the cost counter */
1436
/* Restore pcur position to the index */
1438
must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
1440
cursor_just_opened = FALSE;
1442
if (must_go_to_next) {
1443
/* We have already processed the cursor record: move
1453
In this loop we use pcur and try to fetch a qualifying row, and
1454
also fill the prefetch buffer for this table if n_rows_fetched has
1455
exceeded a threshold. While we are inside this loop, the following
1457
(1) &mtr is started,
1458
(2) pcur is positioned and open.
1460
NOTE that if cursor_just_opened is TRUE here, it means that we came
1461
to this point right after row_sel_open_pcur. */
1463
ut_ad(mtr_has_extra_clust_latch == FALSE);
1465
rec = btr_pcur_get_rec(&(plan->pcur));
1467
/* PHASE 1: Set a lock if specified */
1469
if (!node->asc && cursor_just_opened
1470
&& !page_rec_is_supremum(rec)) {
1472
/* When we open a cursor for a descending search, we must set
1473
a next-key lock on the successor record: otherwise it would
1474
be possible to insert new records next to the cursor position,
1475
and it might be that these new records should appear in the
1476
search result set, resulting in the phantom problem. */
1478
if (!consistent_read) {
1480
/* If innodb_locks_unsafe_for_binlog option is used
1481
or this session is using READ COMMITTED isolation
1482
level, we lock only the record, i.e., next-key
1483
locking is not used. */
1485
rec_t* next_rec = page_rec_get_next(rec);
1489
trx = thr_get_trx(thr);
1491
offsets = rec_get_offsets(next_rec, index, offsets,
1492
ULINT_UNDEFINED, &heap);
1494
if (srv_locks_unsafe_for_binlog
1495
|| trx->isolation_level
1496
<= TRX_ISO_READ_COMMITTED) {
1498
if (page_rec_is_supremum(next_rec)) {
1503
lock_type = LOCK_REC_NOT_GAP;
1505
lock_type = LOCK_ORDINARY;
1508
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1509
next_rec, index, offsets,
1510
node->row_lock_mode,
1514
case DB_SUCCESS_LOCKED_REC:
1519
/* Note that in this case we will store in pcur
1520
the PREDECESSOR of the record we are waiting
1522
goto lock_wait_or_error;
1528
if (page_rec_is_infimum(rec)) {
1530
/* The infimum record on a page cannot be in the result set,
1531
and neither can a record lock be placed on it: we skip such
1532
a record. We also increment the cost counter as we may have
1533
processed yet another page of index. */
1540
if (!consistent_read) {
1541
/* Try to place a lock on the index record */
1543
/* If innodb_locks_unsafe_for_binlog option is used
1544
or this session is using READ COMMITTED isolation level,
1545
we lock only the record, i.e., next-key locking is
1551
offsets = rec_get_offsets(rec, index, offsets,
1552
ULINT_UNDEFINED, &heap);
1554
trx = thr_get_trx(thr);
1556
if (srv_locks_unsafe_for_binlog
1557
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
1559
if (page_rec_is_supremum(rec)) {
1564
lock_type = LOCK_REC_NOT_GAP;
1566
lock_type = LOCK_ORDINARY;
1569
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1570
rec, index, offsets,
1571
node->row_lock_mode, lock_type, thr);
1574
case DB_SUCCESS_LOCKED_REC:
1579
goto lock_wait_or_error;
1583
if (page_rec_is_supremum(rec)) {
1585
/* A page supremum record cannot be in the result set: skip
1586
it now when we have placed a possible lock on it */
1591
ut_ad(page_rec_is_user_rec(rec));
1593
if (cost_counter > SEL_COST_LIMIT) {
1595
/* Now that we have placed the necessary locks, we can stop
1596
for a while and store the cursor position; NOTE that if we
1597
would store the cursor position BEFORE placing a record lock,
1598
it might happen that the cursor would jump over some records
1599
that another transaction could meanwhile insert adjacent to
1600
the cursor: this would result in the phantom problem. */
1602
goto stop_for_a_while;
1605
/* PHASE 2: Check a mixed index mix id if needed */
1607
if (plan->unique_search && cursor_just_opened) {
1609
ut_ad(plan->mode == PAGE_CUR_GE);
1611
/* As the cursor is now placed on a user record after a search
1612
with the mode PAGE_CUR_GE, the up_match field in the cursor
1613
tells how many fields in the user record matched to the search
1616
if (btr_pcur_get_up_match(&(plan->pcur))
1617
< plan->n_exact_match) {
1618
goto table_exhausted;
1621
/* Ok, no need to test end_conds or mix id */
1625
/* We are ready to look at a possible new index entry in the result
1626
set: the cursor is now placed on a user record */
1628
/* PHASE 3: Get previous version in a consistent read */
1630
cons_read_requires_clust_rec = FALSE;
1631
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1633
if (consistent_read) {
1634
/* This is a non-locking consistent read: if necessary, fetch
1635
a previous version of the record */
1637
if (dict_index_is_clust(index)) {
1639
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1642
err = row_sel_build_prev_vers(
1643
node->read_view, index, rec,
1644
&offsets, &heap, &plan->old_vers_heap,
1647
if (err != DB_SUCCESS) {
1649
goto lock_wait_or_error;
1652
if (old_vers == NULL) {
1653
/* The record does not exist
1654
in our read view. Skip it, but
1655
first attempt to determine
1656
whether the index segment we
1657
are searching through has been
1660
offsets = rec_get_offsets(
1661
rec, index, offsets,
1662
ULINT_UNDEFINED, &heap);
1664
/* Fetch the columns needed in
1665
test conditions. The clustered
1666
index record is protected by a
1667
page latch that was acquired
1668
by row_sel_open_pcur() or
1669
row_sel_restore_pcur_pos().
1670
The latch will not be released
1671
until mtr_commit(mtr). */
1673
row_sel_fetch_columns(
1674
index, rec, offsets,
1678
if (!row_sel_test_end_conds(plan)) {
1680
goto table_exhausted;
1688
} else if (!lock_sec_rec_cons_read_sees(rec,
1690
cons_read_requires_clust_rec = TRUE;
1694
/* PHASE 4: Test search end conditions and deleted flag */
1696
/* Fetch the columns needed in test conditions. The record is
1697
protected by a page latch that was acquired by
1698
row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
1699
will not be released until mtr_commit(mtr). */
1701
row_sel_fetch_columns(index, rec, offsets,
1702
UT_LIST_GET_FIRST(plan->columns));
1704
/* Test the selection end conditions: these can only contain columns
1705
which already are found in the index, even though the index might be
1708
if (plan->unique_search && cursor_just_opened) {
1710
/* No test necessary: the test was already made above */
1712
} else if (!row_sel_test_end_conds(plan)) {
1714
goto table_exhausted;
1717
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1718
&& !cons_read_requires_clust_rec) {
1720
/* The record is delete marked: we can skip it if this is
1721
not a consistent read which might see an earlier version
1722
of a non-clustered index record */
1724
if (plan->unique_search) {
1726
goto table_exhausted;
1732
/* PHASE 5: Get the clustered index record, if needed and if we did
1733
not do the search using the clustered index */
1735
if (plan->must_get_clust || cons_read_requires_clust_rec) {
1737
/* It was a non-clustered index and we must fetch also the
1738
clustered index record */
1740
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1742
mtr_has_extra_clust_latch = TRUE;
1745
case DB_SUCCESS_LOCKED_REC:
1750
goto lock_wait_or_error;
1753
/* Retrieving the clustered record required a search:
1754
increment the cost counter */
1758
if (clust_rec == NULL) {
1759
/* The record did not exist in the read view */
1760
ut_ad(consistent_read);
1765
if (rec_get_deleted_flag(clust_rec,
1766
dict_table_is_comp(plan->table))) {
1768
/* The record is delete marked: we can skip it */
1773
if (node->can_get_updated) {
1775
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1779
/* PHASE 6: Test the rest of search conditions */
1781
if (!row_sel_test_other_conds(plan)) {
1783
if (plan->unique_search) {
1785
goto table_exhausted;
1791
/* PHASE 7: We found a new qualifying row for the current table; push
1792
the row if prefetch is on, or move to the next table in the join */
1794
plan->n_rows_fetched++;
1796
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1798
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1799
|| plan->unique_search || plan->no_prefetch
1800
|| plan->table->big_rows) {
1802
/* No prefetch in operation: go to the next table */
1807
sel_push_prefetched_row(plan);
1809
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1811
/* The prefetch buffer is now full */
1813
sel_pop_prefetched_row(plan);
1819
ut_ad(!search_latch_locked);
1821
if (mtr_has_extra_clust_latch) {
1823
/* We must commit &mtr if we are moving to the next
1824
non-clustered index record, because we could break the
1825
latching order if we would access a different clustered
1826
index page right away without releasing the previous. */
1828
goto commit_mtr_for_a_while;
1832
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1834
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1839
goto table_exhausted;
1842
cursor_just_opened = FALSE;
1844
/* END OF RECORD LOOP
1845
------------------ */
1849
/* We found a record which satisfies the conditions: we can move to
1850
the next table or return a row in the result set */
1852
ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
1854
if (plan->unique_search && !node->can_get_updated) {
1856
plan->cursor_at_end = TRUE;
1858
ut_ad(!search_latch_locked);
1860
plan->stored_cursor_rec_processed = TRUE;
1862
btr_pcur_store_position(&(plan->pcur), &mtr);
1867
mtr_has_extra_clust_latch = FALSE;
1870
/* If we use 'goto' to this label, it means that the row was popped
1871
from the prefetched rows stack, and &mtr is already committed */
1873
if (node->fetch_table + 1 == node->n_tables) {
1875
sel_eval_select_list(node);
1877
if (node->is_aggregate) {
1882
sel_assign_into_var_values(node->into_list, node);
1884
thr->run_node = que_node_get_parent(node);
1890
node->fetch_table++;
1892
/* When we move to the next table, we first reset the plan cursor:
1893
we do not care about resetting it when we backtrack from a table */
1895
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1900
/* The table cursor pcur reached the result set end: backtrack to the
1901
previous table in the join if we do not have cached prefetched rows */
1903
plan->cursor_at_end = TRUE;
1907
mtr_has_extra_clust_latch = FALSE;
1909
if (plan->n_rows_prefetched > 0) {
1910
/* The table became exhausted during a prefetch */
1912
sel_pop_prefetched_row(plan);
1914
goto next_table_no_mtr;
1917
table_exhausted_no_mtr:
1918
if (node->fetch_table == 0) {
1921
if (node->is_aggregate && !node->aggregate_already_fetched) {
1923
node->aggregate_already_fetched = TRUE;
1925
sel_assign_into_var_values(node->into_list, node);
1927
thr->run_node = que_node_get_parent(node);
1929
node->state = SEL_NODE_NO_MORE_ROWS;
1931
thr->run_node = que_node_get_parent(node);
1938
node->fetch_table--;
1943
/* Return control for a while to que_run_threads, so that runaway
1944
queries can be canceled. NOTE that when we come here, we must, in a
1945
locking read, have placed the necessary (possibly waiting request)
1946
record lock on the cursor record or its successor: when we reposition
1947
the cursor, this record lock guarantees that nobody can meanwhile have
1948
inserted new records which should have appeared in the result set,
1949
which would result in the phantom problem. */
1951
ut_ad(!search_latch_locked);
1953
plan->stored_cursor_rec_processed = FALSE;
1954
btr_pcur_store_position(&(plan->pcur), &mtr);
1958
#ifdef UNIV_SYNC_DEBUG
1959
ut_ad(sync_thread_levels_empty_gen(TRUE));
1960
#endif /* UNIV_SYNC_DEBUG */
1964
commit_mtr_for_a_while:
1965
/* Stores the cursor position and commits &mtr; this is used if
1966
&mtr may contain latches which would break the latching order if
1967
&mtr would not be committed and the latches released. */
1969
plan->stored_cursor_rec_processed = TRUE;
1971
ut_ad(!search_latch_locked);
1972
btr_pcur_store_position(&(plan->pcur), &mtr);
1976
mtr_has_extra_clust_latch = FALSE;
1978
#ifdef UNIV_SYNC_DEBUG
1979
ut_ad(sync_thread_levels_empty_gen(TRUE));
1980
#endif /* UNIV_SYNC_DEBUG */
1985
/* See the note at stop_for_a_while: the same holds for this case */
1987
ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
1988
ut_ad(!search_latch_locked);
1990
plan->stored_cursor_rec_processed = FALSE;
1991
btr_pcur_store_position(&(plan->pcur), &mtr);
1995
#ifdef UNIV_SYNC_DEBUG
1996
ut_ad(sync_thread_levels_empty_gen(TRUE));
1997
#endif /* UNIV_SYNC_DEBUG */
2000
if (search_latch_locked) {
2001
rw_lock_s_unlock(&btr_search_latch);
2003
if (UNIV_LIKELY_NULL(heap)) {
2004
mem_heap_free(heap);
2009
/**********************************************************************//**
2010
Performs a select step. This is a high-level function used in SQL execution
2012
@return query thread to run next or NULL */
2017
que_thr_t* thr) /*!< in: query thread */
2020
sym_node_t* table_node;
2026
node = static_cast<sel_node_t *>(thr->run_node);
2028
ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2030
/* If this is a new time this node is executed (or when execution
2031
resumes after wait for a table intention lock), set intention locks
2032
on the tables, or assign a read view */
2034
if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2036
node->state = SEL_NODE_OPEN;
2039
if (node->state == SEL_NODE_OPEN) {
2041
/* It may be that the current session has not yet started
2042
its transaction, or it has been committed: */
2044
trx_start_if_not_started(thr_get_trx(thr));
2046
plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2048
if (node->consistent_read) {
2049
/* Assign a read view for the query */
2050
node->read_view = trx_assign_read_view(
2053
if (node->set_x_locks) {
2054
i_lock_mode = LOCK_IX;
2056
i_lock_mode = LOCK_IS;
2059
table_node = node->table_list;
2061
while (table_node) {
2062
err = lock_table(0, table_node->table,
2063
static_cast<lock_mode>(i_lock_mode), thr);
2064
if (err != DB_SUCCESS) {
2065
thr_get_trx(thr)->error_state = err;
2070
table_node = static_cast<sym_node_t *>(que_node_get_next(table_node));
2074
/* If this is an explicit cursor, copy stored procedure
2075
variable values, so that the values cannot change between
2076
fetches (currently, we copy them also for non-explicit
2079
if (node->explicit_cursor
2080
&& UT_LIST_GET_FIRST(node->copy_variables)) {
2082
row_sel_copy_input_variable_vals(node);
2085
node->state = SEL_NODE_FETCH;
2086
node->fetch_table = 0;
2088
if (node->is_aggregate) {
2089
/* Reset the aggregate total values */
2090
sel_reset_aggregate_vals(node);
2096
err = row_sel(node, thr);
2098
/* NOTE! if queries are parallelized, the following assignment may
2099
have problems; the assignment should be made only if thr is the
2100
only top-level thr in the graph: */
2102
thr->graph->last_sel_node = node;
2104
if (err != DB_SUCCESS) {
2105
thr_get_trx(thr)->error_state = err;
2113
/**********************************************************************//**
2114
Performs a fetch for a cursor.
2115
@return query thread to run next or NULL */
2120
que_thr_t* thr) /*!< in: query thread */
2122
sel_node_t* sel_node;
2127
node = static_cast<fetch_node_t *>(thr->run_node);
2128
sel_node = node->cursor_def;
2130
ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2132
if (thr->prev_node != que_node_get_parent(node)) {
2134
if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2136
if (node->into_list) {
2137
sel_assign_into_var_values(node->into_list,
2140
void* ret = (*node->func->func)(
2141
sel_node, node->func->arg);
2145
= SEL_NODE_NO_MORE_ROWS;
2150
thr->run_node = que_node_get_parent(node);
2155
/* Make the fetch node the parent of the cursor definition for
2156
the time of the fetch, so that execution knows to return to this
2157
fetch node after a row has been selected or we know that there is
2160
sel_node->common.parent = node;
2162
if (sel_node->state == SEL_NODE_CLOSED) {
2164
"InnoDB: Error: fetch called on a closed cursor\n");
2166
thr_get_trx(thr)->error_state = DB_ERROR;
2171
thr->run_node = sel_node;
2176
/****************************************************************//**
2177
Sample callback function for fetch that prints each row.
2178
@return always returns non-NULL */
2183
void* row, /*!< in: sel_node_t* */
2184
void* user_arg) /*!< in: not used */
2186
sel_node_t *node = static_cast<sel_node_t *>(row);
2190
UT_NOT_USED(user_arg);
2192
fprintf(stderr, "row_fetch_print: row %p\n", row);
2194
exp = node->select_list;
2197
dfield_t* dfield = que_node_get_val(exp);
2198
const dtype_t* type = dfield_get_type(dfield);
2200
fprintf(stderr, " column %lu:\n", (ulong)i);
2205
if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2206
ut_print_buf(stderr, dfield_get_data(dfield),
2207
dfield_get_len(dfield));
2210
fputs(" <NULL>;\n", stderr);
2213
exp = que_node_get_next(exp);
2220
/***********************************************************//**
2221
Prints a row in a select result.
2222
@return query thread to run next or NULL */
2227
que_thr_t* thr) /*!< in: query thread */
2229
row_printf_node_t* node;
2230
sel_node_t* sel_node;
2235
node = static_cast<row_printf_node_t *>(thr->run_node);
2237
sel_node = node->sel_node;
2239
ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2241
if (thr->prev_node == que_node_get_parent(node)) {
2243
/* Reset the cursor */
2244
sel_node->state = SEL_NODE_OPEN;
2246
/* Fetch next row to print */
2248
thr->run_node = sel_node;
2253
if (sel_node->state != SEL_NODE_FETCH) {
2255
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2257
/* No more rows to print */
2259
thr->run_node = que_node_get_parent(node);
2264
arg = sel_node->select_list;
2267
dfield_print_also_hex(que_node_get_val(arg));
2269
fputs(" ::: ", stderr);
2271
arg = que_node_get_next(arg);
2276
/* Fetch next row to print */
2278
thr->run_node = sel_node;
2283
/****************************************************************//**
2284
Converts a key value stored in MySQL format to an Innobase dtuple. The last
2285
field of the key value may be just a prefix of a fixed length field: hence
2286
the parameter key_len. But currently we do not allow search keys where the
2287
last field is only a prefix of the full key field len and print a warning if
2288
such appears. A counterpart of this function is
2289
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2292
row_sel_convert_mysql_key_to_innobase(
2293
/*==================================*/
2294
dtuple_t* tuple, /*!< in/out: tuple where to build;
2295
NOTE: we assume that the type info
2296
in the tuple is already according
2298
byte* buf, /*!< in: buffer to use in field
2300
ulint buf_len, /*!< in: buffer length */
2301
dict_index_t* index, /*!< in: index of the key value */
2302
const byte* key_ptr, /*!< in: MySQL key value */
2303
ulint key_len, /*!< in: MySQL key value length */
2304
trx_t* trx) /*!< in: transaction */
2306
byte* original_buf = buf;
2307
const byte* original_key_ptr = key_ptr;
2308
dict_field_t* field;
2312
ulint data_field_len;
2314
const byte* key_end;
2317
/* For documentation of the key value storage format in MySQL, see
2318
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2320
key_end = key_ptr + key_len;
2322
/* Permit us to access any field in the tuple (ULINT_MAX): */
2324
dtuple_set_n_fields(tuple, ULINT_MAX);
2326
dfield = dtuple_get_nth_field(tuple, 0);
2327
field = dict_index_get_nth_field(index, 0);
2329
if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2330
/* A special case: we are looking for a position in the
2331
generated clustered index which InnoDB automatically added
2332
to a table with no primary key: the first and the only
2333
ordering column is ROW_ID which InnoDB stored to the key_ptr
2336
ut_a(key_len == DATA_ROW_ID_LEN);
2338
dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2340
dtuple_set_n_fields(tuple, 1);
2345
while (key_ptr < key_end) {
2347
ulint type = dfield_get_type(dfield)->mtype;
2348
ut_a(field->col->mtype == type);
2353
if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2354
/* The first byte in the field tells if this is
2355
an SQL NULL value */
2359
if (*key_ptr != 0) {
2360
dfield_set_null(dfield);
2366
/* Calculate data length and data field total length */
2368
if (type == DATA_BLOB) {
2369
/* The key field is a column prefix of a BLOB or
2372
ut_a(field->prefix_len > 0);
2374
/* MySQL stores the actual data length to the first 2
2375
bytes after the optional SQL NULL marker byte. The
2376
storage format is little-endian, that is, the most
2377
significant byte at a higher address. In UTF-8, MySQL
2378
seems to reserve field->prefix_len bytes for
2379
storing this field in the key value buffer, even
2380
though the actual value only takes data_len bytes
2383
data_len = key_ptr[data_offset]
2384
+ 256 * key_ptr[data_offset + 1];
2385
data_field_len = data_offset + 2 + field->prefix_len;
2389
/* Now that we know the length, we store the column
2390
value like it would be a fixed char field */
2392
} else if (field->prefix_len > 0) {
2393
/* Looks like MySQL pads unused end bytes in the
2394
prefix with space. Therefore, also in UTF-8, it is ok
2395
to compare with a prefix containing full prefix_len
2396
bytes, and no need to take at most prefix_len / 3
2397
UTF-8 characters from the start.
2398
If the prefix is used as the upper end of a LIKE
2399
'abc%' query, then MySQL pads the end with chars
2400
0xff. TODO: in that case does it any harm to compare
2401
with the full prefix_len bytes. How do characters
2402
0xff in UTF-8 behave? */
2404
data_len = field->prefix_len;
2405
data_field_len = data_offset + data_len;
2407
data_len = dfield_get_type(dfield)->len;
2408
data_field_len = data_offset + data_len;
2412
(dtype_get_mysql_type(dfield_get_type(dfield))
2413
== DATA_MYSQL_TRUE_VARCHAR)
2414
&& UNIV_LIKELY(type != DATA_INT)) {
2415
/* In a MySQL key value format, a true VARCHAR is
2416
always preceded by 2 bytes of a length field.
2417
dfield_get_type(dfield)->len returns the maximum
2418
'payload' len in bytes. That does not include the
2419
2 bytes that tell the actual data length.
2421
We added the check != DATA_INT to make sure we do
2422
not treat MySQL ENUM or SET as a true VARCHAR! */
2425
data_field_len += 2;
2428
/* Storing may use at most data_len bytes of buf */
2430
if (UNIV_LIKELY(!is_null)) {
2431
row_mysql_store_col_in_innobase_format(
2433
FALSE, /* MySQL key value format col */
2434
key_ptr + data_offset, data_len,
2435
dict_table_is_comp(index->table));
2439
key_ptr += data_field_len;
2441
if (UNIV_UNLIKELY(key_ptr > key_end)) {
2442
/* The last field in key was not a complete key field
2445
Print a warning about this! HA_READ_PREFIX_LAST does
2446
not currently work in InnoDB with partial-field key
2447
value prefixes. Since MySQL currently uses a padding
2448
trick to calculate LIKE 'abc%' type queries there
2449
should never be partial-field prefixes in searches. */
2451
ut_print_timestamp(stderr);
2453
fputs(" InnoDB: Warning: using a partial-field"
2454
" key prefix in search.\n"
2455
"InnoDB: ", stderr);
2456
dict_index_name_print(stderr, trx, index);
2457
fprintf(stderr, ". Last data field length %lu bytes,\n"
2458
"InnoDB: key ptr now exceeds"
2459
" key end by %lu bytes.\n"
2460
"InnoDB: Key value in the MySQL format:\n",
2461
(ulong) data_field_len,
2462
(ulong) (key_ptr - key_end));
2464
ut_print_buf(stderr, original_key_ptr, key_len);
2468
ulint len = dfield_get_len(dfield);
2469
dfield_set_len(dfield, len
2470
- (ulint) (key_ptr - key_end));
2479
ut_a(buf <= original_buf + buf_len);
2481
/* We set the length of tuple to n_fields: we assume that the memory
2482
area allocated for it is big enough (usually bigger than n_fields). */
2484
dtuple_set_n_fields(tuple, n_fields);
2487
/**************************************************************//**
2488
Stores the row id to the prebuilt struct. */
2491
row_sel_store_row_id_to_prebuilt(
2492
/*=============================*/
2493
row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2494
const rec_t* index_rec, /*!< in: record */
2495
const dict_index_t* index, /*!< in: index of the record */
2496
const ulint* offsets) /*!< in: rec_get_offsets
2497
(index_rec, index) */
2502
ut_ad(rec_offs_validate(index_rec, index, offsets));
2504
data = rec_get_nth_field(
2506
dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2508
if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2510
"InnoDB: Error: Row id field is"
2511
" wrong length %lu in ", (ulong) len);
2512
dict_index_name_print(stderr, prebuilt->trx, index);
2513
fprintf(stderr, "\n"
2514
"InnoDB: Field number %lu, record:\n",
2515
(ulong) dict_index_get_sys_col_pos(index,
2517
rec_print_new(stderr, index_rec, offsets);
2522
ut_memcpy(prebuilt->row_id, data, len);
2525
/**************************************************************//**
2526
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2527
function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
2530
row_sel_field_store_in_mysql_format(
2531
/*================================*/
2532
byte* dest, /*!< in/out: buffer where to store; NOTE
2533
that BLOBs are not in themselves
2534
stored here: the caller must allocate
2535
and copy the BLOB into buffer before,
2536
and pass the pointer to the BLOB in
2538
const mysql_row_templ_t* templ,
2539
/*!< in: MySQL column template.
2540
Its following fields are referenced:
2541
type, is_unsigned, mysql_col_len,
2542
mbminlen, mbmaxlen */
2543
const byte* data, /*!< in: data to store */
2544
ulint len) /*!< in: length of the data */
2548
ut_ad(len != UNIV_SQL_NULL);
2549
UNIV_MEM_ASSERT_RW(data, len);
2551
switch (templ->type) {
2552
const byte* field_end;
2555
/* Convert integer data from Innobase to a little-endian
2556
format, sign bit restored to normal */
2569
if (!templ->is_unsigned) {
2570
dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2573
ut_ad(templ->mysql_col_len == len);
2579
field_end = dest + templ->mysql_col_len;
2581
if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2582
/* This is a >= 5.0.3 type true VARCHAR. Store the
2583
length of the data to the first byte or the first
2584
two bytes of dest. */
2586
dest = row_mysql_store_true_var_len(
2587
dest, len, templ->mysql_length_bytes);
2590
/* Copy the actual data */
2591
ut_memcpy(dest, data, len);
2593
/* Pad with trailing spaces. We pad with spaces also the
2594
unused end of a >= 5.0.3 true VARCHAR column, just in case
2595
MySQL expects its contents to be deterministic. */
2599
ut_ad(templ->mbminlen <= templ->mbmaxlen);
2601
/* We treat some Unicode charset strings specially. */
2602
switch (templ->mbminlen) {
2604
/* InnoDB should never have stripped partial
2605
UTF-32 characters. */
2609
/* A space char is two bytes,
2610
0x0020 in UCS2 and UTF-16 */
2612
if (UNIV_UNLIKELY(len & 1)) {
2613
/* A 0x20 has been stripped from the column.
2616
if (pad < field_end) {
2622
row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
2626
/* Store a pointer to the BLOB buffer to dest: the BLOB was
2627
already copied to the buffer in row_sel_store_mysql_rec */
2629
row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2634
memcpy(dest, data, len);
2636
ut_ad(templ->mysql_col_len >= len);
2637
ut_ad(templ->mbmaxlen >= templ->mbminlen);
2639
ut_ad(templ->mbmaxlen > templ->mbminlen
2640
|| templ->mysql_col_len == len);
2641
/* The following assertion would fail for old tables
2642
containing UTF-8 ENUM columns due to Bug #9526. */
2643
ut_ad(!templ->mbmaxlen
2644
|| !(templ->mysql_col_len % templ->mbmaxlen));
2645
ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
2647
if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
2648
/* Pad with spaces. This undoes the stripping
2649
done in row0mysql.c, function
2650
row_mysql_store_col_in_innobase_format(). */
2652
memset(dest + len, 0x20, templ->mysql_col_len - len);
2658
case DATA_SYS_CHILD:
2660
/* These column types should never be shipped to MySQL. */
2664
case DATA_FIXBINARY:
2668
/* Above are the valid column types for MySQL data. */
2669
#endif /* UNIV_DEBUG */
2670
ut_ad(templ->mysql_col_len == len);
2671
memcpy(dest, data, len);
2675
/**************************************************************//**
2676
Convert a row in the Innobase format to a row in the MySQL format.
2677
Note that the template in prebuilt may advise us to copy only a few
2678
columns to mysql_rec, other columns are left blank. All columns may not
2679
be needed in the query.
2680
@return TRUE on success, FALSE if not all columns could be retrieved */
2683
__attribute__((warn_unused_result))
2686
row_sel_store_mysql_rec(
2687
/*====================*/
2688
byte* mysql_rec, /*!< out: row in the MySQL format */
2689
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2690
const rec_t* rec, /*!< in: Innobase record in the index
2691
which was described in prebuilt's
2692
template, or in the clustered index;
2693
must be protected by a page latch */
2694
ibool rec_clust, /*!< in: TRUE if rec is in the
2695
clustered index instead of
2697
const ulint* offsets) /*!< in: array returned by
2698
rec_get_offsets(rec) */
2700
mem_heap_t* extern_field_heap = NULL;
2704
ut_ad(prebuilt->mysql_template);
2705
ut_ad(prebuilt->default_rec);
2706
ut_ad(rec_offs_validate(rec, NULL, offsets));
2707
ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
2709
if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2710
mem_heap_free(prebuilt->blob_heap);
2711
prebuilt->blob_heap = NULL;
2714
for (i = 0; i < prebuilt->n_template ; i++) {
2716
const mysql_row_templ_t*templ = prebuilt->mysql_template + i;
2721
field_no = rec_clust
2722
? templ->clust_rec_field_no : templ->rec_field_no;
2724
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
2726
/* Copy an externally stored field to the temporary
2729
ut_a(!prebuilt->trx->has_search_latch);
2731
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2732
if (prebuilt->blob_heap == NULL) {
2733
prebuilt->blob_heap = mem_heap_create(
2737
heap = prebuilt->blob_heap;
2740
= mem_heap_create(UNIV_PAGE_SIZE);
2742
heap = extern_field_heap;
2745
/* NOTE: if we are retrieving a big BLOB, we may
2746
already run out of memory in the next call, which
2749
data = btr_rec_copy_externally_stored_field(
2751
dict_table_zip_size(prebuilt->table),
2752
field_no, &len, heap);
2754
if (UNIV_UNLIKELY(!data)) {
2755
/* The externally stored field
2756
was not written yet. This
2757
record should only be seen by
2758
recv_recovery_rollback_active()
2759
or any TRX_ISO_READ_UNCOMMITTED
2762
if (extern_field_heap) {
2763
mem_heap_free(extern_field_heap);
2769
if (UNIV_UNLIKELY(!data)) {
2770
/* The externally stored field
2771
was not written yet. This
2772
record should only be seen by
2773
recv_recovery_rollback_active()
2774
or any TRX_ISO_READ_UNCOMMITTED
2777
if (extern_field_heap) {
2778
mem_heap_free(extern_field_heap);
2784
ut_a(len != UNIV_SQL_NULL);
2786
/* Field is stored in the row. */
2788
data = rec_get_nth_field(rec, offsets, field_no, &len);
2790
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
2791
&& len != UNIV_SQL_NULL) {
2793
/* It is a BLOB field locally stored in the
2794
InnoDB record: we MUST copy its contents to
2795
prebuilt->blob_heap here because later code
2796
assumes all BLOB values have been copied to a
2799
if (prebuilt->blob_heap == NULL) {
2800
prebuilt->blob_heap = mem_heap_create(
2804
data = static_cast<byte *>(memcpy(mem_heap_alloc(
2805
prebuilt->blob_heap, len),
2810
if (len != UNIV_SQL_NULL) {
2811
row_sel_field_store_in_mysql_format(
2812
mysql_rec + templ->mysql_col_offset,
2816
if (extern_field_heap) {
2817
mem_heap_free(extern_field_heap);
2818
extern_field_heap = NULL;
2821
if (templ->mysql_null_bit_mask) {
2822
/* It is a nullable column with a non-NULL
2824
mysql_rec[templ->mysql_null_byte_offset]
2825
&= ~(byte) templ->mysql_null_bit_mask;
2828
/* MySQL assumes that the field for an SQL
2829
NULL value is set to the default value. */
2831
UNIV_MEM_ASSERT_RW(prebuilt->default_rec
2832
+ templ->mysql_col_offset,
2833
templ->mysql_col_len);
2834
mysql_rec[templ->mysql_null_byte_offset]
2835
|= (byte) templ->mysql_null_bit_mask;
2836
memcpy(mysql_rec + templ->mysql_col_offset,
2837
(const byte*) prebuilt->default_rec
2838
+ templ->mysql_col_offset,
2839
templ->mysql_col_len);
2846
/*********************************************************************//**
2847
Builds a previous version of a clustered index record for a consistent read
2848
@return DB_SUCCESS or error code */
2851
row_sel_build_prev_vers_for_mysql(
2852
/*==============================*/
2853
read_view_t* read_view, /*!< in: read view */
2854
dict_index_t* clust_index, /*!< in: clustered index */
2855
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2856
const rec_t* rec, /*!< in: record in a clustered index */
2857
ulint** offsets, /*!< in/out: offsets returned by
2858
rec_get_offsets(rec, clust_index) */
2859
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
2860
the offsets are allocated */
2861
rec_t** old_vers, /*!< out: old version, or NULL if the
2862
record does not exist in the view:
2863
i.e., it was freshly inserted
2865
mtr_t* mtr) /*!< in: mtr */
2869
if (prebuilt->old_vers_heap) {
2870
mem_heap_empty(prebuilt->old_vers_heap);
2872
prebuilt->old_vers_heap = mem_heap_create(200);
2875
err = row_vers_build_for_consistent_read(
2876
rec, mtr, clust_index, offsets, read_view, offset_heap,
2877
prebuilt->old_vers_heap, old_vers);
2881
/*********************************************************************//**
2882
Retrieves the clustered index record corresponding to a record in a
2883
non-clustered index. Does the necessary locking. Used in the MySQL
2885
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
2888
row_sel_get_clust_rec_for_mysql(
2889
/*============================*/
2890
row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
2891
dict_index_t* sec_index,/*!< in: secondary index where rec resides */
2892
const rec_t* rec, /*!< in: record in a non-clustered index; if
2893
this is a locking read, then rec is not
2894
allowed to be delete-marked, and that would
2895
not make sense either */
2896
que_thr_t* thr, /*!< in: query thread */
2897
const rec_t** out_rec,/*!< out: clustered record or an old version of
2898
it, NULL if the old version did not exist
2899
in the read view, i.e., it was a fresh
2901
ulint** offsets,/*!< in: offsets returned by
2902
rec_get_offsets(rec, sec_index);
2903
out: offsets returned by
2904
rec_get_offsets(out_rec, clust_index) */
2905
mem_heap_t** offset_heap,/*!< in/out: memory heap from which
2906
the offsets are allocated */
2907
mtr_t* mtr) /*!< in: mtr used to get access to the
2908
non-clustered record; the same mtr is used to
2909
access the clustered index */
2911
dict_index_t* clust_index;
2912
const rec_t* clust_rec;
2918
trx = thr_get_trx(thr);
2920
row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
2921
sec_index, *offsets, trx);
2923
clust_index = dict_table_get_first_index(sec_index->table);
2925
btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
2926
PAGE_CUR_LE, BTR_SEARCH_LEAF,
2927
prebuilt->clust_pcur, 0, mtr);
2929
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
2931
prebuilt->clust_pcur->trx_if_known = trx;
2933
/* Note: only if the search ends up on a non-infimum record is the
2934
low_match value the real match to the search tuple */
2936
if (!page_rec_is_user_rec(clust_rec)
2937
|| btr_pcur_get_low_match(prebuilt->clust_pcur)
2938
< dict_index_get_n_unique(clust_index)) {
2940
/* In a rare case it is possible that no clust rec is found
2941
for a delete-marked secondary index record: if in row0umod.c
2942
in row_undo_mod_remove_clust_low() we have already removed
2943
the clust rec, while purge is still cleaning and removing
2944
secondary index records associated with earlier versions of
2945
the clustered index record. In that case we know that the
2946
clustered index record did not exist in the read view of
2949
if (!rec_get_deleted_flag(rec,
2950
dict_table_is_comp(sec_index->table))
2951
|| prebuilt->select_lock_type != LOCK_NONE) {
2952
ut_print_timestamp(stderr);
2953
fputs(" InnoDB: error clustered record"
2954
" for sec rec not found\n"
2955
"InnoDB: ", stderr);
2956
dict_index_name_print(stderr, trx, sec_index);
2958
"InnoDB: sec index record ", stderr);
2959
rec_print(stderr, rec, sec_index);
2961
"InnoDB: clust index record ", stderr);
2962
rec_print(stderr, clust_rec, clust_index);
2964
trx_print(stderr, trx, 600);
2967
"InnoDB: Submit a detailed bug report"
2968
" to http://bugs.mysql.com\n", stderr);
2977
*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
2978
ULINT_UNDEFINED, offset_heap);
2980
if (prebuilt->select_lock_type != LOCK_NONE) {
2981
/* Try to place a lock on the index record; we are searching
2982
the clust rec with a unique condition, hence
2983
we set a LOCK_REC_NOT_GAP type lock */
2985
err = lock_clust_rec_read_check_and_lock(
2986
0, btr_pcur_get_block(prebuilt->clust_pcur),
2987
clust_rec, clust_index, *offsets,
2988
static_cast<lock_mode>(prebuilt->select_lock_type),
2989
LOCK_REC_NOT_GAP, thr);
2992
case DB_SUCCESS_LOCKED_REC:
2998
/* This is a non-locking consistent read: if necessary, fetch
2999
a previous version of the record */
3003
/* If the isolation level allows reading of uncommitted data,
3004
then we never look for an earlier version */
3006
if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3007
&& !lock_clust_rec_cons_read_sees(
3008
clust_rec, clust_index, *offsets,
3011
/* The following call returns 'offsets' associated with
3013
err = static_cast<db_err>(row_sel_build_prev_vers_for_mysql(
3014
trx->read_view, clust_index, prebuilt,
3015
clust_rec, offsets, offset_heap, &old_vers,
3018
if (err != DB_SUCCESS || old_vers == NULL) {
3023
clust_rec = old_vers;
3026
/* If we had to go to an earlier version of row or the
3027
secondary index record is delete marked, then it may be that
3028
the secondary index record corresponding to clust_rec
3029
(or old_vers) is not rec; in that case we must ignore
3030
such row because in our snapshot rec would not have existed.
3031
Remember that from rec we cannot see directly which transaction
3032
id corresponds to it: we have to go to the clustered index
3033
record. A query where we want to fetch all rows where
3034
the secondary index value is in some interval would return
3035
a wrong result if we would not drop rows which we come to
3036
visit through secondary index records that would not really
3037
exist in our snapshot. */
3041
|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
3042
|| rec_get_deleted_flag(rec, dict_table_is_comp(
3044
&& !row_sel_sec_rec_is_for_clust_rec(
3045
rec, sec_index, clust_rec, clust_index)) {
3047
#ifdef UNIV_SEARCH_DEBUG
3049
ut_a(clust_rec == NULL
3050
|| row_sel_sec_rec_is_for_clust_rec(
3051
rec, sec_index, clust_rec, clust_index));
3059
*out_rec = clust_rec;
3061
if (prebuilt->select_lock_type != LOCK_NONE) {
3062
/* We may use the cursor in update or in unlock_row():
3063
store its position */
3065
btr_pcur_store_position(prebuilt->clust_pcur, mtr);
3072
/********************************************************************//**
3073
Restores cursor position after it has been stored. We have to take into
3074
account that the record cursor was positioned on may have been deleted.
3075
Then we may have to move the cursor one step up or down.
3076
@return TRUE if we may need to process the record the cursor is now
3077
positioned on (i.e. we should not go to the next record yet) */
3080
sel_restore_position_for_mysql(
3081
/*===========================*/
3082
ibool* same_user_rec, /*!< out: TRUE if we were able to restore
3083
the cursor on a user record with the
3084
same ordering prefix in in the
3086
ulint latch_mode, /*!< in: latch mode wished in
3088
btr_pcur_t* pcur, /*!< in: cursor whose position
3090
ibool moves_up, /*!< in: TRUE if the cursor moves up
3092
mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
3096
ulint relative_position;
3098
relative_position = pcur->rel_pos;
3100
success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3102
*same_user_rec = success;
3104
if (relative_position == BTR_PCUR_ON) {
3110
btr_pcur_move_to_next(pcur, mtr);
3116
if (relative_position == BTR_PCUR_AFTER
3117
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3123
if (btr_pcur_is_on_user_rec(pcur)) {
3124
btr_pcur_move_to_prev(pcur, mtr);
3130
ut_ad(relative_position == BTR_PCUR_BEFORE
3131
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3133
if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3134
btr_pcur_move_to_next(pcur, mtr);
3140
/********************************************************************//**
3141
Pops a cached row for MySQL from the fetch cache. */
3144
row_sel_pop_cached_row_for_mysql(
3145
/*=============================*/
3146
byte* buf, /*!< in/out: buffer where to copy the
3148
row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
3151
const mysql_row_templ_t*templ;
3153
ut_ad(prebuilt->n_fetch_cached > 0);
3154
ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3156
if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3157
/* Copy cache record field by field, don't touch fields that
3158
are not covered by current key */
3159
cached_rec = prebuilt->fetch_cache[
3160
prebuilt->fetch_cache_first];
3162
for (i = 0; i < prebuilt->n_template; i++) {
3163
templ = prebuilt->mysql_template + i;
3164
#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
3165
UNIV_MEM_ASSERT_RW(cached_rec
3166
+ templ->mysql_col_offset,
3167
templ->mysql_col_len);
3169
ut_memcpy(buf + templ->mysql_col_offset,
3170
cached_rec + templ->mysql_col_offset,
3171
templ->mysql_col_len);
3172
/* Copy NULL bit of the current field from cached_rec
3174
if (templ->mysql_null_bit_mask) {
3175
buf[templ->mysql_null_byte_offset]
3176
^= (buf[templ->mysql_null_byte_offset]
3177
^ cached_rec[templ->mysql_null_byte_offset])
3178
& (byte)templ->mysql_null_bit_mask;
3183
#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
3184
UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache
3185
[prebuilt->fetch_cache_first],
3186
prebuilt->mysql_prefix_len);
3189
prebuilt->fetch_cache[prebuilt->fetch_cache_first],
3190
prebuilt->mysql_prefix_len);
3192
prebuilt->n_fetch_cached--;
3193
prebuilt->fetch_cache_first++;
3195
if (prebuilt->n_fetch_cached == 0) {
3196
prebuilt->fetch_cache_first = 0;
3200
/********************************************************************//**
3201
Pushes a row for MySQL to the fetch cache.
3202
@return TRUE on success, FALSE if the record contains incomplete BLOBs */
3205
__attribute__((warn_unused_result))
3208
row_sel_push_cache_row_for_mysql(
3209
/*=============================*/
3210
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
3211
const rec_t* rec, /*!< in: record to push, in the index
3212
which was described in prebuilt's
3213
template, or in the clustered index;
3214
must be protected by a page latch */
3215
ibool rec_clust, /*!< in: TRUE if rec is in the
3216
clustered index instead of
3218
const ulint* offsets) /*!< in: rec_get_offsets(rec) */
3223
ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3224
ut_ad(rec_offs_validate(rec, NULL, offsets));
3225
ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
3226
ut_a(!prebuilt->templ_contains_blob);
3228
if (prebuilt->fetch_cache[0] == NULL) {
3229
/* Allocate memory for the fetch cache */
3231
for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
3233
/* A user has reported memory corruption in these
3234
buffers in Linux. Put magic numbers there to help
3235
to track a possible bug. */
3237
buf = static_cast<byte *>(mem_alloc(prebuilt->mysql_row_len + 8));
3239
prebuilt->fetch_cache[i] = buf + 4;
3241
mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
3242
mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
3243
ROW_PREBUILT_FETCH_MAGIC_N);
3247
ut_ad(prebuilt->fetch_cache_first == 0);
3248
UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3249
prebuilt->mysql_row_len);
3251
if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
3252
prebuilt->fetch_cache[
3253
prebuilt->n_fetch_cached],
3254
prebuilt, rec, rec_clust, offsets))) {
3258
prebuilt->n_fetch_cached++;
3262
/*********************************************************************//**
3263
Tries to do a shortcut to fetch a clustered index record with a unique key,
3264
using the hash index if possible (not always). We assume that the search
3265
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
3266
btr search latch has been locked in S-mode.
3267
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3270
row_sel_try_search_shortcut_for_mysql(
3271
/*==================================*/
3272
const rec_t** out_rec,/*!< out: record if found */
3273
row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
3274
ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
3275
mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
3276
mtr_t* mtr) /*!< in: started mtr */
3278
dict_index_t* index = prebuilt->index;
3279
const dtuple_t* search_tuple = prebuilt->search_tuple;
3280
btr_pcur_t* pcur = prebuilt->pcur;
3281
trx_t* trx = prebuilt->trx;
3284
ut_ad(dict_index_is_clust(index));
3285
ut_ad(!prebuilt->templ_contains_blob);
3287
#ifndef UNIV_SEARCH_DEBUG
3288
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3289
BTR_SEARCH_LEAF, pcur,
3292
#else /* UNIV_SEARCH_DEBUG */
3293
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3294
BTR_SEARCH_LEAF, pcur,
3297
#endif /* UNIV_SEARCH_DEBUG */
3298
rec = btr_pcur_get_rec(pcur);
3300
if (!page_rec_is_user_rec(rec)) {
3305
/* As the cursor is now placed on a user record after a search with
3306
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3307
fields in the user record matched to the search tuple */
3309
if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3311
return(SEL_EXHAUSTED);
3314
/* This is a non-locking consistent read: if necessary, fetch
3315
a previous version of the record */
3317
*offsets = rec_get_offsets(rec, index, *offsets,
3318
ULINT_UNDEFINED, heap);
3320
if (!lock_clust_rec_cons_read_sees(rec, index,
3321
*offsets, trx->read_view)) {
3326
if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3328
return(SEL_EXHAUSTED);
3336
/********************************************************************//**
3337
Searches for rows in the database. This is used in the interface to
3338
MySQL. This function opens a cursor, and also implements fetch next
3339
and fetch prev. NOTE that if we do a search with a full key value
3340
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3341
position and fetch next or fetch prev must not be tried to the cursor!
3342
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3343
DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3346
row_search_for_mysql(
3347
/*=================*/
3348
byte* buf, /*!< in/out: buffer for the fetched
3349
row in the MySQL format */
3350
ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
3351
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
3352
table handle; this contains the info
3353
of search_tuple, index; if search
3354
tuple contains 0 fields then we
3355
position the cursor at the start or
3356
the end of the index, depending on
3358
ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
3359
ROW_SEL_EXACT_PREFIX */
3360
ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
3361
ROW_SEL_PREV; NOTE: if this is != 0,
3362
then prebuilt must have a pcur
3363
with stored position! In opening of a
3364
cursor 'direction' should be 0. */
3366
dict_index_t* index = prebuilt->index;
3367
ibool comp = dict_table_is_comp(index->table);
3368
const dtuple_t* search_tuple = prebuilt->search_tuple;
3369
btr_pcur_t* pcur = prebuilt->pcur;
3370
trx_t* trx = prebuilt->trx;
3371
dict_index_t* clust_index;
3374
const rec_t* result_rec;
3375
const rec_t* clust_rec;
3376
ulint err = DB_SUCCESS;
3377
ibool unique_search = FALSE;
3378
ibool unique_search_from_clust_index = FALSE;
3379
ibool mtr_has_extra_clust_latch = FALSE;
3380
ibool moves_up = FALSE;
3381
ibool set_also_gap_locks = TRUE;
3382
/* if the query is a plain locking SELECT, and the isolation level
3383
is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3384
ibool did_semi_consistent_read = FALSE;
3385
/* if the returned record was locked and we did a semi-consistent
3386
read (fetch the newest committed version), then this is set to
3388
#ifdef UNIV_SEARCH_DEBUG
3390
#endif /* UNIV_SEARCH_DEBUG */
3392
ibool same_user_rec;
3394
mem_heap_t* heap = NULL;
3395
ulint offsets_[REC_OFFS_NORMAL_SIZE];
3396
ulint* offsets = offsets_;
3397
ibool table_lock_waited = FALSE;
3399
rec_offs_init(offsets_);
3401
ut_ad(index && pcur && search_tuple);
3402
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3404
if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3405
ut_print_timestamp(stderr);
3406
fprintf(stderr, " InnoDB: Error:\n"
3407
"InnoDB: MySQL is trying to use a table handle"
3408
" but the .ibd file for\n"
3409
"InnoDB: table %s does not exist.\n"
3410
"InnoDB: Have you deleted the .ibd file"
3411
" from the database directory under\n"
3412
"InnoDB: the MySQL datadir, or have you used"
3413
" DISCARD TABLESPACE?\n"
3414
"InnoDB: Look from\n"
3415
"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
3416
"InnoDB: how you can resolve the problem.\n",
3417
prebuilt->table->name);
3422
if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
3424
return(DB_MISSING_HISTORY);
3427
if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3429
"InnoDB: Error: trying to free a corrupt\n"
3430
"InnoDB: table handle. Magic n %lu, table name ",
3431
(ulong) prebuilt->magic_n);
3432
ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3435
mem_analyze_corruption(prebuilt);
3441
fprintf(stderr, "Match mode %lu\n search tuple ",
3442
(ulong) match_mode);
3443
dtuple_print(search_tuple);
3444
fprintf(stderr, "N tables locked %lu\n",
3445
(ulong) trx->mysql_n_tables_locked);
3447
/*-------------------------------------------------------------*/
3448
/* PHASE 0: Release a possible s-latch we are holding on the
3449
adaptive hash index latch if there is someone waiting behind */
3451
if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3452
&& trx->has_search_latch) {
3454
/* There is an x-latch request on the adaptive hash index:
3455
release the s-latch to reduce starvation and wait for
3456
BTR_SEA_TIMEOUT rounds before trying to keep it again over
3459
rw_lock_s_unlock(&btr_search_latch);
3460
trx->has_search_latch = FALSE;
3462
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3465
/* Reset the new record lock info if srv_locks_unsafe_for_binlog
3466
is set or session is using a READ COMMITED isolation level. Then
3467
we are able to remove the record locks set here on an individual
3469
prebuilt->new_rec_locks = 0;
3471
/*-------------------------------------------------------------*/
3472
/* PHASE 1: Try to pop the row from the prefetch cache */
3474
if (UNIV_UNLIKELY(direction == 0)) {
3475
trx->op_info = "starting index read";
3477
prebuilt->n_rows_fetched = 0;
3478
prebuilt->n_fetch_cached = 0;
3479
prebuilt->fetch_cache_first = 0;
3481
if (prebuilt->sel_graph == NULL) {
3482
/* Build a dummy select query graph */
3483
row_prebuild_sel_graph(prebuilt);
3486
trx->op_info = "fetching rows";
3488
if (prebuilt->n_rows_fetched == 0) {
3489
prebuilt->fetch_direction = direction;
3492
if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3493
if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3495
/* TODO: scrollable cursor: restore cursor to
3496
the place of the latest returned row,
3497
or better: prevent caching for a scroll
3501
prebuilt->n_rows_fetched = 0;
3502
prebuilt->n_fetch_cached = 0;
3503
prebuilt->fetch_cache_first = 0;
3505
} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3506
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3508
prebuilt->n_rows_fetched++;
3515
if (prebuilt->fetch_cache_first > 0
3516
&& prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3518
/* The previous returned row was popped from the fetch
3519
cache, but the cache was not full at the time of the
3520
popping: no more rows can exist in the result set */
3522
err = DB_RECORD_NOT_FOUND;
3526
prebuilt->n_rows_fetched++;
3528
if (prebuilt->n_rows_fetched > 1000000000) {
3529
/* Prevent wrap-over */
3530
prebuilt->n_rows_fetched = 500000000;
3533
mode = pcur->search_mode;
3536
/* In a search where at most one record in the index may match, we
3537
can use a LOCK_REC_NOT_GAP type record lock when locking a
3538
non-delete-marked matching record.
3540
Note that in a unique secondary index there may be different
3541
delete-marked versions of a record where only the primary key
3542
values differ: thus in a secondary index we must use next-key
3543
locks when locking delete-marked records. */
3545
if (match_mode == ROW_SEL_EXACT
3546
&& dict_index_is_unique(index)
3547
&& dtuple_get_n_fields(search_tuple)
3548
== dict_index_get_n_unique(index)
3549
&& (dict_index_is_clust(index)
3550
|| !dtuple_contains_null(search_tuple))) {
3552
/* Note above that a UNIQUE secondary index can contain many
3553
rows with the same key value if one of the columns is the SQL
3554
null. A clustered index under MySQL can never contain null
3555
columns because we demand that all the columns in primary key
3558
unique_search = TRUE;
3560
/* Even if the condition is unique, MySQL seems to try to
3561
retrieve also a second row if a primary key contains more than
3564
if (UNIV_UNLIKELY(direction != 0)) {
3566
err = DB_RECORD_NOT_FOUND;
3573
/*-------------------------------------------------------------*/
3574
/* PHASE 2: Try fast adaptive hash index search if possible */
3576
/* Next test if this is the special case where we can use the fast
3577
adaptive hash index to try the search. Since we must release the
3578
search system latch when we retrieve an externally stored field, we
3579
cannot use the adaptive hash index in a search in the case the row
3580
may be long and there may be externally stored fields */
3582
if (UNIV_UNLIKELY(direction == 0)
3584
&& dict_index_is_clust(index)
3585
&& !prebuilt->templ_contains_blob
3586
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3590
unique_search_from_clust_index = TRUE;
3592
if (trx->mysql_n_tables_locked == 0
3593
&& prebuilt->select_lock_type == LOCK_NONE
3594
&& trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3595
&& trx->read_view) {
3597
/* This is a SELECT query done as a consistent read,
3598
and the read view has already been allocated:
3599
let us try a search shortcut through the hash
3601
NOTE that we must also test that
3602
mysql_n_tables_locked == 0, because this might
3603
also be INSERT INTO ... SELECT ... or
3604
CREATE TABLE ... SELECT ... . Our algorithm is
3605
NOT prepared to inserts interleaved with the SELECT,
3606
and if we try that, we can deadlock on the adaptive
3607
hash index semaphore! */
3609
#ifndef UNIV_SEARCH_DEBUG
3610
if (!trx->has_search_latch) {
3611
rw_lock_s_lock(&btr_search_latch);
3612
trx->has_search_latch = TRUE;
3615
switch (row_sel_try_search_shortcut_for_mysql(
3616
&rec, prebuilt, &offsets, &heap,
3619
#ifdef UNIV_SEARCH_DEBUG
3620
ut_a(0 == cmp_dtuple_rec(search_tuple,
3623
/* At this point, rec is protected by
3624
a page latch that was acquired by
3625
row_sel_try_search_shortcut_for_mysql().
3626
The latch will not be released until
3627
mtr_commit(&mtr). */
3628
ut_ad(!rec_get_deleted_flag(rec, comp));
3630
if (!row_sel_store_mysql_rec(buf, prebuilt,
3633
/* Only fresh inserts may contain
3634
incomplete externally stored
3635
columns. Pretend that such
3636
records do not exist. Such
3637
records may only be accessed
3638
at the READ UNCOMMITTED
3639
isolation level or when
3640
rolling back a recovered
3641
transaction. Rollback happens
3642
at a lower level, not here. */
3643
ut_a(trx->isolation_level
3644
== TRX_ISO_READ_UNCOMMITTED);
3646
/* Proceed as in case SEL_RETRY. */
3652
/* ut_print_name(stderr, index->name);
3653
fputs(" shortcut\n", stderr); */
3658
goto release_search_latch_if_needed;
3663
/* ut_print_name(stderr, index->name);
3664
fputs(" record not found 2\n", stderr); */
3666
err = DB_RECORD_NOT_FOUND;
3667
release_search_latch_if_needed:
3668
if (trx->search_latch_timeout > 0
3669
&& trx->has_search_latch) {
3671
trx->search_latch_timeout--;
3673
rw_lock_s_unlock(&btr_search_latch);
3674
trx->has_search_latch = FALSE;
3677
/* NOTE that we do NOT store the cursor
3693
/*-------------------------------------------------------------*/
3694
/* PHASE 3: Open or restore index cursor position */
3696
if (trx->has_search_latch) {
3697
rw_lock_s_unlock(&btr_search_latch);
3698
trx->has_search_latch = FALSE;
3701
ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
3702
ut_ad(trx->conc_state == TRX_NOT_STARTED
3703
|| trx->conc_state == TRX_ACTIVE);
3704
ut_ad(prebuilt->sql_stat_start
3705
|| prebuilt->select_lock_type != LOCK_NONE
3708
ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
3709
ut_ad(trx->conc_state == TRX_NOT_STARTED
3710
|| trx->conc_state == TRX_ACTIVE);
3711
ut_ad(prebuilt->sql_stat_start
3712
|| prebuilt->select_lock_type != LOCK_NONE
3715
trx_start_if_not_started(trx);
3717
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3718
&& prebuilt->select_lock_type != LOCK_NONE
3719
&& trx->mysql_thd != NULL
3720
&& thd_is_select(trx->mysql_thd)) {
3721
/* It is a plain locking SELECT and the isolation
3722
level is low: do not lock gaps */
3724
set_also_gap_locks = FALSE;
3727
/* Note that if the search mode was GE or G, then the cursor
3728
naturally moves upward (in fetch next) in alphabetical order,
3729
otherwise downward */
3731
if (UNIV_UNLIKELY(direction == 0)) {
3732
if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3735
} else if (direction == ROW_SEL_NEXT) {
3739
thr = que_fork_get_first_thr(prebuilt->sel_graph);
3741
que_thr_move_to_run_state_for_mysql(thr, trx);
3743
clust_index = dict_table_get_first_index(index->table);
3745
/* Do some start-of-statement preparations */
3747
if (!prebuilt->sql_stat_start) {
3748
/* No need to set an intention lock or assign a read view */
3750
if (trx->read_view == NULL
3751
&& prebuilt->select_lock_type == LOCK_NONE) {
3753
fputs("InnoDB: Error: MySQL is trying to"
3754
" perform a consistent read\n"
3755
"InnoDB: but the read view is not assigned!\n",
3757
trx_print(stderr, trx, 600);
3758
fputc('\n', stderr);
3761
} else if (prebuilt->select_lock_type == LOCK_NONE) {
3762
/* This is a consistent read */
3763
/* Assign a read view for the query */
3765
trx_assign_read_view(trx);
3766
prebuilt->sql_stat_start = FALSE;
3769
err = lock_table(0, index->table,
3770
prebuilt->select_lock_type == LOCK_S
3771
? LOCK_IS : LOCK_IX, thr);
3773
if (err != DB_SUCCESS) {
3775
table_lock_waited = TRUE;
3776
goto lock_table_wait;
3778
prebuilt->sql_stat_start = FALSE;
3781
/* Open or restore index cursor position */
3783
if (UNIV_LIKELY(direction != 0)) {
3784
ibool need_to_process = sel_restore_position_for_mysql(
3785
&same_user_rec, BTR_SEARCH_LEAF,
3786
pcur, moves_up, &mtr);
3788
if (UNIV_UNLIKELY(need_to_process)) {
3789
if (UNIV_UNLIKELY(prebuilt->row_read_type
3790
== ROW_READ_DID_SEMI_CONSISTENT)) {
3791
/* We did a semi-consistent read,
3792
but the record was removed in
3794
prebuilt->row_read_type
3795
= ROW_READ_TRY_SEMI_CONSISTENT;
3797
} else if (UNIV_LIKELY(prebuilt->row_read_type
3798
!= ROW_READ_DID_SEMI_CONSISTENT)) {
3800
/* The cursor was positioned on the record
3801
that we returned previously. If we need
3802
to repeat a semi-consistent read as a
3803
pessimistic locking read, the record
3804
cannot be skipped. */
3809
} else if (dtuple_get_n_fields(search_tuple) > 0) {
3811
btr_pcur_open_with_no_init(index, search_tuple, mode,
3815
pcur->trx_if_known = trx;
3817
rec = btr_pcur_get_rec(pcur);
3820
&& !page_rec_is_supremum(rec)
3821
&& set_also_gap_locks
3822
&& !(srv_locks_unsafe_for_binlog
3823
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3824
&& prebuilt->select_lock_type != LOCK_NONE) {
3826
/* Try to place a gap lock on the next index record
3827
to prevent phantoms in ORDER BY ... DESC queries */
3828
const rec_t* next = page_rec_get_next_const(rec);
3830
offsets = rec_get_offsets(next, index, offsets,
3831
ULINT_UNDEFINED, &heap);
3832
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3833
next, index, offsets,
3834
prebuilt->select_lock_type,
3838
case DB_SUCCESS_LOCKED_REC:
3843
goto lock_wait_or_error;
3847
if (mode == PAGE_CUR_G) {
3848
btr_pcur_open_at_index_side(
3849
TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3851
} else if (mode == PAGE_CUR_L) {
3852
btr_pcur_open_at_index_side(
3853
FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3859
/*-------------------------------------------------------------*/
3860
/* PHASE 4: Look for matching records in a loop */
3862
rec = btr_pcur_get_rec(pcur);
3863
ut_ad(!!page_rec_is_comp(rec) == comp);
3864
#ifdef UNIV_SEARCH_DEBUG
3866
fputs("Using ", stderr);
3867
dict_index_name_print(stderr, index);
3868
fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3869
page_get_page_no(page_align(rec)));
3872
#endif /* UNIV_SEARCH_DEBUG */
3874
if (page_rec_is_infimum(rec)) {
3876
/* The infimum record on a page cannot be in the result set,
3877
and neither can a record lock be placed on it: we skip such
3883
if (page_rec_is_supremum(rec)) {
3885
if (set_also_gap_locks
3886
&& !(srv_locks_unsafe_for_binlog
3887
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3888
&& prebuilt->select_lock_type != LOCK_NONE) {
3890
/* Try to place a lock on the index record */
3892
/* If innodb_locks_unsafe_for_binlog option is used
3893
or this session is using a READ COMMITTED isolation
3894
level we do not lock gaps. Supremum record is really
3895
a gap and therefore we do not set locks there. */
3897
offsets = rec_get_offsets(rec, index, offsets,
3898
ULINT_UNDEFINED, &heap);
3899
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3900
rec, index, offsets,
3901
prebuilt->select_lock_type,
3902
LOCK_ORDINARY, thr);
3905
case DB_SUCCESS_LOCKED_REC:
3910
goto lock_wait_or_error;
3913
/* A page supremum record cannot be in the result set: skip
3914
it now that we have placed a possible lock on it */
3919
/*-------------------------------------------------------------*/
3920
/* Do sanity checks in case our cursor has bumped into page
3924
next_offs = rec_get_next_offs(rec, TRUE);
3925
if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3930
next_offs = rec_get_next_offs(rec, FALSE);
3931
if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
3937
if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
3940
if (srv_force_recovery == 0 || moves_up == FALSE) {
3941
ut_print_timestamp(stderr);
3942
buf_page_print(page_align(rec), 0);
3944
"\nInnoDB: rec address %p,"
3945
" buf block fix count %lu\n",
3946
(void*) rec, (ulong)
3947
btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
3948
->page.buf_fix_count);
3950
"InnoDB: Index corruption: rec offs %lu"
3951
" next offs %lu, page no %lu,\n"
3953
(ulong) page_offset(rec),
3955
(ulong) page_get_page_no(page_align(rec)));
3956
dict_index_name_print(stderr, trx, index);
3957
fputs(". Run CHECK TABLE. You may need to\n"
3958
"InnoDB: restore from a backup, or"
3959
" dump + drop + reimport the table.\n",
3962
err = DB_CORRUPTION;
3964
goto lock_wait_or_error;
3966
/* The user may be dumping a corrupt table. Jump
3967
over the corruption to recover as much as possible. */
3970
"InnoDB: Index corruption: rec offs %lu"
3971
" next offs %lu, page no %lu,\n"
3973
(ulong) page_offset(rec),
3975
(ulong) page_get_page_no(page_align(rec)));
3976
dict_index_name_print(stderr, trx, index);
3977
fputs(". We try to skip the rest of the page.\n",
3980
btr_pcur_move_to_last_on_page(pcur, &mtr);
3985
/*-------------------------------------------------------------*/
3987
/* Calculate the 'offsets' associated with 'rec' */
3989
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3991
if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
3992
if (!rec_validate(rec, offsets)
3993
|| !btr_index_rec_validate(rec, index, FALSE)) {
3995
"InnoDB: Index corruption: rec offs %lu"
3996
" next offs %lu, page no %lu,\n"
3998
(ulong) page_offset(rec),
4000
(ulong) page_get_page_no(page_align(rec)));
4001
dict_index_name_print(stderr, trx, index);
4002
fputs(". We try to skip the record.\n",
4009
/* Note that we cannot trust the up_match value in the cursor at this
4010
place because we can arrive here after moving the cursor! Thus
4011
we have to recompare rec and search_tuple to determine if they
4014
if (match_mode == ROW_SEL_EXACT) {
4015
/* Test if the index record matches completely to search_tuple
4016
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4018
/* fputs("Comparing rec and search tuple\n", stderr); */
4020
if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4022
if (set_also_gap_locks
4023
&& !(srv_locks_unsafe_for_binlog
4024
|| trx->isolation_level
4025
<= TRX_ISO_READ_COMMITTED)
4026
&& prebuilt->select_lock_type != LOCK_NONE) {
4028
/* Try to place a gap lock on the index
4029
record only if innodb_locks_unsafe_for_binlog
4030
option is not set or this session is not
4031
using a READ COMMITTED isolation level. */
4033
err = sel_set_rec_lock(
4034
btr_pcur_get_block(pcur),
4035
rec, index, offsets,
4036
prebuilt->select_lock_type, LOCK_GAP,
4040
case DB_SUCCESS_LOCKED_REC:
4044
goto lock_wait_or_error;
4048
btr_pcur_store_position(pcur, &mtr);
4050
err = DB_RECORD_NOT_FOUND;
4051
/* ut_print_name(stderr, index->name);
4052
fputs(" record not found 3\n", stderr); */
4057
} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4059
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4061
if (set_also_gap_locks
4062
&& !(srv_locks_unsafe_for_binlog
4063
|| trx->isolation_level
4064
<= TRX_ISO_READ_COMMITTED)
4065
&& prebuilt->select_lock_type != LOCK_NONE) {
4067
/* Try to place a gap lock on the index
4068
record only if innodb_locks_unsafe_for_binlog
4069
option is not set or this session is not
4070
using a READ COMMITTED isolation level. */
4072
err = sel_set_rec_lock(
4073
btr_pcur_get_block(pcur),
4074
rec, index, offsets,
4075
prebuilt->select_lock_type, LOCK_GAP,
4079
case DB_SUCCESS_LOCKED_REC:
4083
goto lock_wait_or_error;
4087
btr_pcur_store_position(pcur, &mtr);
4089
err = DB_RECORD_NOT_FOUND;
4090
/* ut_print_name(stderr, index->name);
4091
fputs(" record not found 4\n", stderr); */
4097
/* We are ready to look at a possible new index entry in the result
4098
set: the cursor is now placed on a user record */
4100
if (prebuilt->select_lock_type != LOCK_NONE) {
4101
/* Try to place a lock on the index record; note that delete
4102
marked records are a special case in a unique search. If there
4103
is a non-delete marked record, then it is enough to lock its
4104
existence with LOCK_REC_NOT_GAP. */
4106
/* If innodb_locks_unsafe_for_binlog option is used
4107
or this session is using a READ COMMITED isolation
4108
level we lock only the record, i.e., next-key locking is
4113
if (!set_also_gap_locks
4114
|| srv_locks_unsafe_for_binlog
4115
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED
4117
&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
4121
lock_type = LOCK_ORDINARY;
4124
/* If we are doing a 'greater or equal than a primary key
4125
value' search from a clustered index, and we find a record
4126
that has that exact primary key value, then there is no need
4127
to lock the gap before the record, because no insert in the
4128
gap can be in our search range. That is, no phantom row can
4131
An example: if col1 is the primary key, the search is WHERE
4132
col1 >= 100, and we find a record where col1 = 100, then no
4133
need to lock the gap before that record. */
4135
if (index == clust_index
4136
&& mode == PAGE_CUR_GE
4138
&& dtuple_get_n_fields_cmp(search_tuple)
4139
== dict_index_get_n_unique(index)
4140
&& 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4142
lock_type = LOCK_REC_NOT_GAP;
4145
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4146
rec, index, offsets,
4147
prebuilt->select_lock_type,
4151
const rec_t* old_vers;
4152
case DB_SUCCESS_LOCKED_REC:
4153
if (srv_locks_unsafe_for_binlog
4154
|| trx->isolation_level
4155
<= TRX_ISO_READ_COMMITTED) {
4156
/* Note that a record of
4157
prebuilt->index was locked. */
4158
prebuilt->new_rec_locks = 1;
4164
/* Never unlock rows that were part of a conflict. */
4165
prebuilt->new_rec_locks = 0;
4167
if (UNIV_LIKELY(prebuilt->row_read_type
4168
!= ROW_READ_TRY_SEMI_CONSISTENT)
4170
|| index != clust_index) {
4172
goto lock_wait_or_error;
4175
/* The following call returns 'offsets'
4176
associated with 'old_vers' */
4177
err = row_sel_build_committed_vers_for_mysql(
4178
clust_index, prebuilt, rec,
4179
&offsets, &heap, &old_vers, &mtr);
4182
case DB_SUCCESS_LOCKED_REC:
4187
goto lock_wait_or_error;
4190
mutex_enter(&kernel_mutex);
4191
if (trx->was_chosen_as_deadlock_victim) {
4192
mutex_exit(&kernel_mutex);
4195
goto lock_wait_or_error;
4197
if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4198
lock_cancel_waiting_and_release(
4201
mutex_exit(&kernel_mutex);
4203
/* The lock was granted while we were
4204
searching for the last committed version.
4205
Do a normal locking read. */
4207
offsets = rec_get_offsets(rec, index, offsets,
4213
mutex_exit(&kernel_mutex);
4215
if (old_vers == NULL) {
4216
/* The row was not yet committed */
4221
did_semi_consistent_read = TRUE;
4226
goto lock_wait_or_error;
4229
/* This is a non-locking consistent read: if necessary, fetch
4230
a previous version of the record */
4232
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4234
/* Do nothing: we let a non-locking SELECT read the
4235
latest version of the record */
4237
} else if (index == clust_index) {
4239
/* Fetch a previous version of the row if the current
4240
one is not visible in the snapshot; if we have a very
4241
high force recovery level set, we try to avoid crashes
4242
by skipping this lookup */
4244
if (UNIV_LIKELY(srv_force_recovery < 5)
4245
&& !lock_clust_rec_cons_read_sees(
4246
rec, index, offsets, trx->read_view)) {
4249
/* The following call returns 'offsets'
4250
associated with 'old_vers' */
4251
err = row_sel_build_prev_vers_for_mysql(
4252
trx->read_view, clust_index,
4253
prebuilt, rec, &offsets, &heap,
4257
case DB_SUCCESS_LOCKED_REC:
4261
goto lock_wait_or_error;
4264
if (old_vers == NULL) {
4265
/* The row did not exist yet in
4274
/* We are looking into a non-clustered index,
4275
and to get the right version of the record we
4276
have to look also into the clustered index: this
4277
is necessary, because we can only get the undo
4278
information via the clustered index record. */
4280
ut_ad(!dict_index_is_clust(index));
4281
if (!lock_sec_rec_cons_read_sees(
4282
rec, trx->read_view)) {
4283
goto requires_clust_rec;
4288
/* NOTE that at this point rec can be an old version of a clustered
4289
index record built for a consistent read. We cannot assume after this
4290
point that rec is on a buffer pool page. Functions like
4291
page_rec_is_comp() cannot be used! */
4293
if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4295
/* The record is delete-marked: we can skip it */
4297
if ((srv_locks_unsafe_for_binlog
4298
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4299
&& prebuilt->select_lock_type != LOCK_NONE
4300
&& !did_semi_consistent_read) {
4302
/* No need to keep a lock on a delete-marked record
4303
if we do not want to use next-key locking. */
4305
row_unlock_for_mysql(prebuilt, TRUE);
4308
/* This is an optimization to skip setting the next key lock
4309
on the record that follows this delete-marked record. This
4310
optimization works because of the unique search criteria
4311
which precludes the presence of a range lock between this
4312
delete marked record and the record following it.
4314
For now this is applicable only to clustered indexes while
4315
doing a unique search. There is scope for further optimization
4316
applicable to unique secondary indexes. Current behaviour is
4317
to widen the scope of a lock on an already delete marked record
4318
if the same record is deleted twice by the same transaction */
4319
if (index == clust_index && unique_search) {
4320
err = DB_RECORD_NOT_FOUND;
4328
/* Get the clustered index record if needed, if we did not do the
4329
search using the clustered index. */
4331
if (index != clust_index && prebuilt->need_to_access_clustered) {
4334
/* We use a 'goto' to the preceding label if a consistent
4335
read of a secondary index record requires us to look up old
4336
versions of the associated clustered index record. */
4338
ut_ad(rec_offs_validate(rec, index, offsets));
4340
/* It was a non-clustered index and we must fetch also the
4341
clustered index record */
4343
mtr_has_extra_clust_latch = TRUE;
4345
/* The following call returns 'offsets' associated with
4346
'clust_rec'. Note that 'clust_rec' can be an old version
4347
built for a consistent read. */
4349
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4351
&offsets, &heap, &mtr);
4354
if (clust_rec == NULL) {
4355
/* The record did not exist in the read view */
4356
ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4361
case DB_SUCCESS_LOCKED_REC:
4362
ut_a(clust_rec != NULL);
4363
if (srv_locks_unsafe_for_binlog
4364
|| trx->isolation_level
4365
<= TRX_ISO_READ_COMMITTED) {
4366
/* Note that the clustered index record
4368
prebuilt->new_rec_locks = 2;
4373
goto lock_wait_or_error;
4376
if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4378
/* The record is delete marked: we can skip it */
4380
if ((srv_locks_unsafe_for_binlog
4381
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4382
&& prebuilt->select_lock_type != LOCK_NONE) {
4384
/* No need to keep a lock on a delete-marked
4385
record if we do not want to use next-key
4388
row_unlock_for_mysql(prebuilt, TRUE);
4394
result_rec = clust_rec;
4395
ut_ad(rec_offs_validate(result_rec, clust_index, offsets));
4400
/* We found a qualifying record 'result_rec'. At this point,
4401
'offsets' are associated with 'result_rec'. */
4403
ut_ad(rec_offs_validate(result_rec,
4404
result_rec != rec ? clust_index : index,
4406
ut_ad(!rec_get_deleted_flag(result_rec, comp));
4408
/* At this point, the clustered index record is protected
4409
by a page latch that was acquired when pcur was positioned.
4410
The latch will not be released until mtr_commit(&mtr). */
4412
if ((match_mode == ROW_SEL_EXACT
4413
|| prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4414
&& prebuilt->select_lock_type == LOCK_NONE
4415
&& !prebuilt->templ_contains_blob
4416
&& !prebuilt->clust_index_was_generated
4417
&& prebuilt->template_type
4418
!= ROW_MYSQL_DUMMY_TEMPLATE) {
4420
/* Inside an update, for example, we do not cache rows,
4421
since we may use the cursor position to do the actual
4422
update, that is why we require ...lock_type == LOCK_NONE.
4423
Since we keep space in prebuilt only for the BLOBs of
4424
a single row, we cannot cache rows in the case there
4425
are BLOBs in the fields to be fetched. In HANDLER we do
4426
not cache rows because there the cursor is a scrollable
4429
if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4432
/* Only fresh inserts may contain incomplete
4433
externally stored columns. Pretend that such
4434
records do not exist. Such records may only be
4435
accessed at the READ UNCOMMITTED isolation
4436
level or when rolling back a recovered
4437
transaction. Rollback happens at a lower
4439
ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
4440
} else if (prebuilt->n_fetch_cached
4441
== MYSQL_FETCH_CACHE_SIZE) {
4449
(prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE)) {
4450
/* CHECK TABLE: fetch the row */
4452
if (result_rec != rec
4453
&& !prebuilt->need_to_access_clustered) {
4454
/* We used 'offsets' for the clust
4455
rec, recalculate them for 'rec' */
4456
offsets = rec_get_offsets(rec, index, offsets,
4462
memcpy(buf + 4, result_rec
4463
- rec_offs_extra_size(offsets),
4464
rec_offs_size(offsets));
4465
mach_write_to_4(buf,
4466
rec_offs_extra_size(offsets) + 4);
4468
/* Returning a row to MySQL */
4470
if (!row_sel_store_mysql_rec(buf, prebuilt, result_rec,
4473
/* Only fresh inserts may contain
4474
incomplete externally stored
4475
columns. Pretend that such records do
4476
not exist. Such records may only be
4477
accessed at the READ UNCOMMITTED
4478
isolation level or when rolling back a
4479
recovered transaction. Rollback
4480
happens at a lower level, not here. */
4481
ut_a(trx->isolation_level
4482
== TRX_ISO_READ_UNCOMMITTED);
4487
if (prebuilt->clust_index_was_generated) {
4488
if (result_rec != rec) {
4489
offsets = rec_get_offsets(
4490
rec, index, offsets, ULINT_UNDEFINED,
4493
row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4498
/* From this point on, 'offsets' are invalid. */
4501
/* We have an optimization to save CPU time: if this is a consistent
4502
read on a unique condition on the clustered index, then we do not
4503
store the pcur position, because any fetch next or prev will anyway
4504
return 'end of file'. Exceptions are locking reads and the MySQL
4505
HANDLER command where the user can move the cursor with PREV or NEXT
4506
even after a unique search. */
4508
if (!unique_search_from_clust_index
4509
|| prebuilt->select_lock_type != LOCK_NONE) {
4511
/* Inside an update always store the cursor position */
4513
btr_pcur_store_position(pcur, &mtr);
4521
/* Reset the old and new "did semi-consistent read" flags. */
4522
if (UNIV_UNLIKELY(prebuilt->row_read_type
4523
== ROW_READ_DID_SEMI_CONSISTENT)) {
4524
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4526
did_semi_consistent_read = FALSE;
4527
prebuilt->new_rec_locks = 0;
4529
/*-------------------------------------------------------------*/
4530
/* PHASE 5: Move the cursor to the next index record */
4532
if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4533
/* We must commit mtr if we are moving to the next
4534
non-clustered index record, because we could break the
4535
latching order if we would access a different clustered
4536
index page right away without releasing the previous. */
4538
btr_pcur_store_position(pcur, &mtr);
4541
mtr_has_extra_clust_latch = FALSE;
4544
if (sel_restore_position_for_mysql(&same_user_rec,
4546
pcur, moves_up, &mtr)) {
4547
#ifdef UNIV_SEARCH_DEBUG
4549
#endif /* UNIV_SEARCH_DEBUG */
4556
if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4558
btr_pcur_store_position(pcur, &mtr);
4560
if (match_mode != 0) {
4561
err = DB_RECORD_NOT_FOUND;
4563
err = DB_END_OF_INDEX;
4569
if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4574
#ifdef UNIV_SEARCH_DEBUG
4576
#endif /* UNIV_SEARCH_DEBUG */
4581
/* Reset the old and new "did semi-consistent read" flags. */
4582
if (UNIV_UNLIKELY(prebuilt->row_read_type
4583
== ROW_READ_DID_SEMI_CONSISTENT)) {
4584
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4586
did_semi_consistent_read = FALSE;
4588
/*-------------------------------------------------------------*/
4590
btr_pcur_store_position(pcur, &mtr);
4594
mtr_has_extra_clust_latch = FALSE;
4596
trx->error_state = err;
4598
/* The following is a patch for MySQL */
4600
que_thr_stop_for_mysql(thr);
4602
thr->lock_state = QUE_THR_LOCK_ROW;
4604
if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
4605
/* It was a lock wait, and it ended */
4607
thr->lock_state = QUE_THR_LOCK_NOLOCK;
4610
/* Table lock waited, go try to obtain table lock
4612
if (table_lock_waited) {
4613
table_lock_waited = FALSE;
4615
goto wait_table_again;
4618
sel_restore_position_for_mysql(&same_user_rec,
4619
BTR_SEARCH_LEAF, pcur,
4622
if ((srv_locks_unsafe_for_binlog
4623
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4624
&& !same_user_rec) {
4626
/* Since we were not able to restore the cursor
4627
on the same user record, we cannot use
4628
row_unlock_for_mysql() to unlock any records, and
4629
we must thus reset the new rec lock info. Since
4630
in lock0lock.c we have blocked the inheriting of gap
4631
X-locks, we actually do not have any new record locks
4634
Note that if we were able to restore on the 'same'
4635
user record, it is still possible that we were actually
4636
waiting on a delete-marked record, and meanwhile
4637
it was removed by purge and inserted again by some
4638
other user. But that is no problem, because in
4639
rec_loop we will again try to set a lock, and
4640
new_rec_lock_info in trx will be right at the end. */
4642
prebuilt->new_rec_locks = 0;
4645
mode = pcur->search_mode;
4650
thr->lock_state = QUE_THR_LOCK_NOLOCK;
4652
#ifdef UNIV_SEARCH_DEBUG
4653
/* fputs("Using ", stderr);
4654
dict_index_name_print(stderr, index);
4655
fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4656
#endif /* UNIV_SEARCH_DEBUG */
4660
/*-------------------------------------------------------------*/
4661
que_thr_stop_for_mysql_no_error(thr, trx);
4665
if (prebuilt->n_fetch_cached > 0) {
4666
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
4671
#ifdef UNIV_SEARCH_DEBUG
4672
/* fputs("Using ", stderr);
4673
dict_index_name_print(stderr, index);
4674
fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4675
#endif /* UNIV_SEARCH_DEBUG */
4676
if (err == DB_SUCCESS) {
4682
if (UNIV_LIKELY_NULL(heap)) {
4683
mem_heap_free(heap);
4686
/* Set or reset the "did semi-consistent read" flag on return.
4687
The flag did_semi_consistent_read is set if and only if
4688
the record being returned was fetched with a semi-consistent read. */
4689
ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
4690
|| !did_semi_consistent_read);
4692
if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
4693
if (UNIV_UNLIKELY(did_semi_consistent_read)) {
4694
prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
4696
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4702
/*******************************************************************//**
4703
Checks if MySQL at the moment is allowed for this table to retrieve a
4704
consistent read result, or store it to the query cache.
4705
@return TRUE if storing or retrieving from the query cache is permitted */
4708
row_search_check_if_query_cache_permitted(
4709
/*======================================*/
4710
trx_t* trx, /*!< in: transaction object */
4711
const char* norm_name) /*!< in: concatenation of database name,
4712
'/' char, table name */
4714
dict_table_t* table;
4717
table = dict_table_get(norm_name, FALSE);
4719
if (table == NULL) {
4724
mutex_enter(&kernel_mutex);
4726
/* Start the transaction if it is not started yet */
4728
trx_start_if_not_started_low(trx);
4730
/* If there are locks on the table or some trx has invalidated the
4731
cache up to our trx id, then ret = FALSE.
4732
We do not check what type locks there are on the table, though only
4733
IX type locks actually would require ret = FALSE. */
4735
if (UT_LIST_GET_LEN(table->locks) == 0
4736
&& trx->id >= table->query_cache_inv_trx_id) {
4740
/* If the isolation level is high, assign a read view for the
4741
transaction if it does not yet have one */
4743
if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
4744
&& !trx->read_view) {
4746
trx->read_view = read_view_open_now(
4747
trx->id, trx->global_read_view_heap);
4748
trx->global_read_view = trx->read_view;
4752
mutex_exit(&kernel_mutex);
4757
/*******************************************************************//**
4758
Read the AUTOINC column from the current row. If the value is less than
4759
0 and the type is not unsigned then we reset the value to 0.
4760
@return value read from the column */
4763
row_search_autoinc_read_column(
4764
/*===========================*/
4765
dict_index_t* index, /*!< in: index to read from */
4766
const rec_t* rec, /*!< in: current rec */
4767
ulint col_no, /*!< in: column number */
4768
ulint mtype, /*!< in: column main type */
4769
ibool unsigned_type) /*!< in: signed or unsigned flag */
4774
mem_heap_t* heap = NULL;
4775
ulint offsets_[REC_OFFS_NORMAL_SIZE];
4776
ulint* offsets = offsets_;
4778
rec_offs_init(offsets_);
4780
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4782
data = rec_get_nth_field(rec, offsets, col_no, &len);
4784
ut_a(len != UNIV_SQL_NULL);
4788
ut_a(len <= sizeof value);
4789
value = mach_read_int_type(data, len, unsigned_type);
4793
ut_a(len == sizeof(float));
4794
value = (ib_uint64_t) mach_float_read(data);
4798
ut_a(len == sizeof(double));
4799
value = (ib_uint64_t) mach_double_read(data);
4806
if (UNIV_LIKELY_NULL(heap)) {
4807
mem_heap_free(heap);
4810
if (!unsigned_type && (ib_int64_t) value < 0) {
4817
/*******************************************************************//**
4819
@return current rec or NULL */
4822
row_search_autoinc_get_rec(
4823
/*=======================*/
4824
btr_pcur_t* pcur, /*!< in: the current cursor */
4825
mtr_t* mtr) /*!< in: mini transaction */
4828
const rec_t* rec = btr_pcur_get_rec(pcur);
4830
if (page_rec_is_user_rec(rec)) {
4833
} while (btr_pcur_move_to_prev(pcur, mtr));
4838
/*******************************************************************//**
4839
Read the max AUTOINC value from an index.
4840
@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
4841
column name can't be found in index */
4844
row_search_max_autoinc(
4845
/*===================*/
4846
dict_index_t* index, /*!< in: index to search */
4847
const char* col_name, /*!< in: name of autoinc column */
4848
ib_uint64_t* value) /*!< out: AUTOINC value read */
4852
dict_field_t* dfield = NULL;
4853
ulint error = DB_SUCCESS;
4855
n_cols = dict_index_get_n_ordering_defined_by_user(index);
4857
/* Search the index for the AUTOINC column name */
4858
for (i = 0; i < n_cols; ++i) {
4859
dfield = dict_index_get_nth_field(index, i);
4861
if (strcmp(col_name, dfield->name) == 0) {
4868
/* Must find the AUTOINC column name */
4869
if (i < n_cols && dfield) {
4875
/* Open at the high/right end (FALSE), and INIT
4877
btr_pcur_open_at_index_side(
4878
FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
4880
if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
4883
rec = row_search_autoinc_get_rec(&pcur, &mtr);
4886
ibool unsigned_type = (
4887
dfield->col->prtype & DATA_UNSIGNED);
4889
*value = row_search_autoinc_read_column(
4891
dfield->col->mtype, unsigned_type);
4895
btr_pcur_close(&pcur);
4899
error = DB_RECORD_NOT_FOUND;