/*****************************************************************************

Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/***************************************************//**
Created 12/19/1997 Heikki Tuuri
*******************************************************/
39
#include "dict0dict.h"
40
#include "dict0boot.h"
46
#include "mach0data.h"
52
#include "lock0lock.h"
53
#include "eval0eval.h"
55
#include "pars0pars.h"
56
#include "row0mysql.h"
57
#include "read0read.h"
59
#include "ha_prototypes.h"
61
/* Maximum number of rows to prefetch; MySQL interface has another parameter */
62
#define SEL_MAX_N_PREFETCH 16
64
/* Number of rows fetched, after which to start prefetching; MySQL interface
65
has another parameter */
66
#define SEL_PREFETCH_LIMIT 1
68
/* When a select has accessed about this many pages, it returns control back
69
to que_run_threads: this is to allow canceling runaway queries */
71
#define SEL_COST_LIMIT 100
73
/* Flags for search shortcut */
75
#define SEL_EXHAUSTED 1
78
/* NOTE(review): this fragment is damaged by a lossy extraction. The bare
integer lines interleaved below are the original file's line numbers, not
code. Several original lines are missing from this function: the return-type
line (presumably `static ibool` -- TODO confirm against upstream), the
opening/closing braces, the local declaration of `len` (presumably
`ulint len;`), and a `return(FALSE);` that presumably followed the
crash-recovery comment inside the `len == 0` branch -- verify before use. */
/********************************************************************//**
79
Returns TRUE if the user-defined column in a secondary index record
80
is alphabetically the same as the corresponding BLOB column in the clustered
82
NOTE: the comparison is NOT done as a binary comparison, but character
83
fields are compared with collation!
84
@return TRUE if the columns are equal */
87
row_sel_sec_rec_is_for_blob(
88
/*========================*/
89
ulint mtype, /*!< in: main type */
90
ulint prtype, /*!< in: precise type */
91
ulint mbminlen, /*!< in: minimum length of a
92
multi-byte character */
93
ulint mbmaxlen, /*!< in: maximum length of a
94
multi-byte character */
95
const byte* clust_field, /*!< in: the locally stored part of
96
the clustered index column, including
97
the BLOB pointer; the clustered
98
index record must be covered by
99
a lock or a page latch to protect it
100
against deletion (rollback or purge) */
101
ulint clust_len, /*!< in: length of clust_field */
102
const byte* sec_field, /*!< in: column in secondary index */
103
ulint sec_len, /*!< in: length of sec_field */
104
ulint zip_size) /*!< in: compressed page size, or 0 */
107
byte buf[DICT_MAX_INDEX_COL_LEN];
109
/* Copy at most sizeof(buf) bytes of the externally stored BLOB prefix;
a zero result means the BLOB was already freed (see comment below).
NOTE(review): `len` is used without a visible declaration -- the
declaration line was lost in extraction. */
len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
111
clust_field, clust_len);
113
if (UNIV_UNLIKELY(len == 0)) {
114
/* The BLOB was being deleted as the server crashed.
115
There should not be any secondary index records
116
referring to this clustered index record, because
117
btr_free_externally_stored_field() is called after all
118
secondary index entries of the row have been purged. */
122
/* Truncate the copied prefix to the same number of characters that
the secondary index stores, so the collation-aware comparison below
compares like with like. */
len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
123
sec_len, len, (const char*) buf);
125
/* cmp_data_data() returns 0 on equality; invert to get TRUE/FALSE. */
return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
128
/********************************************************************//**
129
Returns TRUE if the user-defined column values in a secondary index record
130
are alphabetically the same as the corresponding columns in the clustered
132
NOTE: the comparison is NOT done as a binary comparison, but character
133
fields are compared with collation!
134
@return TRUE if the secondary record is equal to the corresponding
135
fields in the clustered record, when compared with collation */
138
row_sel_sec_rec_is_for_clust_rec(
139
/*=============================*/
140
const rec_t* sec_rec, /*!< in: secondary index record */
141
dict_index_t* sec_index, /*!< in: secondary index */
142
const rec_t* clust_rec, /*!< in: clustered index record;
143
must be protected by a lock or
144
a page latch against deletion
145
in rollback or purge */
146
dict_index_t* clust_index) /*!< in: clustered index */
148
const byte* sec_field;
150
const byte* clust_field;
153
mem_heap_t* heap = NULL;
154
ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
155
ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
156
ulint* clust_offs = clust_offsets_;
157
ulint* sec_offs = sec_offsets_;
158
ibool is_equal = TRUE;
160
rec_offs_init(clust_offsets_);
161
rec_offs_init(sec_offsets_);
163
if (rec_get_deleted_flag(clust_rec,
164
dict_table_is_comp(clust_index->table))) {
166
/* The clustered index record is delete-marked;
167
it is not visible in the read view. Besides,
168
if there are any externally stored columns,
169
some of them may have already been purged. */
173
clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
174
ULINT_UNDEFINED, &heap);
175
sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
176
ULINT_UNDEFINED, &heap);
178
n = dict_index_get_n_ordering_defined_by_user(sec_index);
180
for (i = 0; i < n; i++) {
181
const dict_field_t* ifield;
182
const dict_col_t* col;
187
ifield = dict_index_get_nth_field(sec_index, i);
188
col = dict_field_get_col(ifield);
189
clust_pos = dict_col_get_clust_pos(col, clust_index);
191
clust_field = rec_get_nth_field(
192
clust_rec, clust_offs, clust_pos, &clust_len);
193
sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
197
if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
199
if (rec_offs_nth_extern(clust_offs, clust_pos)) {
200
len -= BTR_EXTERN_FIELD_REF_SIZE;
203
len = dtype_get_at_most_n_mbchars(
204
col->prtype, col->mbminlen, col->mbmaxlen,
205
ifield->prefix_len, len, (char*) clust_field);
207
if (rec_offs_nth_extern(clust_offs, clust_pos)
209
if (!row_sel_sec_rec_is_for_blob(
210
col->mtype, col->prtype,
211
col->mbminlen, col->mbmaxlen,
212
clust_field, clust_len,
215
clust_index->table))) {
223
if (0 != cmp_data_data(col->mtype, col->prtype,
225
sec_field, sec_len)) {
233
if (UNIV_LIKELY_NULL(heap)) {
239
/*********************************************************************//**
240
Creates a select node struct.
241
@return own: select node struct */
246
mem_heap_t* heap) /*!< in: memory heap where created */
250
node = mem_heap_alloc(heap, sizeof(sel_node_t));
251
node->common.type = QUE_NODE_SELECT;
252
node->state = SEL_NODE_OPEN;
259
/*********************************************************************//**
260
Frees the memory private to a select node when a query graph is freed,
261
does not free the heap where the node was originally created. */
264
sel_node_free_private(
265
/*==================*/
266
sel_node_t* node) /*!< in: select node struct */
271
if (node->plans != NULL) {
272
for (i = 0; i < node->n_tables; i++) {
273
plan = sel_node_get_nth_plan(node, i);
275
btr_pcur_close(&(plan->pcur));
276
btr_pcur_close(&(plan->clust_pcur));
278
if (plan->old_vers_heap) {
279
mem_heap_free(plan->old_vers_heap);
285
/*********************************************************************//**
286
Evaluates the values in a select list. If there are aggregate functions,
287
their argument value is added to the aggregate total. */
290
sel_eval_select_list(
291
/*=================*/
292
sel_node_t* node) /*!< in: select node */
296
exp = node->select_list;
301
exp = que_node_get_next(exp);
305
/*********************************************************************//**
306
Assigns the values in the select list to the possible into-variables in
307
SELECT ... INTO ... */
310
sel_assign_into_var_values(
311
/*=======================*/
312
sym_node_t* var, /*!< in: first variable in a list of variables */
313
sel_node_t* node) /*!< in: select node */
322
exp = node->select_list;
327
eval_node_copy_val(var->alias, exp);
329
exp = que_node_get_next(exp);
330
var = que_node_get_next(var);
334
/*********************************************************************//**
335
Resets the aggregate value totals in the select list of an aggregate type
339
sel_reset_aggregate_vals(
340
/*=====================*/
341
sel_node_t* node) /*!< in: select node */
343
func_node_t* func_node;
345
ut_ad(node->is_aggregate);
347
func_node = node->select_list;
350
eval_node_set_int_val(func_node, 0);
352
func_node = que_node_get_next(func_node);
355
node->aggregate_already_fetched = FALSE;
358
/*********************************************************************//**
359
Copies the input variable values when an explicit cursor is opened. */
362
row_sel_copy_input_variable_vals(
363
/*=============================*/
364
sel_node_t* node) /*!< in: select node */
368
var = UT_LIST_GET_FIRST(node->copy_variables);
371
eval_node_copy_val(var, var->alias);
373
var->indirection = NULL;
375
var = UT_LIST_GET_NEXT(col_var_list, var);
379
/*********************************************************************//**
380
Fetches the column values from a record. */
383
row_sel_fetch_columns(
384
/*==================*/
385
dict_index_t* index, /*!< in: record index */
386
const rec_t* rec, /*!< in: record in a clustered or non-clustered
387
index; must be protected by a page latch */
388
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
389
sym_node_t* column) /*!< in: first column in a column list, or
398
ut_ad(rec_offs_validate(rec, index, offsets));
400
if (dict_index_is_clust(index)) {
401
index_type = SYM_CLUST_FIELD_NO;
403
index_type = SYM_SEC_FIELD_NO;
407
mem_heap_t* heap = NULL;
410
field_no = column->field_nos[index_type];
412
if (field_no != ULINT_UNDEFINED) {
414
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
417
/* Copy an externally stored field to the
420
heap = mem_heap_create(1);
422
data = btr_rec_copy_externally_stored_field(
424
dict_table_zip_size(index->table),
425
field_no, &len, heap);
427
ut_a(len != UNIV_SQL_NULL);
431
data = rec_get_nth_field(rec, offsets,
434
if (len == UNIV_SQL_NULL) {
438
needs_copy = column->copy_val;
442
eval_node_copy_and_alloc_val(column, data,
445
val = que_node_get_val(column);
446
dfield_set_data(val, data, len);
449
if (UNIV_LIKELY_NULL(heap)) {
454
column = UT_LIST_GET_NEXT(col_var_list, column);
458
/* NOTE(review): fragment damaged by lossy extraction -- the interleaved bare
integers are original line numbers. Missing from view: the return-type line
(presumably `static void` -- TODO confirm), the opening/closing braces, and
the local declarations (presumably `sel_buf_t* sel_buf; ulint i;`). There
also appears to be a lost assignment initializing `sel_buf->len` between the
data and val_buf_size assignments -- verify against upstream. */
/*********************************************************************//**
459
Allocates a prefetch buffer for a column when prefetch is first time done. */
462
sel_col_prefetch_buf_alloc(
463
/*=======================*/
464
sym_node_t* column) /*!< in: symbol table node for a column */
469
ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
471
/* One sel_buf_t slot per prefetchable row (SEL_MAX_N_PREFETCH = 16). */
column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
472
* sizeof(sel_buf_t));
473
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
474
sel_buf = column->prefetch_buf + i;
476
/* Mark each slot empty; data pointers are filled lazily when
rows are actually pushed into the prefetch cache. */
sel_buf->data = NULL;
478
sel_buf->val_buf_size = 0;
482
/* NOTE(review): fragment damaged by lossy extraction -- the interleaved bare
integers are original line numbers. Missing from view: the return-type line
(presumably `static void`), braces, local declarations (presumably
`sel_buf_t* sel_buf; ulint i;`), and the final mem_free() of the
prefetch_buf array itself that presumably followed the loop -- verify
against upstream before relying on this text. */
/*********************************************************************//**
483
Frees a prefetch buffer for a column, including the dynamically allocated
484
memory for data stored there. */
487
sel_col_prefetch_buf_free(
488
/*======================*/
489
sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
494
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
495
sel_buf = prefetch_buf + i;
497
/* val_buf_size > 0 means this slot owns a heap-allocated value
buffer (see the swap logic in the push/pop routines). */
if (sel_buf->val_buf_size > 0) {
499
mem_free(sel_buf->data);
504
/*********************************************************************//**
505
Pops the column values for a prefetched, cached row from the column prefetch
506
buffers and places them to the val fields in the column nodes. */
509
sel_pop_prefetched_row(
510
/*===================*/
511
plan_t* plan) /*!< in: plan node for a table */
520
ut_ad(plan->n_rows_prefetched > 0);
522
column = UT_LIST_GET_FIRST(plan->columns);
525
val = que_node_get_val(column);
527
if (!column->copy_val) {
528
/* We did not really push any value for the
531
ut_ad(!column->prefetch_buf);
532
ut_ad(que_node_get_val_buf_size(column) == 0);
533
ut_d(dfield_set_null(val));
538
ut_ad(column->prefetch_buf);
539
ut_ad(!dfield_is_ext(val));
541
sel_buf = column->prefetch_buf + plan->first_prefetched;
543
data = sel_buf->data;
545
val_buf_size = sel_buf->val_buf_size;
547
/* We must keep track of the allocated memory for
548
column values to be able to free it later: therefore
549
we swap the values for sel_buf and val */
551
sel_buf->data = dfield_get_data(val);
552
sel_buf->len = dfield_get_len(val);
553
sel_buf->val_buf_size = que_node_get_val_buf_size(column);
555
dfield_set_data(val, data, len);
556
que_node_set_val_buf_size(column, val_buf_size);
558
column = UT_LIST_GET_NEXT(col_var_list, column);
561
plan->n_rows_prefetched--;
563
plan->first_prefetched++;
566
/*********************************************************************//**
567
Pushes the column values for a prefetched, cached row to the column prefetch
568
buffers from the val fields in the column nodes. */
571
sel_push_prefetched_row(
572
/*====================*/
573
plan_t* plan) /*!< in: plan node for a table */
583
if (plan->n_rows_prefetched == 0) {
585
plan->first_prefetched = 0;
587
pos = plan->n_rows_prefetched;
589
/* We have the convention that pushing new rows starts only
590
after the prefetch stack has been emptied: */
592
ut_ad(plan->first_prefetched == 0);
595
plan->n_rows_prefetched++;
597
ut_ad(pos < SEL_MAX_N_PREFETCH);
599
column = UT_LIST_GET_FIRST(plan->columns);
602
if (!column->copy_val) {
603
/* There is no sense to push pointers to database
604
page fields when we do not keep latch on the page! */
609
if (!column->prefetch_buf) {
610
/* Allocate a new prefetch buffer */
612
sel_col_prefetch_buf_alloc(column);
615
sel_buf = column->prefetch_buf + pos;
617
val = que_node_get_val(column);
619
data = dfield_get_data(val);
620
len = dfield_get_len(val);
621
val_buf_size = que_node_get_val_buf_size(column);
623
/* We must keep track of the allocated memory for
624
column values to be able to free it later: therefore
625
we swap the values for sel_buf and val */
627
dfield_set_data(val, sel_buf->data, sel_buf->len);
628
que_node_set_val_buf_size(column, sel_buf->val_buf_size);
630
sel_buf->data = data;
632
sel_buf->val_buf_size = val_buf_size;
634
column = UT_LIST_GET_NEXT(col_var_list, column);
638
/*********************************************************************//**
639
Builds a previous version of a clustered index record for a consistent read
640
@return DB_SUCCESS or error code */
643
row_sel_build_prev_vers(
644
/*====================*/
645
read_view_t* read_view, /*!< in: read view */
646
dict_index_t* index, /*!< in: plan node for table */
647
rec_t* rec, /*!< in: record in a clustered index */
648
ulint** offsets, /*!< in/out: offsets returned by
649
rec_get_offsets(rec, plan->index) */
650
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
651
the offsets are allocated */
652
mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
653
rec_t** old_vers, /*!< out: old version, or NULL if the
654
record does not exist in the view:
655
i.e., it was freshly inserted
657
mtr_t* mtr) /*!< in: mtr */
661
if (*old_vers_heap) {
662
mem_heap_empty(*old_vers_heap);
664
*old_vers_heap = mem_heap_create(512);
667
err = row_vers_build_for_consistent_read(
668
rec, mtr, index, offsets, read_view, offset_heap,
669
*old_vers_heap, old_vers);
673
/*********************************************************************//**
674
Builds the last committed version of a clustered index record for a
675
semi-consistent read.
676
@return DB_SUCCESS or error code */
679
row_sel_build_committed_vers_for_mysql(
680
/*===================================*/
681
dict_index_t* clust_index, /*!< in: clustered index */
682
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
683
const rec_t* rec, /*!< in: record in a clustered index */
684
ulint** offsets, /*!< in/out: offsets returned by
685
rec_get_offsets(rec, clust_index) */
686
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
687
the offsets are allocated */
688
const rec_t** old_vers, /*!< out: old version, or NULL if the
689
record does not exist in the view:
690
i.e., it was freshly inserted
692
mtr_t* mtr) /*!< in: mtr */
696
if (prebuilt->old_vers_heap) {
697
mem_heap_empty(prebuilt->old_vers_heap);
699
prebuilt->old_vers_heap = mem_heap_create(200);
702
err = row_vers_build_for_semi_consistent_read(
703
rec, mtr, clust_index, offsets, offset_heap,
704
prebuilt->old_vers_heap, old_vers);
708
/*********************************************************************//**
709
Tests the conditions which determine when the index segment we are searching
710
through has been exhausted.
711
@return TRUE if row passed the tests */
714
row_sel_test_end_conds(
715
/*===================*/
716
plan_t* plan) /*!< in: plan for the table; the column values must
717
already have been retrieved and the right sides of
718
comparisons evaluated */
722
/* All conditions in end_conds are comparisons of a column to an
725
cond = UT_LIST_GET_FIRST(plan->end_conds);
728
/* Evaluate the left side of the comparison, i.e., get the
729
column value if there is an indirection */
731
eval_sym(cond->args);
733
/* Do the comparison */
735
if (!eval_cmp(cond)) {
740
cond = UT_LIST_GET_NEXT(cond_list, cond);
746
/*********************************************************************//**
747
Tests the other conditions.
748
@return TRUE if row passed the tests */
751
row_sel_test_other_conds(
752
/*=====================*/
753
plan_t* plan) /*!< in: plan for the table; the column values must
754
already have been retrieved */
758
cond = UT_LIST_GET_FIRST(plan->other_conds);
763
if (!eval_node_get_ibool_val(cond)) {
768
cond = UT_LIST_GET_NEXT(cond_list, cond);
774
/*********************************************************************//**
775
Retrieves the clustered index record corresponding to a record in a
776
non-clustered index. Does the necessary locking.
777
@return DB_SUCCESS or error code */
780
row_sel_get_clust_rec(
781
/*==================*/
782
sel_node_t* node, /*!< in: select_node */
783
plan_t* plan, /*!< in: plan node for table */
784
rec_t* rec, /*!< in: record in a non-clustered index */
785
que_thr_t* thr, /*!< in: query thread */
786
rec_t** out_rec,/*!< out: clustered record or an old version of
787
it, NULL if the old version did not exist
788
in the read view, i.e., it was a fresh
790
mtr_t* mtr) /*!< in: mtr used to get access to the
791
non-clustered record; the same mtr is used to
792
access the clustered index */
798
mem_heap_t* heap = NULL;
799
ulint offsets_[REC_OFFS_NORMAL_SIZE];
800
ulint* offsets = offsets_;
801
rec_offs_init(offsets_);
805
offsets = rec_get_offsets(rec,
806
btr_pcur_get_btr_cur(&plan->pcur)->index,
807
offsets, ULINT_UNDEFINED, &heap);
809
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
811
index = dict_table_get_first_index(plan->table);
813
btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
814
BTR_SEARCH_LEAF, &plan->clust_pcur,
817
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
819
/* Note: only if the search ends up on a non-infimum record is the
820
low_match value the real match to the search tuple */
822
if (!page_rec_is_user_rec(clust_rec)
823
|| btr_pcur_get_low_match(&(plan->clust_pcur))
824
< dict_index_get_n_unique(index)) {
826
ut_a(rec_get_deleted_flag(rec,
827
dict_table_is_comp(plan->table)));
828
ut_a(node->read_view);
830
/* In a rare case it is possible that no clust rec is found
831
for a delete-marked secondary index record: if in row0umod.c
832
in row_undo_mod_remove_clust_low() we have already removed
833
the clust rec, while purge is still cleaning and removing
834
secondary index records associated with earlier versions of
835
the clustered index record. In that case we know that the
836
clustered index record did not exist in the read view of
842
offsets = rec_get_offsets(clust_rec, index, offsets,
843
ULINT_UNDEFINED, &heap);
845
if (!node->read_view) {
846
/* Try to place a lock on the index record */
848
/* If innodb_locks_unsafe_for_binlog option is used
849
or this session is using READ COMMITTED isolation level
850
we lock only the record, i.e., next-key locking is
855
trx = thr_get_trx(thr);
857
if (srv_locks_unsafe_for_binlog
858
|| trx->isolation_level == TRX_ISO_READ_COMMITTED) {
859
lock_type = LOCK_REC_NOT_GAP;
861
lock_type = LOCK_ORDINARY;
864
err = lock_clust_rec_read_check_and_lock(
865
0, btr_pcur_get_block(&plan->clust_pcur),
866
clust_rec, index, offsets,
867
node->row_lock_mode, lock_type, thr);
869
if (err != DB_SUCCESS) {
874
/* This is a non-locking consistent read: if necessary, fetch
875
a previous version of the record */
879
if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
882
err = row_sel_build_prev_vers(
883
node->read_view, index, clust_rec,
884
&offsets, &heap, &plan->old_vers_heap,
887
if (err != DB_SUCCESS) {
892
clust_rec = old_vers;
894
if (clust_rec == NULL) {
899
/* If we had to go to an earlier version of row or the
900
secondary index record is delete marked, then it may be that
901
the secondary index record corresponding to clust_rec
902
(or old_vers) is not rec; in that case we must ignore
903
such row because in our snapshot rec would not have existed.
904
Remember that from rec we cannot see directly which transaction
905
id corresponds to it: we have to go to the clustered index
906
record. A query where we want to fetch all rows where
907
the secondary index value is in some interval would return
908
a wrong result if we would not drop rows which we come to
909
visit through secondary index records that would not really
910
exist in our snapshot. */
913
|| rec_get_deleted_flag(rec, dict_table_is_comp(
915
&& !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
921
/* Fetch the columns needed in test conditions. The clustered
922
index record is protected by a page latch that was acquired
923
when plan->clust_pcur was positioned. The latch will not be
924
released until mtr_commit(mtr). */
926
row_sel_fetch_columns(index, clust_rec, offsets,
927
UT_LIST_GET_FIRST(plan->columns));
928
*out_rec = clust_rec;
932
if (UNIV_LIKELY_NULL(heap)) {
938
/*********************************************************************//**
939
Sets a lock on a record.
940
@return DB_SUCCESS or error code */
945
const buf_block_t* block, /*!< in: buffer block of rec */
946
const rec_t* rec, /*!< in: record */
947
dict_index_t* index, /*!< in: index */
948
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
949
ulint mode, /*!< in: lock mode */
950
ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
952
que_thr_t* thr) /*!< in: query thread */
957
trx = thr_get_trx(thr);
959
if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
960
if (buf_LRU_buf_pool_running_out()) {
962
return(DB_LOCK_TABLE_FULL);
966
if (dict_index_is_clust(index)) {
967
err = lock_clust_rec_read_check_and_lock(
968
0, block, rec, index, offsets, mode, type, thr);
970
err = lock_sec_rec_read_check_and_lock(
971
0, block, rec, index, offsets, mode, type, thr);
977
/*********************************************************************//**
978
Opens a pcur to a table index. */
983
plan_t* plan, /*!< in: table plan */
984
ibool search_latch_locked,
985
/*!< in: TRUE if the thread currently
986
has the search latch locked in
988
mtr_t* mtr) /*!< in: mtr */
994
ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
997
if (search_latch_locked) {
998
has_search_latch = RW_S_LATCH;
1001
index = plan->index;
1003
/* Calculate the value of the search tuple: the exact match columns
1004
get their expressions evaluated when we evaluate the right sides of
1007
cond = UT_LIST_GET_FIRST(plan->end_conds);
1010
eval_exp(que_node_get_next(cond->args));
1012
cond = UT_LIST_GET_NEXT(cond_list, cond);
1016
n_fields = dtuple_get_n_fields(plan->tuple);
1018
if (plan->n_exact_match < n_fields) {
1019
/* There is a non-exact match field which must be
1020
evaluated separately */
1022
eval_exp(plan->tuple_exps[n_fields - 1]);
1025
for (i = 0; i < n_fields; i++) {
1026
exp = plan->tuple_exps[i];
1028
dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1029
que_node_get_val(exp));
1032
/* Open pcur to the index */
1034
btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1035
BTR_SEARCH_LEAF, &plan->pcur,
1036
has_search_latch, mtr);
1038
/* Open the cursor to the start or the end of the index
1041
btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
1042
&(plan->pcur), FALSE, mtr);
1045
ut_ad(plan->n_rows_prefetched == 0);
1046
ut_ad(plan->n_rows_fetched == 0);
1047
ut_ad(plan->cursor_at_end == FALSE);
1049
plan->pcur_is_open = TRUE;
1052
/*********************************************************************//**
1053
Restores a stored pcur position to a table index.
1054
@return TRUE if the cursor should be moved to the next record after we
1055
return from this function (moved to the previous, in the case of a
1056
descending cursor) without processing again the current cursor
1060
row_sel_restore_pcur_pos(
1061
/*=====================*/
1062
plan_t* plan, /*!< in: table plan */
1063
mtr_t* mtr) /*!< in: mtr */
1065
ibool equal_position;
1066
ulint relative_position;
1068
ut_ad(!plan->cursor_at_end);
1070
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1072
equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
1073
&(plan->pcur), mtr);
1075
/* If the cursor is traveling upwards, and relative_position is
1077
(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1078
yet on the successor of the page infimum;
1079
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1080
first record GREATER than the predecessor of a page supremum; we have
1081
not yet processed the cursor record: no need to move the cursor to the
1083
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1084
last record LESS or EQUAL to the old stored user record; (a) if
1085
equal_position is FALSE, this means that the cursor is now on a record
1086
less than the old user record, and we must move to the next record;
1087
(b) if equal_position is TRUE, then if
1088
plan->stored_cursor_rec_processed is TRUE, we must move to the next
1089
record, else there is no need to move the cursor. */
1092
if (relative_position == BTR_PCUR_ON) {
1094
if (equal_position) {
1096
return(plan->stored_cursor_rec_processed);
1102
ut_ad(relative_position == BTR_PCUR_AFTER
1103
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1108
/* If the cursor is traveling downwards, and relative_position is
1110
(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1111
the last record LESS than the successor of a page infimum; we have not
1112
processed the cursor record: no need to move the cursor;
1113
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1114
first record GREATER than the predecessor of a page supremum; we have
1115
processed the cursor record: we should move the cursor to the previous
1117
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1118
last record LESS or EQUAL to the old stored user record; (a) if
1119
equal_position is FALSE, this means that the cursor is now on a record
1120
less than the old user record, and we need not move to the previous
1121
record; (b) if equal_position is TRUE, then if
1122
plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1123
record, else there is no need to move the cursor. */
1125
if (relative_position == BTR_PCUR_BEFORE
1126
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1131
if (relative_position == BTR_PCUR_ON) {
1133
if (equal_position) {
1135
return(plan->stored_cursor_rec_processed);
1141
ut_ad(relative_position == BTR_PCUR_AFTER
1142
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1147
/*********************************************************************//**
1148
Resets a plan cursor to a closed state. */
1153
plan_t* plan) /*!< in: plan */
1155
plan->pcur_is_open = FALSE;
1156
plan->cursor_at_end = FALSE;
1157
plan->n_rows_fetched = 0;
1158
plan->n_rows_prefetched = 0;
1161
/*********************************************************************//**
1162
Tries to do a shortcut to fetch a clustered index record with a unique key,
1163
using the hash index if possible (not always).
1164
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1167
row_sel_try_search_shortcut(
1168
/*========================*/
1169
sel_node_t* node, /*!< in: select node for a consistent read */
1170
plan_t* plan, /*!< in: plan for a unique search in clustered
1172
mtr_t* mtr) /*!< in: mtr */
1174
dict_index_t* index;
1176
mem_heap_t* heap = NULL;
1177
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1178
ulint* offsets = offsets_;
1180
rec_offs_init(offsets_);
1182
index = plan->index;
1184
ut_ad(node->read_view);
1185
ut_ad(plan->unique_search);
1186
ut_ad(!plan->must_get_clust);
1187
#ifdef UNIV_SYNC_DEBUG
1188
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1189
#endif /* UNIV_SYNC_DEBUG */
1191
row_sel_open_pcur(plan, TRUE, mtr);
1193
rec = btr_pcur_get_rec(&(plan->pcur));
1195
if (!page_rec_is_user_rec(rec)) {
1200
ut_ad(plan->mode == PAGE_CUR_GE);
1202
/* As the cursor is now placed on a user record after a search with
1203
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1204
fields in the user record matched to the search tuple */
1206
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1208
return(SEL_EXHAUSTED);
1211
/* This is a non-locking consistent read: if necessary, fetch
1212
a previous version of the record */
1214
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1216
if (dict_index_is_clust(index)) {
1217
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1222
} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1228
/* Test the deleted flag. */
1230
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1232
ret = SEL_EXHAUSTED;
1236
/* Fetch the columns needed in test conditions. The index
1237
record is protected by a page latch that was acquired when
1238
plan->pcur was positioned. The latch will not be released
1239
until mtr_commit(mtr). */
1241
row_sel_fetch_columns(index, rec, offsets,
1242
UT_LIST_GET_FIRST(plan->columns));
1244
/* Test the rest of search conditions */
1246
if (!row_sel_test_other_conds(plan)) {
1248
ret = SEL_EXHAUSTED;
1252
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1254
plan->n_rows_fetched++;
1257
if (UNIV_LIKELY_NULL(heap)) {
1258
mem_heap_free(heap);
1263
/*********************************************************************//**
1264
Performs a select step.
1265
@return DB_SUCCESS or error code */
1270
sel_node_t* node, /*!< in: select node */
1271
que_thr_t* thr) /*!< in: query thread */
1273
dict_index_t* index;
1280
ibool search_latch_locked;
1281
ibool consistent_read;
1283
/* The following flag becomes TRUE when we are doing a
1284
consistent read from a non-clustered index and we must look
1285
at the clustered index to find out the previous delete mark
1286
state of the non-clustered record: */
1288
ibool cons_read_requires_clust_rec = FALSE;
1289
ulint cost_counter = 0;
1290
ibool cursor_just_opened;
1291
ibool must_go_to_next;
1292
ibool mtr_has_extra_clust_latch = FALSE;
1293
/* TRUE if the search was made using
1294
a non-clustered index, and we had to
1295
access the clustered record: now &mtr
1296
contains a clustered index latch, and
1297
&mtr must be committed before we move
1298
to the next non-clustered record */
1301
mem_heap_t* heap = NULL;
1302
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1303
ulint* offsets = offsets_;
1304
rec_offs_init(offsets_);
1306
ut_ad(thr->run_node == node);
1308
search_latch_locked = FALSE;
1310
if (node->read_view) {
1311
/* In consistent reads, we try to do with the hash index and
1312
not to use the buffer page get. This is to reduce memory bus
1313
load resulting from semaphore operations. The search latch
1314
will be s-locked when we access an index with a unique search
1315
condition, but not locked when we access an index with a
1316
less selective search condition. */
1318
consistent_read = TRUE;
1320
consistent_read = FALSE;
1326
This is the outer major loop in calculating a join. We come here when
1327
node->fetch_table changes, and after adding a row to aggregate totals
1328
and, of course, when this function is called. */
1330
ut_ad(mtr_has_extra_clust_latch == FALSE);
1332
plan = sel_node_get_nth_plan(node, node->fetch_table);
1333
index = plan->index;
1335
if (plan->n_rows_prefetched > 0) {
1336
sel_pop_prefetched_row(plan);
1338
goto next_table_no_mtr;
1341
if (plan->cursor_at_end) {
1342
/* The cursor has already reached the result set end: no more
1343
rows to process for this table cursor, as also the prefetch
1346
ut_ad(plan->pcur_is_open);
1348
goto table_exhausted_no_mtr;
1351
/* Open a cursor to index, or restore an open cursor position */
1355
if (consistent_read && plan->unique_search && !plan->pcur_is_open
1356
&& !plan->must_get_clust
1357
&& !plan->table->big_rows) {
1358
if (!search_latch_locked) {
1359
rw_lock_s_lock(&btr_search_latch);
1361
search_latch_locked = TRUE;
1362
} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
1364
/* There is an x-latch request waiting: release the
1365
s-latch for a moment; as an s-latch here is often
1366
kept for some 10 searches before being released,
1367
a waiting x-latch request would block other threads
1368
from acquiring an s-latch for a long time, lowering
1369
performance significantly in multiprocessors. */
1371
rw_lock_s_unlock(&btr_search_latch);
1372
rw_lock_s_lock(&btr_search_latch);
1375
found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
1377
if (found_flag == SEL_FOUND) {
1381
} else if (found_flag == SEL_EXHAUSTED) {
1383
goto table_exhausted;
1386
ut_ad(found_flag == SEL_RETRY);
1388
plan_reset_cursor(plan);
1394
if (search_latch_locked) {
1395
rw_lock_s_unlock(&btr_search_latch);
1397
search_latch_locked = FALSE;
1400
if (!plan->pcur_is_open) {
1401
/* Evaluate the expressions to build the search tuple and
1404
row_sel_open_pcur(plan, search_latch_locked, &mtr);
1406
cursor_just_opened = TRUE;
1408
/* A new search was made: increment the cost counter */
1411
/* Restore pcur position to the index */
1413
must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
1415
cursor_just_opened = FALSE;
1417
if (must_go_to_next) {
1418
/* We have already processed the cursor record: move
1428
In this loop we use pcur and try to fetch a qualifying row, and
1429
also fill the prefetch buffer for this table if n_rows_fetched has
1430
exceeded a threshold. While we are inside this loop, the following
1432
(1) &mtr is started,
1433
(2) pcur is positioned and open.
1435
NOTE that if cursor_just_opened is TRUE here, it means that we came
1436
to this point right after row_sel_open_pcur. */
1438
ut_ad(mtr_has_extra_clust_latch == FALSE);
1440
rec = btr_pcur_get_rec(&(plan->pcur));
1442
/* PHASE 1: Set a lock if specified */
1444
if (!node->asc && cursor_just_opened
1445
&& !page_rec_is_supremum(rec)) {
1447
/* When we open a cursor for a descending search, we must set
1448
a next-key lock on the successor record: otherwise it would
1449
be possible to insert new records next to the cursor position,
1450
and it might be that these new records should appear in the
1451
search result set, resulting in the phantom problem. */
1453
if (!consistent_read) {
1455
/* If innodb_locks_unsafe_for_binlog option is used
1456
or this session is using READ COMMITTED isolation
1457
level, we lock only the record, i.e., next-key
1458
locking is not used. */
1460
rec_t* next_rec = page_rec_get_next(rec);
1464
trx = thr_get_trx(thr);
1466
offsets = rec_get_offsets(next_rec, index, offsets,
1467
ULINT_UNDEFINED, &heap);
1469
if (srv_locks_unsafe_for_binlog
1470
|| trx->isolation_level
1471
== TRX_ISO_READ_COMMITTED) {
1473
if (page_rec_is_supremum(next_rec)) {
1478
lock_type = LOCK_REC_NOT_GAP;
1480
lock_type = LOCK_ORDINARY;
1483
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1484
next_rec, index, offsets,
1485
node->row_lock_mode,
1488
if (err != DB_SUCCESS) {
1489
/* Note that in this case we will store in pcur
1490
the PREDECESSOR of the record we are waiting
1493
goto lock_wait_or_error;
1499
if (page_rec_is_infimum(rec)) {
1501
/* The infimum record on a page cannot be in the result set,
1502
and neither can a record lock be placed on it: we skip such
1503
a record. We also increment the cost counter as we may have
1504
processed yet another page of index. */
1511
if (!consistent_read) {
1512
/* Try to place a lock on the index record */
1514
/* If innodb_locks_unsafe_for_binlog option is used
1515
or this session is using READ COMMITTED isolation level,
1516
we lock only the record, i.e., next-key locking is
1522
offsets = rec_get_offsets(rec, index, offsets,
1523
ULINT_UNDEFINED, &heap);
1525
trx = thr_get_trx(thr);
1527
if (srv_locks_unsafe_for_binlog
1528
|| trx->isolation_level == TRX_ISO_READ_COMMITTED) {
1530
if (page_rec_is_supremum(rec)) {
1535
lock_type = LOCK_REC_NOT_GAP;
1537
lock_type = LOCK_ORDINARY;
1540
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1541
rec, index, offsets,
1542
node->row_lock_mode, lock_type, thr);
1544
if (err != DB_SUCCESS) {
1546
goto lock_wait_or_error;
1550
if (page_rec_is_supremum(rec)) {
1552
/* A page supremum record cannot be in the result set: skip
1553
it now when we have placed a possible lock on it */
1558
ut_ad(page_rec_is_user_rec(rec));
1560
if (cost_counter > SEL_COST_LIMIT) {
1562
/* Now that we have placed the necessary locks, we can stop
1563
for a while and store the cursor position; NOTE that if we
1564
would store the cursor position BEFORE placing a record lock,
1565
it might happen that the cursor would jump over some records
1566
that another transaction could meanwhile insert adjacent to
1567
the cursor: this would result in the phantom problem. */
1569
goto stop_for_a_while;
1572
/* PHASE 2: Check a mixed index mix id if needed */
1574
if (plan->unique_search && cursor_just_opened) {
1576
ut_ad(plan->mode == PAGE_CUR_GE);
1578
/* As the cursor is now placed on a user record after a search
1579
with the mode PAGE_CUR_GE, the up_match field in the cursor
1580
tells how many fields in the user record matched to the search
1583
if (btr_pcur_get_up_match(&(plan->pcur))
1584
< plan->n_exact_match) {
1585
goto table_exhausted;
1588
/* Ok, no need to test end_conds or mix id */
1592
/* We are ready to look at a possible new index entry in the result
1593
set: the cursor is now placed on a user record */
1595
/* PHASE 3: Get previous version in a consistent read */
1597
cons_read_requires_clust_rec = FALSE;
1598
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1600
if (consistent_read) {
1601
/* This is a non-locking consistent read: if necessary, fetch
1602
a previous version of the record */
1604
if (dict_index_is_clust(index)) {
1606
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1609
err = row_sel_build_prev_vers(
1610
node->read_view, index, rec,
1611
&offsets, &heap, &plan->old_vers_heap,
1614
if (err != DB_SUCCESS) {
1616
goto lock_wait_or_error;
1619
if (old_vers == NULL) {
1620
offsets = rec_get_offsets(
1621
rec, index, offsets,
1622
ULINT_UNDEFINED, &heap);
1624
/* Fetch the columns needed in
1625
test conditions. The clustered
1626
index record is protected by a
1627
page latch that was acquired
1628
by row_sel_open_pcur() or
1629
row_sel_restore_pcur_pos().
1630
The latch will not be released
1631
until mtr_commit(mtr). */
1633
row_sel_fetch_columns(
1634
index, rec, offsets,
1638
if (!row_sel_test_end_conds(plan)) {
1640
goto table_exhausted;
1648
} else if (!lock_sec_rec_cons_read_sees(rec,
1650
cons_read_requires_clust_rec = TRUE;
1654
/* PHASE 4: Test search end conditions and deleted flag */
1656
/* Fetch the columns needed in test conditions. The record is
1657
protected by a page latch that was acquired by
1658
row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
1659
will not be released until mtr_commit(mtr). */
1661
row_sel_fetch_columns(index, rec, offsets,
1662
UT_LIST_GET_FIRST(plan->columns));
1664
/* Test the selection end conditions: these can only contain columns
1665
which already are found in the index, even though the index might be
1668
if (plan->unique_search && cursor_just_opened) {
1670
/* No test necessary: the test was already made above */
1672
} else if (!row_sel_test_end_conds(plan)) {
1674
goto table_exhausted;
1677
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1678
&& !cons_read_requires_clust_rec) {
1680
/* The record is delete marked: we can skip it if this is
1681
not a consistent read which might see an earlier version
1682
of a non-clustered index record */
1684
if (plan->unique_search) {
1686
goto table_exhausted;
1692
/* PHASE 5: Get the clustered index record, if needed and if we did
1693
not do the search using the clustered index */
1695
if (plan->must_get_clust || cons_read_requires_clust_rec) {
1697
/* It was a non-clustered index and we must fetch also the
1698
clustered index record */
1700
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1702
mtr_has_extra_clust_latch = TRUE;
1704
if (err != DB_SUCCESS) {
1706
goto lock_wait_or_error;
1709
/* Retrieving the clustered record required a search:
1710
increment the cost counter */
1714
if (clust_rec == NULL) {
1715
/* The record did not exist in the read view */
1716
ut_ad(consistent_read);
1721
if (rec_get_deleted_flag(clust_rec,
1722
dict_table_is_comp(plan->table))) {
1724
/* The record is delete marked: we can skip it */
1729
if (node->can_get_updated) {
1731
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1735
/* PHASE 6: Test the rest of search conditions */
1737
if (!row_sel_test_other_conds(plan)) {
1739
if (plan->unique_search) {
1741
goto table_exhausted;
1747
/* PHASE 7: We found a new qualifying row for the current table; push
1748
the row if prefetch is on, or move to the next table in the join */
1750
plan->n_rows_fetched++;
1752
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
1754
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1755
|| plan->unique_search || plan->no_prefetch
1756
|| plan->table->big_rows) {
1758
/* No prefetch in operation: go to the next table */
1763
sel_push_prefetched_row(plan);
1765
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1767
/* The prefetch buffer is now full */
1769
sel_pop_prefetched_row(plan);
1775
ut_ad(!search_latch_locked);
1777
if (mtr_has_extra_clust_latch) {
1779
/* We must commit &mtr if we are moving to the next
1780
non-clustered index record, because we could break the
1781
latching order if we would access a different clustered
1782
index page right away without releasing the previous. */
1784
goto commit_mtr_for_a_while;
1788
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1790
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1795
goto table_exhausted;
1798
cursor_just_opened = FALSE;
1800
/* END OF RECORD LOOP
1801
------------------ */
1805
/* We found a record which satisfies the conditions: we can move to
1806
the next table or return a row in the result set */
1808
ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
1810
if (plan->unique_search && !node->can_get_updated) {
1812
plan->cursor_at_end = TRUE;
1814
ut_ad(!search_latch_locked);
1816
plan->stored_cursor_rec_processed = TRUE;
1818
btr_pcur_store_position(&(plan->pcur), &mtr);
1823
mtr_has_extra_clust_latch = FALSE;
1826
/* If we use 'goto' to this label, it means that the row was popped
1827
from the prefetched rows stack, and &mtr is already committed */
1829
if (node->fetch_table + 1 == node->n_tables) {
1831
sel_eval_select_list(node);
1833
if (node->is_aggregate) {
1838
sel_assign_into_var_values(node->into_list, node);
1840
thr->run_node = que_node_get_parent(node);
1846
node->fetch_table++;
1848
/* When we move to the next table, we first reset the plan cursor:
1849
we do not care about resetting it when we backtrack from a table */
1851
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1856
/* The table cursor pcur reached the result set end: backtrack to the
1857
previous table in the join if we do not have cached prefetched rows */
1859
plan->cursor_at_end = TRUE;
1863
mtr_has_extra_clust_latch = FALSE;
1865
if (plan->n_rows_prefetched > 0) {
1866
/* The table became exhausted during a prefetch */
1868
sel_pop_prefetched_row(plan);
1870
goto next_table_no_mtr;
1873
table_exhausted_no_mtr:
1874
if (node->fetch_table == 0) {
1877
if (node->is_aggregate && !node->aggregate_already_fetched) {
1879
node->aggregate_already_fetched = TRUE;
1881
sel_assign_into_var_values(node->into_list, node);
1883
thr->run_node = que_node_get_parent(node);
1885
node->state = SEL_NODE_NO_MORE_ROWS;
1887
thr->run_node = que_node_get_parent(node);
1893
node->fetch_table--;
1898
/* Return control for a while to que_run_threads, so that runaway
1899
queries can be canceled. NOTE that when we come here, we must, in a
1900
locking read, have placed the necessary (possibly waiting request)
1901
record lock on the cursor record or its successor: when we reposition
1902
the cursor, this record lock guarantees that nobody can meanwhile have
1903
inserted new records which should have appeared in the result set,
1904
which would result in the phantom problem. */
1906
ut_ad(!search_latch_locked);
1908
plan->stored_cursor_rec_processed = FALSE;
1909
btr_pcur_store_position(&(plan->pcur), &mtr);
1913
#ifdef UNIV_SYNC_DEBUG
1914
ut_ad(sync_thread_levels_empty_gen(TRUE));
1915
#endif /* UNIV_SYNC_DEBUG */
1919
commit_mtr_for_a_while:
1920
/* Stores the cursor position and commits &mtr; this is used if
1921
&mtr may contain latches which would break the latching order if
1922
&mtr would not be committed and the latches released. */
1924
plan->stored_cursor_rec_processed = TRUE;
1926
ut_ad(!search_latch_locked);
1927
btr_pcur_store_position(&(plan->pcur), &mtr);
1931
mtr_has_extra_clust_latch = FALSE;
1933
#ifdef UNIV_SYNC_DEBUG
1934
ut_ad(sync_thread_levels_empty_gen(TRUE));
1935
#endif /* UNIV_SYNC_DEBUG */
1940
/* See the note at stop_for_a_while: the same holds for this case */
1942
ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
1943
ut_ad(!search_latch_locked);
1945
plan->stored_cursor_rec_processed = FALSE;
1946
btr_pcur_store_position(&(plan->pcur), &mtr);
1950
#ifdef UNIV_SYNC_DEBUG
1951
ut_ad(sync_thread_levels_empty_gen(TRUE));
1952
#endif /* UNIV_SYNC_DEBUG */
1955
if (search_latch_locked) {
1956
rw_lock_s_unlock(&btr_search_latch);
1958
if (UNIV_LIKELY_NULL(heap)) {
1959
mem_heap_free(heap);
1964
/**********************************************************************//**
1965
Performs a select step. This is a high-level function used in SQL execution
1967
@return query thread to run next or NULL */
1972
que_thr_t* thr) /*!< in: query thread */
1975
sym_node_t* table_node;
1981
node = thr->run_node;
1983
ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
1985
/* If this is a new time this node is executed (or when execution
1986
resumes after wait for a table intention lock), set intention locks
1987
on the tables, or assign a read view */
1989
if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
1991
node->state = SEL_NODE_OPEN;
1994
if (node->state == SEL_NODE_OPEN) {
1996
/* It may be that the current session has not yet started
1997
its transaction, or it has been committed: */
1999
trx_start_if_not_started(thr_get_trx(thr));
2001
plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2003
if (node->consistent_read) {
2004
/* Assign a read view for the query */
2005
node->read_view = trx_assign_read_view(
2008
if (node->set_x_locks) {
2009
i_lock_mode = LOCK_IX;
2011
i_lock_mode = LOCK_IS;
2014
table_node = node->table_list;
2016
while (table_node) {
2017
err = lock_table(0, table_node->table,
2019
if (err != DB_SUCCESS) {
2020
thr_get_trx(thr)->error_state = err;
2025
table_node = que_node_get_next(table_node);
2029
/* If this is an explicit cursor, copy stored procedure
2030
variable values, so that the values cannot change between
2031
fetches (currently, we copy them also for non-explicit
2034
if (node->explicit_cursor
2035
&& UT_LIST_GET_FIRST(node->copy_variables)) {
2037
row_sel_copy_input_variable_vals(node);
2040
node->state = SEL_NODE_FETCH;
2041
node->fetch_table = 0;
2043
if (node->is_aggregate) {
2044
/* Reset the aggregate total values */
2045
sel_reset_aggregate_vals(node);
2049
err = row_sel(node, thr);
2051
/* NOTE! if queries are parallelized, the following assignment may
2052
have problems; the assignment should be made only if thr is the
2053
only top-level thr in the graph: */
2055
thr->graph->last_sel_node = node;
2057
if (err != DB_SUCCESS) {
2058
thr_get_trx(thr)->error_state = err;
2066
/**********************************************************************//**
2067
Performs a fetch for a cursor.
2068
@return query thread to run next or NULL */
2073
que_thr_t* thr) /*!< in: query thread */
2075
sel_node_t* sel_node;
2080
node = thr->run_node;
2081
sel_node = node->cursor_def;
2083
ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2085
if (thr->prev_node != que_node_get_parent(node)) {
2087
if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2089
if (node->into_list) {
2090
sel_assign_into_var_values(node->into_list,
2093
void* ret = (*node->func->func)(
2094
sel_node, node->func->arg);
2098
= SEL_NODE_NO_MORE_ROWS;
2103
thr->run_node = que_node_get_parent(node);
2108
/* Make the fetch node the parent of the cursor definition for
2109
the time of the fetch, so that execution knows to return to this
2110
fetch node after a row has been selected or we know that there is
2113
sel_node->common.parent = node;
2115
if (sel_node->state == SEL_NODE_CLOSED) {
2117
"InnoDB: Error: fetch called on a closed cursor\n");
2119
thr_get_trx(thr)->error_state = DB_ERROR;
2124
thr->run_node = sel_node;
2129
/****************************************************************//**
2130
Sample callback function for fetch that prints each row.
2131
@return always returns non-NULL */
2136
void* row, /*!< in: sel_node_t* */
2137
void* user_arg) /*!< in: not used */
2139
sel_node_t* node = row;
2143
UT_NOT_USED(user_arg);
2145
fprintf(stderr, "row_fetch_print: row %p\n", row);
2147
exp = node->select_list;
2150
dfield_t* dfield = que_node_get_val(exp);
2151
const dtype_t* type = dfield_get_type(dfield);
2153
fprintf(stderr, " column %lu:\n", (ulong)i);
2158
if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2159
ut_print_buf(stderr, dfield_get_data(dfield),
2160
dfield_get_len(dfield));
2163
fputs(" <NULL>;\n", stderr);
2166
exp = que_node_get_next(exp);
2173
/****************************************************************//**
2174
Callback function for fetch that stores an unsigned 4 byte integer to the
2175
location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
2177
@return always returns NULL */
2180
row_fetch_store_uint4(
2181
/*==================*/
2182
void* row, /*!< in: sel_node_t* */
2183
void* user_arg) /*!< in: data pointer */
2185
sel_node_t* node = row;
2186
ib_uint32_t* val = user_arg;
2189
dfield_t* dfield = que_node_get_val(node->select_list);
2190
const dtype_t* type = dfield_get_type(dfield);
2191
ulint len = dfield_get_len(dfield);
2193
ut_a(dtype_get_mtype(type) == DATA_INT);
2194
ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
2197
tmp = mach_read_from_4(dfield_get_data(dfield));
2198
*val = (ib_uint32_t) tmp;
2203
/***********************************************************//**
2204
Prints a row in a select result.
2205
@return query thread to run next or NULL */
2210
que_thr_t* thr) /*!< in: query thread */
2212
row_printf_node_t* node;
2213
sel_node_t* sel_node;
2218
node = thr->run_node;
2220
sel_node = node->sel_node;
2222
ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2224
if (thr->prev_node == que_node_get_parent(node)) {
2226
/* Reset the cursor */
2227
sel_node->state = SEL_NODE_OPEN;
2229
/* Fetch next row to print */
2231
thr->run_node = sel_node;
2236
if (sel_node->state != SEL_NODE_FETCH) {
2238
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2240
/* No more rows to print */
2242
thr->run_node = que_node_get_parent(node);
2247
arg = sel_node->select_list;
2250
dfield_print_also_hex(que_node_get_val(arg));
2252
fputs(" ::: ", stderr);
2254
arg = que_node_get_next(arg);
2259
/* Fetch next row to print */
2261
thr->run_node = sel_node;
2266
/****************************************************************//**
2267
Converts a key value stored in MySQL format to an Innobase dtuple. The last
2268
field of the key value may be just a prefix of a fixed length field: hence
2269
the parameter key_len. But currently we do not allow search keys where the
2270
last field is only a prefix of the full key field len and print a warning if
2271
such appears. A counterpart of this function is
2272
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2275
row_sel_convert_mysql_key_to_innobase(
2276
/*==================================*/
2277
dtuple_t* tuple, /*!< in/out: tuple where to build;
2278
NOTE: we assume that the type info
2279
in the tuple is already according
2281
byte* buf, /*!< in: buffer to use in field
2283
ulint buf_len, /*!< in: buffer length */
2284
dict_index_t* index, /*!< in: index of the key value */
2285
const byte* key_ptr, /*!< in: MySQL key value */
2286
ulint key_len, /*!< in: MySQL key value length */
2287
trx_t* trx) /*!< in: transaction */
2289
byte* original_buf = buf;
2290
const byte* original_key_ptr = key_ptr;
2291
dict_field_t* field;
2295
ulint data_field_len;
2297
const byte* key_end;
2300
/* For documentation of the key value storage format in MySQL, see
2301
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2303
key_end = key_ptr + key_len;
2305
/* Permit us to access any field in the tuple (ULINT_MAX): */
2307
dtuple_set_n_fields(tuple, ULINT_MAX);
2309
dfield = dtuple_get_nth_field(tuple, 0);
2310
field = dict_index_get_nth_field(index, 0);
2312
if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2313
/* A special case: we are looking for a position in the
2314
generated clustered index which InnoDB automatically added
2315
to a table with no primary key: the first and the only
2316
ordering column is ROW_ID which InnoDB stored to the key_ptr
2319
ut_a(key_len == DATA_ROW_ID_LEN);
2321
dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2323
dtuple_set_n_fields(tuple, 1);
2328
while (key_ptr < key_end) {
2330
ulint type = dfield_get_type(dfield)->mtype;
2331
ut_a(field->col->mtype == type);
2336
if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2337
/* The first byte in the field tells if this is
2338
an SQL NULL value */
2342
if (*key_ptr != 0) {
2343
dfield_set_null(dfield);
2349
/* Calculate data length and data field total length */
2351
if (type == DATA_BLOB) {
2352
/* The key field is a column prefix of a BLOB or
2355
ut_a(field->prefix_len > 0);
2357
/* MySQL stores the actual data length to the first 2
2358
bytes after the optional SQL NULL marker byte. The
2359
storage format is little-endian, that is, the most
2360
significant byte at a higher address. In UTF-8, MySQL
2361
seems to reserve field->prefix_len bytes for
2362
storing this field in the key value buffer, even
2363
though the actual value only takes data_len bytes
2366
data_len = key_ptr[data_offset]
2367
+ 256 * key_ptr[data_offset + 1];
2368
data_field_len = data_offset + 2 + field->prefix_len;
2372
/* Now that we know the length, we store the column
2373
value like it would be a fixed char field */
2375
} else if (field->prefix_len > 0) {
2376
/* Looks like MySQL pads unused end bytes in the
2377
prefix with space. Therefore, also in UTF-8, it is ok
2378
to compare with a prefix containing full prefix_len
2379
bytes, and no need to take at most prefix_len / 3
2380
UTF-8 characters from the start.
2381
If the prefix is used as the upper end of a LIKE
2382
'abc%' query, then MySQL pads the end with chars
2383
0xff. TODO: in that case does it any harm to compare
2384
with the full prefix_len bytes. How do characters
2385
0xff in UTF-8 behave? */
2387
data_len = field->prefix_len;
2388
data_field_len = data_offset + data_len;
2390
data_len = dfield_get_type(dfield)->len;
2391
data_field_len = data_offset + data_len;
2395
(dtype_get_mysql_type(dfield_get_type(dfield))
2396
== DATA_MYSQL_TRUE_VARCHAR)
2397
&& UNIV_LIKELY(type != DATA_INT)) {
2398
/* In a MySQL key value format, a true VARCHAR is
2399
always preceded by 2 bytes of a length field.
2400
dfield_get_type(dfield)->len returns the maximum
2401
'payload' len in bytes. That does not include the
2402
2 bytes that tell the actual data length.
2404
We added the check != DATA_INT to make sure we do
2405
not treat MySQL ENUM or SET as a true VARCHAR! */
2408
data_field_len += 2;
2411
/* Storing may use at most data_len bytes of buf */
2413
if (UNIV_LIKELY(!is_null)) {
2414
row_mysql_store_col_in_innobase_format(
2416
FALSE, /* MySQL key value format col */
2417
key_ptr + data_offset, data_len,
2418
dict_table_is_comp(index->table));
2422
key_ptr += data_field_len;
2424
if (UNIV_UNLIKELY(key_ptr > key_end)) {
2425
/* The last field in key was not a complete key field
2428
Print a warning about this! HA_READ_PREFIX_LAST does
2429
not currently work in InnoDB with partial-field key
2430
value prefixes. Since MySQL currently uses a padding
2431
trick to calculate LIKE 'abc%' type queries there
2432
should never be partial-field prefixes in searches. */
2434
ut_print_timestamp(stderr);
2436
fputs(" InnoDB: Warning: using a partial-field"
2437
" key prefix in search.\n"
2438
"InnoDB: ", stderr);
2439
dict_index_name_print(stderr, trx, index);
2440
fprintf(stderr, ". Last data field length %lu bytes,\n"
2441
"InnoDB: key ptr now exceeds"
2442
" key end by %lu bytes.\n"
2443
"InnoDB: Key value in the MySQL format:\n",
2444
(ulong) data_field_len,
2445
(ulong) (key_ptr - key_end));
2447
ut_print_buf(stderr, original_key_ptr, key_len);
2451
ulint len = dfield_get_len(dfield);
2452
dfield_set_len(dfield, len
2453
- (ulint) (key_ptr - key_end));
2462
ut_a(buf <= original_buf + buf_len);
2464
/* We set the length of tuple to n_fields: we assume that the memory
2465
area allocated for it is big enough (usually bigger than n_fields). */
2467
dtuple_set_n_fields(tuple, n_fields);
2470
/**************************************************************//**
2471
Stores the row id to the prebuilt struct. */
2474
row_sel_store_row_id_to_prebuilt(
2475
/*=============================*/
2476
row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2477
const rec_t* index_rec, /*!< in: record */
2478
const dict_index_t* index, /*!< in: index of the record */
2479
const ulint* offsets) /*!< in: rec_get_offsets
2480
(index_rec, index) */
2485
ut_ad(rec_offs_validate(index_rec, index, offsets));
2487
data = rec_get_nth_field(
2489
dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2491
if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2493
"InnoDB: Error: Row id field is"
2494
" wrong length %lu in ", (ulong) len);
2495
dict_index_name_print(stderr, prebuilt->trx, index);
2496
fprintf(stderr, "\n"
2497
"InnoDB: Field number %lu, record:\n",
2498
(ulong) dict_index_get_sys_col_pos(index,
2500
rec_print_new(stderr, index_rec, offsets);
2505
ut_memcpy(prebuilt->row_id, data, len);
2508
/**************************************************************//**
2509
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2510
function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
2513
row_sel_field_store_in_mysql_format(
2514
/*================================*/
2515
byte* dest, /*!< in/out: buffer where to store; NOTE
2516
that BLOBs are not in themselves
2517
stored here: the caller must allocate
2518
and copy the BLOB into buffer before,
2519
and pass the pointer to the BLOB in
2521
const mysql_row_templ_t* templ,
2522
/*!< in: MySQL column template.
2523
Its following fields are referenced:
2524
type, is_unsigned, mysql_col_len,
2525
mbminlen, mbmaxlen */
2526
const byte* data, /*!< in: data to store */
2527
ulint len) /*!< in: length of the data */
2533
ut_ad(len != UNIV_SQL_NULL);
2535
switch (templ->type) {
2537
/* Convert integer data from Innobase to a little-endian
2538
format, sign bit restored to normal */
2551
if (!templ->is_unsigned) {
2552
dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2555
ut_ad(templ->mysql_col_len == len);
2561
field_end = dest + templ->mysql_col_len;
2563
if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2564
/* This is a >= 5.0.3 type true VARCHAR. Store the
2565
length of the data to the first byte or the first
2566
two bytes of dest. */
2568
dest = row_mysql_store_true_var_len(
2569
dest, len, templ->mysql_length_bytes);
2572
/* Copy the actual data */
2573
ut_memcpy(dest, data, len);
2575
/* Pad with trailing spaces. We pad with spaces also the
2576
unused end of a >= 5.0.3 true VARCHAR column, just in case
2577
MySQL expects its contents to be deterministic. */
2579
pad_ptr = dest + len;
2581
ut_ad(templ->mbminlen <= templ->mbmaxlen);
2583
/* We handle UCS2 charset strings differently. */
2584
if (templ->mbminlen == 2) {
2585
/* A space char is two bytes, 0x0020 in UCS2 */
2588
/* A 0x20 has been stripped from the column.
2591
if (pad_ptr < field_end) {
2597
/* Pad the rest of the string with 0x0020 */
2599
while (pad_ptr < field_end) {
2606
ut_ad(templ->mbminlen == 1);
2609
memset(pad_ptr, 0x20, field_end - pad_ptr);
2614
/* Store a pointer to the BLOB buffer to dest: the BLOB was
2615
already copied to the buffer in row_sel_store_mysql_rec */
2617
row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2622
memcpy(dest, data, len);
2624
ut_ad(templ->mysql_col_len >= len);
2625
ut_ad(templ->mbmaxlen >= templ->mbminlen);
2627
ut_ad(templ->mbmaxlen > templ->mbminlen
2628
|| templ->mysql_col_len == len);
2629
/* The following assertion would fail for old tables
2630
containing UTF-8 ENUM columns due to Bug #9526. */
2631
ut_ad(!templ->mbmaxlen
2632
|| !(templ->mysql_col_len % templ->mbmaxlen));
2633
ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
2635
if (templ->mbminlen != templ->mbmaxlen) {
2636
/* Pad with spaces. This undoes the stripping
2637
done in row0mysql.ic, function
2638
row_mysql_store_col_in_innobase_format(). */
2640
memset(dest + len, 0x20, templ->mysql_col_len - len);
2646
case DATA_SYS_CHILD:
2648
/* These column types should never be shipped to MySQL. */
2652
case DATA_FIXBINARY:
2656
/* Above are the valid column types for MySQL data. */
2657
#endif /* UNIV_DEBUG */
2658
ut_ad(templ->mysql_col_len == len);
2659
memcpy(dest, data, len);
2663
/* NOTE(review): this region is damaged by a bad extraction. The bare
integers between code lines are stray original source line numbers, and
the gaps in that numbering (e.g. 2669 -> 2672, 2787 -> 2794) show that
original code lines are missing here: the function's return-type line
(presumably "UNIV_INTERN ibool"), several closing braces, comment
terminators, and the final "return(TRUE);". Code bytes are left
untouched below; only comments are added. TODO: restore this function
from a pristine copy of row0sel.c before any behavioral change. */
/**************************************************************//**
2664
Convert a row in the Innobase format to a row in the MySQL format.
2665
Note that the template in prebuilt may advise us to copy only a few
2666
columns to mysql_rec, other columns are left blank. All columns may not
2667
be needed in the query.
2668
@return TRUE if success, FALSE if could not allocate memory for a BLOB
2669
(though we may also assert in that case) */
2672
row_sel_store_mysql_rec(
2673
/*====================*/
2674
	byte*		mysql_rec,	/*!< out: row in the MySQL format */
2675
	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
2676
	const rec_t*	rec,		/*!< in: Innobase record in the index
2677
					which was described in prebuilt's
2678
					template; must be protected by
2680
	const ulint*	offsets)	/*!< in: array returned by
2681
					rec_get_offsets() */
2683
	mysql_row_templ_t*	templ;
2684
	mem_heap_t*	extern_field_heap	= NULL;
2690
	ut_ad(prebuilt->mysql_template);
2691
	ut_ad(prebuilt->default_rec);
2692
	ut_ad(rec_offs_validate(rec, NULL, offsets));
2694
	/* Any BLOBs from a previous fetch are released before this row is
	converted; the heap is re-created lazily below when needed. */
	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2695
		mem_heap_free(prebuilt->blob_heap);
2696
		prebuilt->blob_heap = NULL;
2699
	for (i = 0; i < prebuilt->n_template ; i++) {
2701
		templ = prebuilt->mysql_template + i;
2703
		if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
2704
						      templ->rec_field_no))) {
2706
			/* Copy an externally stored field to the temporary
2709
			ut_a(!prebuilt->trx->has_search_latch);
2711
			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2712
				if (prebuilt->blob_heap == NULL) {
2713
					prebuilt->blob_heap = mem_heap_create(
2717
				heap = prebuilt->blob_heap;
2720
					= mem_heap_create(UNIV_PAGE_SIZE);
2722
				heap = extern_field_heap;
2725
				/* NOTE: if we are retrieving a big BLOB, we may
2726
				already run out of memory in the next call, which
2729
			data = btr_rec_copy_externally_stored_field(
2731
				dict_table_zip_size(prebuilt->table),
2732
				templ->rec_field_no, &len, heap);
2734
			ut_a(len != UNIV_SQL_NULL);
2736
			/* Field is stored in the row. */
2738
			data = rec_get_nth_field(rec, offsets,
2739
						 templ->rec_field_no, &len);
2741
			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
2742
			    && len != UNIV_SQL_NULL) {
2744
				/* It is a BLOB field locally stored in the
2745
				InnoDB record: we MUST copy its contents to
2746
				prebuilt->blob_heap here because later code
2747
				assumes all BLOB values have been copied to a
2750
				if (prebuilt->blob_heap == NULL) {
2751
					prebuilt->blob_heap = mem_heap_create(
2755
				data = memcpy(mem_heap_alloc(
2756
						      prebuilt->blob_heap, len),
2761
		if (len != UNIV_SQL_NULL) {
2762
			row_sel_field_store_in_mysql_format(
2763
				mysql_rec + templ->mysql_col_offset,
2767
			if (extern_field_heap) {
2768
				mem_heap_free(extern_field_heap);
2769
				extern_field_heap = NULL;
2772
			if (templ->mysql_null_bit_mask) {
2773
				/* It is a nullable column with a non-NULL
2775
				mysql_rec[templ->mysql_null_byte_offset]
2776
					&= ~(byte) templ->mysql_null_bit_mask;
2779
			/* MySQL assumes that the field for an SQL
2780
			NULL value is set to the default value. */
2782
			mysql_rec[templ->mysql_null_byte_offset]
2783
				|= (byte) templ->mysql_null_bit_mask;
2784
			memcpy(mysql_rec + templ->mysql_col_offset,
2785
			       (const byte*) prebuilt->default_rec
2786
			       + templ->mysql_col_offset,
2787
			       templ->mysql_col_len);
2794
/* NOTE(review): extraction damage — the bare integers below are stray
original line numbers; gaps in that numbering (2796 -> 2799, 2813 -> 2817,
2825 -> 2829) show missing lines: the return-type line (presumably
"UNIV_INTERN ulint"), the opening brace, an "else" head before the
mem_heap_create(200), and the trailing "return(err);" with closing brace.
Code bytes untouched; comments only. TODO: restore from pristine
row0sel.c. */
/*********************************************************************//**
2795
Builds a previous version of a clustered index record for a consistent read
2796
@return DB_SUCCESS or error code */
2799
row_sel_build_prev_vers_for_mysql(
2800
/*==============================*/
2801
	read_view_t*	read_view,	/*!< in: read view */
2802
	dict_index_t*	clust_index,	/*!< in: clustered index */
2803
	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
2804
	const rec_t*	rec,		/*!< in: record in a clustered index */
2805
	ulint**		offsets,	/*!< in/out: offsets returned by
2806
					rec_get_offsets(rec, clust_index) */
2807
	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
2808
					the offsets are allocated */
2809
	rec_t**		old_vers,	/*!< out: old version, or NULL if the
2810
					record does not exist in the view:
2811
					i.e., it was freshly inserted
2813
	mtr_t*		mtr)		/*!< in: mtr */
2817
	/* Reuse (or, in the missing else-branch, create) the prebuilt's
	private heap for the reconstructed old row version. */
	if (prebuilt->old_vers_heap) {
2818
		mem_heap_empty(prebuilt->old_vers_heap);
2820
		prebuilt->old_vers_heap = mem_heap_create(200);
2823
	err = row_vers_build_for_consistent_read(
2824
		rec, mtr, clust_index, offsets, read_view, offset_heap,
2825
		prebuilt->old_vers_heap, old_vers);
2829
/* NOTE(review): extraction damage — interleaved bare integers are stray
original line numbers; the many gaps in that numbering (2831 -> 2833,
2857 -> 2859, 2894 -> 2897, 2916 -> 2924, 2980 -> 2984, 2993 -> 2999,
3005 onward) show missing lines: the return-type line, local variable
declarations (trx, old_vers, err), several fputs( heads whose string
arguments survive orphaned, goto labels/targets (e.g. the func_exit /
err_exit paths), and the closing brace with "return(err);". Code bytes
untouched; comments only. TODO: restore from pristine row0sel.c. */
/*********************************************************************//**
2830
Retrieves the clustered index record corresponding to a record in a
2831
non-clustered index. Does the necessary locking. Used in the MySQL
2833
@return DB_SUCCESS or error code */
2836
row_sel_get_clust_rec_for_mysql(
2837
/*============================*/
2838
	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct in the handle */
2839
	dict_index_t*	sec_index,/*!< in: secondary index where rec resides */
2840
	const rec_t*	rec,	/*!< in: record in a non-clustered index; if
2841
				this is a locking read, then rec is not
2842
				allowed to be delete-marked, and that would
2843
				not make sense either */
2844
	que_thr_t*	thr,	/*!< in: query thread */
2845
	const rec_t**	out_rec,/*!< out: clustered record or an old version of
2846
				it, NULL if the old version did not exist
2847
				in the read view, i.e., it was a fresh
2849
	ulint**		offsets,/*!< in: offsets returned by
2850
				rec_get_offsets(rec, sec_index);
2851
				out: offsets returned by
2852
				rec_get_offsets(out_rec, clust_index) */
2853
	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
2854
				the offsets are allocated */
2855
	mtr_t*		mtr)	/*!< in: mtr used to get access to the
2856
				non-clustered record; the same mtr is used to
2857
				access the clustered index */
2859
	dict_index_t*	clust_index;
2860
	const rec_t*	clust_rec;
2866
	trx = thr_get_trx(thr);
2868
	/* Build a search tuple (row reference) from the secondary-index
	record, then position the clustered-index cursor on it. */
	row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
2869
				   sec_index, *offsets, trx);
2871
	clust_index = dict_table_get_first_index(sec_index->table);
2873
	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
2874
				   PAGE_CUR_LE, BTR_SEARCH_LEAF,
2875
				   prebuilt->clust_pcur, 0, mtr);
2877
	clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
2879
	prebuilt->clust_pcur->trx_if_known = trx;
2881
	/* Note: only if the search ends up on a non-infimum record is the
2882
	low_match value the real match to the search tuple */
2884
	if (!page_rec_is_user_rec(clust_rec)
2885
	    || btr_pcur_get_low_match(prebuilt->clust_pcur)
2886
	    < dict_index_get_n_unique(clust_index)) {
2888
		/* In a rare case it is possible that no clust rec is found
2889
		for a delete-marked secondary index record: if in row0umod.c
2890
		in row_undo_mod_remove_clust_low() we have already removed
2891
		the clust rec, while purge is still cleaning and removing
2892
		secondary index records associated with earlier versions of
2893
		the clustered index record. In that case we know that the
2894
		clustered index record did not exist in the read view of
2897
		if (!rec_get_deleted_flag(rec,
2898
					  dict_table_is_comp(sec_index->table))
2899
		    || prebuilt->select_lock_type != LOCK_NONE) {
2900
			ut_print_timestamp(stderr);
2901
			fputs(" InnoDB: error clustered record"
2902
			      " for sec rec not found\n"
2903
			      "InnoDB: ", stderr);
2904
			dict_index_name_print(stderr, trx, sec_index);
2906
			      "InnoDB: sec index record ", stderr);
2907
			rec_print(stderr, rec, sec_index);
2909
			      "InnoDB: clust index record ", stderr);
2910
			rec_print(stderr, clust_rec, clust_index);
2912
			trx_print(stderr, trx, 600);
2915
			      "InnoDB: Submit a detailed bug report"
2916
			      " to http://bugs.mysql.com\n", stderr);
2924
	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
2925
				   ULINT_UNDEFINED, offset_heap);
2927
	if (prebuilt->select_lock_type != LOCK_NONE) {
2928
		/* Try to place a lock on the index record; we are searching
2929
		the clust rec with a unique condition, hence
2930
		we set a LOCK_REC_NOT_GAP type lock */
2932
		err = lock_clust_rec_read_check_and_lock(
2933
			0, btr_pcur_get_block(prebuilt->clust_pcur),
2934
			clust_rec, clust_index, *offsets,
2935
			prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
2936
		if (err != DB_SUCCESS) {
2941
		/* This is a non-locking consistent read: if necessary, fetch
2942
		a previous version of the record */
2946
		/* If the isolation level allows reading of uncommitted data,
2947
		then we never look for an earlier version */
2949
		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
2950
		    && !lock_clust_rec_cons_read_sees(
2951
			    clust_rec, clust_index, *offsets,
2954
			/* The following call returns 'offsets' associated with
2956
			err = row_sel_build_prev_vers_for_mysql(
2957
				trx->read_view, clust_index, prebuilt,
2958
				clust_rec, offsets, offset_heap, &old_vers,
2961
			if (err != DB_SUCCESS || old_vers == NULL) {
2966
			clust_rec = old_vers;
2969
		/* If we had to go to an earlier version of row or the
2970
		secondary index record is delete marked, then it may be that
2971
		the secondary index record corresponding to clust_rec
2972
		(or old_vers) is not rec; in that case we must ignore
2973
		such row because in our snapshot rec would not have existed.
2974
		Remember that from rec we cannot see directly which transaction
2975
		id corresponds to it: we have to go to the clustered index
2976
		record. A query where we want to fetch all rows where
2977
		the secondary index value is in some interval would return
2978
		a wrong result if we would not drop rows which we come to
2979
		visit through secondary index records that would not really
2980
		exist in our snapshot. */
2984
		    || rec_get_deleted_flag(rec, dict_table_is_comp(
2986
		    && !row_sel_sec_rec_is_for_clust_rec(
2987
			    rec, sec_index, clust_rec, clust_index)) {
2989
#ifdef UNIV_SEARCH_DEBUG
2991
		ut_a(clust_rec == NULL
2992
		     || row_sel_sec_rec_is_for_clust_rec(
2993
			     rec, sec_index, clust_rec, clust_index));
2999
	*out_rec = clust_rec;
3001
	if (prebuilt->select_lock_type != LOCK_NONE) {
3002
		/* We may use the cursor in update or in unlock_row():
3003
		store its position */
3005
		btr_pcur_store_position(prebuilt->clust_pcur, mtr);
3013
/* NOTE(review): extraction damage — bare integers are stray original
line numbers; numbering gaps (3018 -> 3021, 3045 -> 3051, 3051 -> 3057,
3075 onward) show missing lines: the return-type line (presumably
"UNIV_INTERN ibool"), the "ibool success;" declaration, the bodies of the
BTR_PCUR_ON branch (the "if (!success && moves_up)" guard before
btr_pcur_move_to_next), the "return(TRUE)/return(FALSE)" statements and
closing braces. Code bytes untouched; comments only. TODO: restore from
pristine row0sel.c. */
/********************************************************************//**
3014
Restores cursor position after it has been stored. We have to take into
3015
account that the record cursor was positioned on may have been deleted.
3016
Then we may have to move the cursor one step up or down.
3017
@return TRUE if we may need to process the record the cursor is now
3018
positioned on (i.e. we should not go to the next record yet) */
3021
sel_restore_position_for_mysql(
3022
/*===========================*/
3023
	ibool*		same_user_rec,	/*!< out: TRUE if we were able to restore
3024
					the cursor on a user record with the
3025
					same ordering prefix in in the
3027
	ulint		latch_mode,	/*!< in: latch mode wished in
3029
	btr_pcur_t*	pcur,		/*!< in: cursor whose position
3031
	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
3033
	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
3037
	ulint		relative_position;
3039
	relative_position = pcur->rel_pos;
3041
	success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3043
	*same_user_rec = success;
3045
	/* Dispatch on where the stored position was relative to the
	(possibly deleted) record: ON, AFTER(_LAST_IN_TREE), or
	BEFORE(_FIRST_IN_TREE). */
	if (relative_position == BTR_PCUR_ON) {
3051
		btr_pcur_move_to_next(pcur, mtr);
3057
	if (relative_position == BTR_PCUR_AFTER
3058
	    || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3064
		if (btr_pcur_is_on_user_rec(pcur)) {
3065
			btr_pcur_move_to_prev(pcur, mtr);
3071
	ut_ad(relative_position == BTR_PCUR_BEFORE
3072
	      || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3074
	if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3075
		btr_pcur_move_to_next(pcur, mtr);
3081
/* NOTE(review): extraction damage — bare integers are stray original
line numbers; numbering gaps (3082 -> 3085, 3089 -> 3092, 3108 -> 3110,
3114 -> 3120) show missing lines: the "UNIV_INTERN void" return-type
line, local declarations (cached_rec, i), a comment terminator after
"Copy NULL bit of the current field from cached_rec", the else-branch
ut_memcpy( head whose arguments survive at original lines 3120-3121,
and closing braces. Code bytes untouched; comments only. TODO: restore
from pristine row0sel.c. */
/********************************************************************//**
3082
Pops a cached row for MySQL from the fetch cache. */
3085
row_sel_pop_cached_row_for_mysql(
3086
/*=============================*/
3087
	byte*		buf,		/*!< in/out: buffer where to copy the
3089
	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct */
3092
	mysql_row_templ_t*	templ;
3094
	ut_ad(prebuilt->n_fetch_cached > 0);
3095
	ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3097
	if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3098
		/* Copy cache record field by field, don't touch fields that
3099
		are not covered by current key */
3100
		cached_rec = prebuilt->fetch_cache[
3101
			prebuilt->fetch_cache_first];
3103
		for (i = 0; i < prebuilt->n_template; i++) {
3104
			templ = prebuilt->mysql_template + i;
3105
			ut_memcpy(buf + templ->mysql_col_offset,
3106
				  cached_rec + templ->mysql_col_offset,
3107
				  templ->mysql_col_len);
3108
			/* Copy NULL bit of the current field from cached_rec
3110
			if (templ->mysql_null_bit_mask) {
3111
				buf[templ->mysql_null_byte_offset]
3112
					^= (buf[templ->mysql_null_byte_offset]
3113
					    ^ cached_rec[templ->mysql_null_byte_offset])
3114
					& (byte)templ->mysql_null_bit_mask;
3120
		prebuilt->fetch_cache[prebuilt->fetch_cache_first],
3121
		prebuilt->mysql_prefix_len);
3123
	prebuilt->n_fetch_cached--;
3124
	prebuilt->fetch_cache_first++;
3126
	if (prebuilt->n_fetch_cached == 0) {
3127
		prebuilt->fetch_cache_first = 0;
3131
/* NOTE(review): extraction damage — bare integers are stray original
line numbers; numbering gaps (3132 -> 3135, 3140 -> 3145, 3150 -> 3152,
3173 -> 3177) show missing lines: the "UNIV_INTERN void" return-type
line, local declarations (byte* buf; ulint i;), closing braces for the
allocation loop, and the body of the failure branch after the
row_sel_store_mysql_rec call (presumably an assertion/return) plus the
final closing brace. Code bytes untouched; comments only. TODO: restore
from pristine row0sel.c. */
/********************************************************************//**
3132
Pushes a row for MySQL to the fetch cache. */
3135
row_sel_push_cache_row_for_mysql(
3136
/*=============================*/
3137
	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
3138
	const rec_t*	rec,		/*!< in: record to push; must
3139
					be protected by a page latch */
3140
	const ulint*	offsets)	/*!<in: rec_get_offsets() */
3145
	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3146
	ut_ad(rec_offs_validate(rec, NULL, offsets));
3147
	ut_a(!prebuilt->templ_contains_blob);
3149
	if (prebuilt->fetch_cache[0] == NULL) {
3150
		/* Allocate memory for the fetch cache */
3152
		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
3154
			/* A user has reported memory corruption in these
3155
			buffers in Linux. Put magic numbers there to help
3156
			to track a possible bug. */
3158
			/* 4 guard bytes on each side of the row buffer,
			checked against ROW_PREBUILT_FETCH_MAGIC_N. */
			buf = mem_alloc(prebuilt->mysql_row_len + 8);
3160
			prebuilt->fetch_cache[i] = buf + 4;
3162
			mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
3163
			mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
3164
					ROW_PREBUILT_FETCH_MAGIC_N);
3168
	ut_ad(prebuilt->fetch_cache_first == 0);
3170
	if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
3171
				  prebuilt->fetch_cache[
3172
					  prebuilt->n_fetch_cached],
3173
				  prebuilt, rec, offsets))) {
3177
	prebuilt->n_fetch_cached++;
3180
/* NOTE(review): extraction damage — bare integers are stray original
line numbers; numbering gaps (3185 -> 3188, 3194 -> 3196, 3199 -> 3202,
3207 -> 3213, 3215 -> 3220, 3236 -> 3241, 3243 onward) show missing
lines: the "UNIV_INTERN ulint" return-type line, the "const rec_t* rec;"
declaration, the remaining btr_pcur_open_with_no_init arguments, the
SEL_RETRY returns inside the failed-check branches, and the final
"*out_rec = rec; return(SEL_FOUND);" with closing brace. Code bytes
untouched; comments only. TODO: restore from pristine row0sel.c. */
/*********************************************************************//**
3181
Tries to do a shortcut to fetch a clustered index record with a unique key,
3182
using the hash index if possible (not always). We assume that the search
3183
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
3184
btr search latch has been locked in S-mode.
3185
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3188
row_sel_try_search_shortcut_for_mysql(
3189
/*==================================*/
3190
	const rec_t**	out_rec,/*!< out: record if found */
3191
	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct */
3192
	ulint**		offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
3193
	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
3194
	mtr_t*		mtr)	/*!< in: started mtr */
3196
	dict_index_t*	index = prebuilt->index;
3197
	const dtuple_t*	search_tuple = prebuilt->search_tuple;
3198
	btr_pcur_t*	pcur = prebuilt->pcur;
3199
	trx_t*		trx = prebuilt->trx;
3202
	ut_ad(dict_index_is_clust(index));
3203
	ut_ad(!prebuilt->templ_contains_blob);
3205
	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3206
				   BTR_SEARCH_LEAF, pcur,
3207
#ifndef UNIV_SEARCH_DEBUG
3213
	rec = btr_pcur_get_rec(pcur);
3215
	if (!page_rec_is_user_rec(rec)) {
3220
	/* As the cursor is now placed on a user record after a search with
3221
	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3222
	fields in the user record matched to the search tuple */
3224
	if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3226
		return(SEL_EXHAUSTED);
3229
	/* This is a non-locking consistent read: if necessary, fetch
3230
	a previous version of the record */
3232
	*offsets = rec_get_offsets(rec, index, *offsets,
3233
				   ULINT_UNDEFINED, heap);
3235
	if (!lock_clust_rec_cons_read_sees(rec, index,
3236
					   *offsets, trx->read_view)) {
3241
	if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3243
		return(SEL_EXHAUSTED);
3251
/********************************************************************//**
3252
Searches for rows in the database. This is used in the interface to
3253
MySQL. This function opens a cursor, and also implements fetch next
3254
and fetch prev. NOTE that if we do a search with a full key value
3255
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3256
position and fetch next or fetch prev must not be tried to the cursor!
3257
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3258
DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3261
row_search_for_mysql(
3262
/*=================*/
3263
byte* buf, /*!< in/out: buffer for the fetched
3264
row in the MySQL format */
3265
ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
3266
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
3267
table handle; this contains the info
3268
of search_tuple, index; if search
3269
tuple contains 0 fields then we
3270
position the cursor at the start or
3271
the end of the index, depending on
3273
ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
3274
ROW_SEL_EXACT_PREFIX */
3275
ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
3276
ROW_SEL_PREV; NOTE: if this is != 0,
3277
then prebuilt must have a pcur
3278
with stored position! In opening of a
3279
cursor 'direction' should be 0. */
3281
dict_index_t* index = prebuilt->index;
3282
ibool comp = dict_table_is_comp(index->table);
3283
const dtuple_t* search_tuple = prebuilt->search_tuple;
3284
btr_pcur_t* pcur = prebuilt->pcur;
3285
trx_t* trx = prebuilt->trx;
3286
dict_index_t* clust_index;
3289
const rec_t* result_rec;
3290
const rec_t* clust_rec;
3291
ulint err = DB_SUCCESS;
3292
ibool unique_search = FALSE;
3293
ibool unique_search_from_clust_index = FALSE;
3294
ibool mtr_has_extra_clust_latch = FALSE;
3295
ibool moves_up = FALSE;
3296
ibool set_also_gap_locks = TRUE;
3297
/* if the query is a plain locking SELECT, and the isolation level
3298
is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3299
ibool did_semi_consistent_read = FALSE;
3300
/* if the returned record was locked and we did a semi-consistent
3301
read (fetch the newest committed version), then this is set to
3303
#ifdef UNIV_SEARCH_DEBUG
3305
#endif /* UNIV_SEARCH_DEBUG */
3307
ibool same_user_rec;
3309
mem_heap_t* heap = NULL;
3310
ulint offsets_[REC_OFFS_NORMAL_SIZE];
3311
ulint* offsets = offsets_;
3313
rec_offs_init(offsets_);
3315
ut_ad(index && pcur && search_tuple);
3316
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3318
if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3319
ut_print_timestamp(stderr);
3320
fprintf(stderr, " InnoDB: Error:\n"
3321
"InnoDB: MySQL is trying to use a table handle"
3322
" but the .ibd file for\n"
3323
"InnoDB: table %s does not exist.\n"
3324
"InnoDB: Have you deleted the .ibd file"
3325
" from the database directory under\n"
3326
"InnoDB: the MySQL datadir, or have you used"
3327
" DISCARD TABLESPACE?\n"
3328
"InnoDB: Look from\n"
3329
"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
3330
"InnoDB: how you can resolve the problem.\n",
3331
prebuilt->table->name);
3336
if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
3338
return(DB_MISSING_HISTORY);
3341
if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3343
"InnoDB: Error: trying to free a corrupt\n"
3344
"InnoDB: table handle. Magic n %lu, table name ",
3345
(ulong) prebuilt->magic_n);
3346
ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3349
mem_analyze_corruption(prebuilt);
3355
fprintf(stderr, "Match mode %lu\n search tuple ",
3356
(ulong) match_mode);
3357
dtuple_print(search_tuple);
3358
fprintf(stderr, "N tables locked %lu\n",
3359
(ulong) trx->mysql_n_tables_locked);
3361
/*-------------------------------------------------------------*/
3362
/* PHASE 0: Release a possible s-latch we are holding on the
3363
adaptive hash index latch if there is someone waiting behind */
3365
if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3366
&& trx->has_search_latch) {
3368
/* There is an x-latch request on the adaptive hash index:
3369
release the s-latch to reduce starvation and wait for
3370
BTR_SEA_TIMEOUT rounds before trying to keep it again over
3373
rw_lock_s_unlock(&btr_search_latch);
3374
trx->has_search_latch = FALSE;
3376
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3379
/* Reset the new record lock info if srv_locks_unsafe_for_binlog
3380
is set or session is using a READ COMMITED isolation level. Then
3381
we are able to remove the record locks set here on an individual
3383
prebuilt->new_rec_locks = 0;
3385
/*-------------------------------------------------------------*/
3386
/* PHASE 1: Try to pop the row from the prefetch cache */
3388
if (UNIV_UNLIKELY(direction == 0)) {
3389
trx->op_info = "starting index read";
3391
prebuilt->n_rows_fetched = 0;
3392
prebuilt->n_fetch_cached = 0;
3393
prebuilt->fetch_cache_first = 0;
3395
if (prebuilt->sel_graph == NULL) {
3396
/* Build a dummy select query graph */
3397
row_prebuild_sel_graph(prebuilt);
3400
trx->op_info = "fetching rows";
3402
if (prebuilt->n_rows_fetched == 0) {
3403
prebuilt->fetch_direction = direction;
3406
if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3407
if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3409
/* TODO: scrollable cursor: restore cursor to
3410
the place of the latest returned row,
3411
or better: prevent caching for a scroll
3415
prebuilt->n_rows_fetched = 0;
3416
prebuilt->n_fetch_cached = 0;
3417
prebuilt->fetch_cache_first = 0;
3419
} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3420
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3422
prebuilt->n_rows_fetched++;
3429
if (prebuilt->fetch_cache_first > 0
3430
&& prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3432
/* The previous returned row was popped from the fetch
3433
cache, but the cache was not full at the time of the
3434
popping: no more rows can exist in the result set */
3436
err = DB_RECORD_NOT_FOUND;
3440
prebuilt->n_rows_fetched++;
3442
if (prebuilt->n_rows_fetched > 1000000000) {
3443
/* Prevent wrap-over */
3444
prebuilt->n_rows_fetched = 500000000;
3447
mode = pcur->search_mode;
3450
/* In a search where at most one record in the index may match, we
3451
can use a LOCK_REC_NOT_GAP type record lock when locking a
3452
non-delete-marked matching record.
3454
Note that in a unique secondary index there may be different
3455
delete-marked versions of a record where only the primary key
3456
values differ: thus in a secondary index we must use next-key
3457
locks when locking delete-marked records. */
3459
if (match_mode == ROW_SEL_EXACT
3460
&& dict_index_is_unique(index)
3461
&& dtuple_get_n_fields(search_tuple)
3462
== dict_index_get_n_unique(index)
3463
&& (dict_index_is_clust(index)
3464
|| !dtuple_contains_null(search_tuple))) {
3466
/* Note above that a UNIQUE secondary index can contain many
3467
rows with the same key value if one of the columns is the SQL
3468
null. A clustered index under MySQL can never contain null
3469
columns because we demand that all the columns in primary key
3472
unique_search = TRUE;
3474
/* Even if the condition is unique, MySQL seems to try to
3475
retrieve also a second row if a primary key contains more than
3478
if (UNIV_UNLIKELY(direction != 0)) {
3480
err = DB_RECORD_NOT_FOUND;
3487
/*-------------------------------------------------------------*/
3488
/* PHASE 2: Try fast adaptive hash index search if possible */
3490
/* Next test if this is the special case where we can use the fast
3491
adaptive hash index to try the search. Since we must release the
3492
search system latch when we retrieve an externally stored field, we
3493
cannot use the adaptive hash index in a search in the case the row
3494
may be long and there may be externally stored fields */
3496
if (UNIV_UNLIKELY(direction == 0)
3498
&& dict_index_is_clust(index)
3499
&& !prebuilt->templ_contains_blob
3500
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3504
unique_search_from_clust_index = TRUE;
3506
if (trx->mysql_n_tables_locked == 0
3507
&& prebuilt->select_lock_type == LOCK_NONE
3508
&& trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3509
&& trx->read_view) {
3511
/* This is a SELECT query done as a consistent read,
3512
and the read view has already been allocated:
3513
let us try a search shortcut through the hash
3515
NOTE that we must also test that
3516
mysql_n_tables_locked == 0, because this might
3517
also be INSERT INTO ... SELECT ... or
3518
CREATE TABLE ... SELECT ... . Our algorithm is
3519
NOT prepared to inserts interleaved with the SELECT,
3520
and if we try that, we can deadlock on the adaptive
3521
hash index semaphore! */
3523
#ifndef UNIV_SEARCH_DEBUG
3524
if (!trx->has_search_latch) {
3525
rw_lock_s_lock(&btr_search_latch);
3526
trx->has_search_latch = TRUE;
3529
switch (row_sel_try_search_shortcut_for_mysql(
3530
&rec, prebuilt, &offsets, &heap,
3533
#ifdef UNIV_SEARCH_DEBUG
3534
ut_a(0 == cmp_dtuple_rec(search_tuple,
3537
/* At this point, rec is protected by
3538
a page latch that was acquired by
3539
row_sel_try_search_shortcut_for_mysql().
3540
The latch will not be released until
3541
mtr_commit(&mtr). */
3543
if (!row_sel_store_mysql_rec(buf, prebuilt,
3545
err = DB_TOO_BIG_RECORD;
3547
/* We let the main loop to do the
3549
goto shortcut_fails_too_big_rec;
3554
/* ut_print_name(stderr, index->name);
3555
fputs(" shortcut\n", stderr); */
3560
goto release_search_latch_if_needed;
3565
/* ut_print_name(stderr, index->name);
3566
fputs(" record not found 2\n", stderr); */
3568
err = DB_RECORD_NOT_FOUND;
3569
release_search_latch_if_needed:
3570
if (trx->search_latch_timeout > 0
3571
&& trx->has_search_latch) {
3573
trx->search_latch_timeout--;
3575
rw_lock_s_unlock(&btr_search_latch);
3576
trx->has_search_latch = FALSE;
3579
/* NOTE that we do NOT store the cursor
3589
shortcut_fails_too_big_rec:
3595
/*-------------------------------------------------------------*/
3596
/* PHASE 3: Open or restore index cursor position */
3598
if (trx->has_search_latch) {
3599
rw_lock_s_unlock(&btr_search_latch);
3600
trx->has_search_latch = FALSE;
3603
trx_start_if_not_started(trx);
3605
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3606
&& prebuilt->select_lock_type != LOCK_NONE
3607
&& trx->mysql_thd != NULL
3608
&& thd_is_select(trx->mysql_thd)) {
3609
/* It is a plain locking SELECT and the isolation
3610
level is low: do not lock gaps */
3612
set_also_gap_locks = FALSE;
3615
/* Note that if the search mode was GE or G, then the cursor
3616
naturally moves upward (in fetch next) in alphabetical order,
3617
otherwise downward */
3619
if (UNIV_UNLIKELY(direction == 0)) {
3620
if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3623
} else if (direction == ROW_SEL_NEXT) {
3627
thr = que_fork_get_first_thr(prebuilt->sel_graph);
3629
que_thr_move_to_run_state_for_mysql(thr, trx);
3631
clust_index = dict_table_get_first_index(index->table);
3633
if (UNIV_LIKELY(direction != 0)) {
3634
ibool need_to_process = sel_restore_position_for_mysql(
3635
&same_user_rec, BTR_SEARCH_LEAF,
3636
pcur, moves_up, &mtr);
3638
if (UNIV_UNLIKELY(need_to_process)) {
3639
if (UNIV_UNLIKELY(prebuilt->row_read_type
3640
== ROW_READ_DID_SEMI_CONSISTENT)) {
3641
/* We did a semi-consistent read,
3642
but the record was removed in
3644
prebuilt->row_read_type
3645
= ROW_READ_TRY_SEMI_CONSISTENT;
3647
} else if (UNIV_LIKELY(prebuilt->row_read_type
3648
!= ROW_READ_DID_SEMI_CONSISTENT)) {
3650
/* The cursor was positioned on the record
3651
that we returned previously. If we need
3652
to repeat a semi-consistent read as a
3653
pessimistic locking read, the record
3654
cannot be skipped. */
3659
} else if (dtuple_get_n_fields(search_tuple) > 0) {
3661
btr_pcur_open_with_no_init(index, search_tuple, mode,
3665
pcur->trx_if_known = trx;
3667
rec = btr_pcur_get_rec(pcur);
3670
&& !page_rec_is_supremum(rec)
3671
&& set_also_gap_locks
3672
&& !(srv_locks_unsafe_for_binlog
3673
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
3674
&& prebuilt->select_lock_type != LOCK_NONE) {
3676
/* Try to place a gap lock on the next index record
3677
to prevent phantoms in ORDER BY ... DESC queries */
3678
const rec_t* next = page_rec_get_next_const(rec);
3680
offsets = rec_get_offsets(next, index, offsets,
3681
ULINT_UNDEFINED, &heap);
3682
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3683
next, index, offsets,
3684
prebuilt->select_lock_type,
3687
if (err != DB_SUCCESS) {
3689
goto lock_wait_or_error;
3693
if (mode == PAGE_CUR_G) {
3694
btr_pcur_open_at_index_side(
3695
TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3697
} else if (mode == PAGE_CUR_L) {
3698
btr_pcur_open_at_index_side(
3699
FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3704
if (!prebuilt->sql_stat_start) {
3705
/* No need to set an intention lock or assign a read view */
3707
if (trx->read_view == NULL
3708
&& prebuilt->select_lock_type == LOCK_NONE) {
3710
fputs("InnoDB: Error: MySQL is trying to"
3711
" perform a consistent read\n"
3712
"InnoDB: but the read view is not assigned!\n",
3714
trx_print(stderr, trx, 600);
3715
fputc('\n', stderr);
3718
} else if (prebuilt->select_lock_type == LOCK_NONE) {
3719
/* This is a consistent read */
3720
/* Assign a read view for the query */
3722
trx_assign_read_view(trx);
3723
prebuilt->sql_stat_start = FALSE;
3726
if (prebuilt->select_lock_type == LOCK_S) {
3727
lock_mode = LOCK_IS;
3729
lock_mode = LOCK_IX;
3731
err = lock_table(0, index->table, lock_mode, thr);
3733
if (err != DB_SUCCESS) {
3735
goto lock_wait_or_error;
3737
prebuilt->sql_stat_start = FALSE;
3741
/*-------------------------------------------------------------*/
3742
/* PHASE 4: Look for matching records in a loop */
3744
rec = btr_pcur_get_rec(pcur);
3745
ut_ad(!!page_rec_is_comp(rec) == comp);
3746
#ifdef UNIV_SEARCH_DEBUG
3748
fputs("Using ", stderr);
3749
dict_index_name_print(stderr, index);
3750
fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3751
page_get_page_no(page_align(rec)));
3754
#endif /* UNIV_SEARCH_DEBUG */
3756
if (page_rec_is_infimum(rec)) {
3758
/* The infimum record on a page cannot be in the result set,
3759
and neither can a record lock be placed on it: we skip such
3765
if (page_rec_is_supremum(rec)) {
3767
if (set_also_gap_locks
3768
&& !(srv_locks_unsafe_for_binlog
3769
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
3770
&& prebuilt->select_lock_type != LOCK_NONE) {
3772
/* Try to place a lock on the index record */
3774
/* If innodb_locks_unsafe_for_binlog option is used
3775
or this session is using a READ COMMITTED isolation
3776
level we do not lock gaps. Supremum record is really
3777
a gap and therefore we do not set locks there. */
3779
offsets = rec_get_offsets(rec, index, offsets,
3780
ULINT_UNDEFINED, &heap);
3781
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3782
rec, index, offsets,
3783
prebuilt->select_lock_type,
3784
LOCK_ORDINARY, thr);
3786
if (err != DB_SUCCESS) {
3788
goto lock_wait_or_error;
3791
/* A page supremum record cannot be in the result set: skip
3792
it now that we have placed a possible lock on it */
3797
/*-------------------------------------------------------------*/
3798
/* Do sanity checks in case our cursor has bumped into page
3802
next_offs = rec_get_next_offs(rec, TRUE);
3803
if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3808
next_offs = rec_get_next_offs(rec, FALSE);
3809
if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
3815
if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
3818
if (srv_force_recovery == 0 || moves_up == FALSE) {
3819
ut_print_timestamp(stderr);
3820
buf_page_print(page_align(rec), 0);
3822
"\nInnoDB: rec address %p,"
3823
" buf block fix count %lu\n",
3824
(void*) rec, (ulong)
3825
btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
3826
->page.buf_fix_count);
3828
"InnoDB: Index corruption: rec offs %lu"
3829
" next offs %lu, page no %lu,\n"
3831
(ulong) page_offset(rec),
3833
(ulong) page_get_page_no(page_align(rec)));
3834
dict_index_name_print(stderr, trx, index);
3835
fputs(". Run CHECK TABLE. You may need to\n"
3836
"InnoDB: restore from a backup, or"
3837
" dump + drop + reimport the table.\n",
3840
err = DB_CORRUPTION;
3842
goto lock_wait_or_error;
3844
/* The user may be dumping a corrupt table. Jump
3845
over the corruption to recover as much as possible. */
3848
"InnoDB: Index corruption: rec offs %lu"
3849
" next offs %lu, page no %lu,\n"
3851
(ulong) page_offset(rec),
3853
(ulong) page_get_page_no(page_align(rec)));
3854
dict_index_name_print(stderr, trx, index);
3855
fputs(". We try to skip the rest of the page.\n",
3858
btr_pcur_move_to_last_on_page(pcur, &mtr);
3863
/*-------------------------------------------------------------*/
3865
/* Calculate the 'offsets' associated with 'rec' */
3867
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3869
if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
3870
if (!rec_validate(rec, offsets)
3871
|| !btr_index_rec_validate(rec, index, FALSE)) {
3873
"InnoDB: Index corruption: rec offs %lu"
3874
" next offs %lu, page no %lu,\n"
3876
(ulong) page_offset(rec),
3878
(ulong) page_get_page_no(page_align(rec)));
3879
dict_index_name_print(stderr, trx, index);
3880
fputs(". We try to skip the record.\n",
3887
/* Note that we cannot trust the up_match value in the cursor at this
3888
place because we can arrive here after moving the cursor! Thus
3889
we have to recompare rec and search_tuple to determine if they
3892
if (match_mode == ROW_SEL_EXACT) {
3893
/* Test if the index record matches completely to search_tuple
3894
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
3896
/* fputs("Comparing rec and search tuple\n", stderr); */
3898
if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
3900
if (set_also_gap_locks
3901
&& !(srv_locks_unsafe_for_binlog
3902
|| trx->isolation_level
3903
== TRX_ISO_READ_COMMITTED)
3904
&& prebuilt->select_lock_type != LOCK_NONE) {
3906
/* Try to place a gap lock on the index
3907
record only if innodb_locks_unsafe_for_binlog
3908
option is not set or this session is not
3909
using a READ COMMITTED isolation level. */
3911
err = sel_set_rec_lock(
3912
btr_pcur_get_block(pcur),
3913
rec, index, offsets,
3914
prebuilt->select_lock_type, LOCK_GAP,
3917
if (err != DB_SUCCESS) {
3919
goto lock_wait_or_error;
3923
btr_pcur_store_position(pcur, &mtr);
3925
err = DB_RECORD_NOT_FOUND;
3926
/* ut_print_name(stderr, index->name);
3927
fputs(" record not found 3\n", stderr); */
3932
} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
3934
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
3936
if (set_also_gap_locks
3937
&& !(srv_locks_unsafe_for_binlog
3938
|| trx->isolation_level
3939
== TRX_ISO_READ_COMMITTED)
3940
&& prebuilt->select_lock_type != LOCK_NONE) {
3942
/* Try to place a gap lock on the index
3943
record only if innodb_locks_unsafe_for_binlog
3944
option is not set or this session is not
3945
using a READ COMMITTED isolation level. */
3947
err = sel_set_rec_lock(
3948
btr_pcur_get_block(pcur),
3949
rec, index, offsets,
3950
prebuilt->select_lock_type, LOCK_GAP,
3953
if (err != DB_SUCCESS) {
3955
goto lock_wait_or_error;
3959
btr_pcur_store_position(pcur, &mtr);
3961
err = DB_RECORD_NOT_FOUND;
3962
/* ut_print_name(stderr, index->name);
3963
fputs(" record not found 4\n", stderr); */
3969
/* We are ready to look at a possible new index entry in the result
3970
set: the cursor is now placed on a user record */
3972
if (prebuilt->select_lock_type != LOCK_NONE) {
3973
/* Try to place a lock on the index record; note that delete
3974
marked records are a special case in a unique search. If there
3975
is a non-delete marked record, then it is enough to lock its
3976
existence with LOCK_REC_NOT_GAP. */
3978
/* If innodb_locks_unsafe_for_binlog option is used
3979
or this session is using a READ COMMITED isolation
3980
level we lock only the record, i.e., next-key locking is
3985
if (!set_also_gap_locks
3986
|| srv_locks_unsafe_for_binlog
3987
|| trx->isolation_level == TRX_ISO_READ_COMMITTED
3989
&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
3993
lock_type = LOCK_ORDINARY;
3996
/* If we are doing a 'greater or equal than a primary key
3997
value' search from a clustered index, and we find a record
3998
that has that exact primary key value, then there is no need
3999
to lock the gap before the record, because no insert in the
4000
gap can be in our search range. That is, no phantom row can
4003
An example: if col1 is the primary key, the search is WHERE
4004
col1 >= 100, and we find a record where col1 = 100, then no
4005
need to lock the gap before that record. */
4007
if (index == clust_index
4008
&& mode == PAGE_CUR_GE
4010
&& dtuple_get_n_fields_cmp(search_tuple)
4011
== dict_index_get_n_unique(index)
4012
&& 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4014
lock_type = LOCK_REC_NOT_GAP;
4017
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4018
rec, index, offsets,
4019
prebuilt->select_lock_type,
4023
const rec_t* old_vers;
4025
if (srv_locks_unsafe_for_binlog
4026
|| trx->isolation_level == TRX_ISO_READ_COMMITTED) {
4027
/* Note that a record of
4028
prebuilt->index was locked. */
4029
prebuilt->new_rec_locks = 1;
4033
if (UNIV_LIKELY(prebuilt->row_read_type
4034
!= ROW_READ_TRY_SEMI_CONSISTENT)
4035
|| index != clust_index) {
4037
goto lock_wait_or_error;
4040
/* The following call returns 'offsets'
4041
associated with 'old_vers' */
4042
err = row_sel_build_committed_vers_for_mysql(
4043
clust_index, prebuilt, rec,
4044
&offsets, &heap, &old_vers, &mtr);
4046
if (err != DB_SUCCESS) {
4048
goto lock_wait_or_error;
4051
mutex_enter(&kernel_mutex);
4052
if (trx->was_chosen_as_deadlock_victim) {
4053
mutex_exit(&kernel_mutex);
4056
goto lock_wait_or_error;
4058
if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4059
lock_cancel_waiting_and_release(
4061
prebuilt->new_rec_locks = 0;
4063
mutex_exit(&kernel_mutex);
4065
/* The lock was granted while we were
4066
searching for the last committed version.
4067
Do a normal locking read. */
4069
offsets = rec_get_offsets(rec, index, offsets,
4073
/* Note that a record of
4074
prebuilt->index was locked. */
4075
prebuilt->new_rec_locks = 1;
4078
mutex_exit(&kernel_mutex);
4080
if (old_vers == NULL) {
4081
/* The row was not yet committed */
4086
did_semi_consistent_read = TRUE;
4091
goto lock_wait_or_error;
4094
/* This is a non-locking consistent read: if necessary, fetch
4095
a previous version of the record */
4097
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4099
/* Do nothing: we let a non-locking SELECT read the
4100
latest version of the record */
4102
} else if (index == clust_index) {
4104
/* Fetch a previous version of the row if the current
4105
one is not visible in the snapshot; if we have a very
4106
high force recovery level set, we try to avoid crashes
4107
by skipping this lookup */
4109
if (UNIV_LIKELY(srv_force_recovery < 5)
4110
&& !lock_clust_rec_cons_read_sees(
4111
rec, index, offsets, trx->read_view)) {
4114
/* The following call returns 'offsets'
4115
associated with 'old_vers' */
4116
err = row_sel_build_prev_vers_for_mysql(
4117
trx->read_view, clust_index,
4118
prebuilt, rec, &offsets, &heap,
4121
if (err != DB_SUCCESS) {
4123
goto lock_wait_or_error;
4126
if (old_vers == NULL) {
4127
/* The row did not exist yet in
4135
} else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) {
4136
/* We are looking into a non-clustered index,
4137
and to get the right version of the record we
4138
have to look also into the clustered index: this
4139
is necessary, because we can only get the undo
4140
information via the clustered index record. */
4142
ut_ad(index != clust_index);
4143
goto requires_clust_rec;
4147
/* NOTE that at this point rec can be an old version of a clustered
4148
index record built for a consistent read. We cannot assume after this
4149
point that rec is on a buffer pool page. Functions like
4150
page_rec_is_comp() cannot be used! */
4152
if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4154
/* The record is delete-marked: we can skip it */
4156
if ((srv_locks_unsafe_for_binlog
4157
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4158
&& prebuilt->select_lock_type != LOCK_NONE
4159
&& !did_semi_consistent_read) {
4161
/* No need to keep a lock on a delete-marked record
4162
if we do not want to use next-key locking. */
4164
row_unlock_for_mysql(prebuilt, TRUE);
4167
/* This is an optimization to skip setting the next key lock
4168
on the record that follows this delete-marked record. This
4169
optimization works because of the unique search criteria
4170
which precludes the presence of a range lock between this
4171
delete marked record and the record following it.
4173
For now this is applicable only to clustered indexes while
4174
doing a unique search. There is scope for further optimization
4175
applicable to unique secondary indexes. Current behaviour is
4176
to widen the scope of a lock on an already delete marked record
4177
if the same record is deleted twice by the same transaction */
4178
if (index == clust_index && unique_search) {
4179
err = DB_RECORD_NOT_FOUND;
4187
/* Get the clustered index record if needed, if we did not do the
4188
search using the clustered index. */
4190
if (index != clust_index && prebuilt->need_to_access_clustered) {
4193
/* We use a 'goto' to the preceding label if a consistent
4194
read of a secondary index record requires us to look up old
4195
versions of the associated clustered index record. */
4197
ut_ad(rec_offs_validate(rec, index, offsets));
4199
/* It was a non-clustered index and we must fetch also the
4200
clustered index record */
4202
mtr_has_extra_clust_latch = TRUE;
4204
/* The following call returns 'offsets' associated with
4205
'clust_rec'. Note that 'clust_rec' can be an old version
4206
built for a consistent read. */
4208
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4210
&offsets, &heap, &mtr);
4211
if (err != DB_SUCCESS) {
4213
goto lock_wait_or_error;
4216
if (clust_rec == NULL) {
4217
/* The record did not exist in the read view */
4218
ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4223
if ((srv_locks_unsafe_for_binlog
4224
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4225
&& prebuilt->select_lock_type != LOCK_NONE) {
4226
/* Note that both the secondary index record
4227
and the clustered index record were locked. */
4228
ut_ad(prebuilt->new_rec_locks == 1);
4229
prebuilt->new_rec_locks = 2;
4232
if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4234
/* The record is delete marked: we can skip it */
4236
if ((srv_locks_unsafe_for_binlog
4237
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4238
&& prebuilt->select_lock_type != LOCK_NONE) {
4240
/* No need to keep a lock on a delete-marked
4241
record if we do not want to use next-key
4244
row_unlock_for_mysql(prebuilt, TRUE);
4250
if (prebuilt->need_to_access_clustered) {
4252
result_rec = clust_rec;
4254
ut_ad(rec_offs_validate(result_rec, clust_index,
4257
/* We used 'offsets' for the clust rec, recalculate
4259
offsets = rec_get_offsets(rec, index, offsets,
4260
ULINT_UNDEFINED, &heap);
4267
/* We found a qualifying record 'result_rec'. At this point,
4268
'offsets' are associated with 'result_rec'. */
4270
ut_ad(rec_offs_validate(result_rec,
4271
result_rec != rec ? clust_index : index,
4274
/* At this point, the clustered index record is protected
4275
by a page latch that was acquired when pcur was positioned.
4276
The latch will not be released until mtr_commit(&mtr). */
4278
if ((match_mode == ROW_SEL_EXACT
4279
|| prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4280
&& prebuilt->select_lock_type == LOCK_NONE
4281
&& !prebuilt->templ_contains_blob
4282
&& !prebuilt->clust_index_was_generated
4283
&& prebuilt->template_type
4284
!= ROW_MYSQL_DUMMY_TEMPLATE) {
4286
/* Inside an update, for example, we do not cache rows,
4287
since we may use the cursor position to do the actual
4288
update, that is why we require ...lock_type == LOCK_NONE.
4289
Since we keep space in prebuilt only for the BLOBs of
4290
a single row, we cannot cache rows in the case there
4291
are BLOBs in the fields to be fetched. In HANDLER we do
4292
not cache rows because there the cursor is a scrollable
4295
row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4297
if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
4304
if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
4305
memcpy(buf + 4, result_rec
4306
- rec_offs_extra_size(offsets),
4307
rec_offs_size(offsets));
4308
mach_write_to_4(buf,
4309
rec_offs_extra_size(offsets) + 4);
4311
if (!row_sel_store_mysql_rec(buf, prebuilt,
4312
result_rec, offsets)) {
4313
err = DB_TOO_BIG_RECORD;
4315
goto lock_wait_or_error;
4319
if (prebuilt->clust_index_was_generated) {
4320
if (result_rec != rec) {
4321
offsets = rec_get_offsets(
4322
rec, index, offsets, ULINT_UNDEFINED,
4325
row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4330
/* From this point on, 'offsets' are invalid. */
4333
/* We have an optimization to save CPU time: if this is a consistent
4334
read on a unique condition on the clustered index, then we do not
4335
store the pcur position, because any fetch next or prev will anyway
4336
return 'end of file'. Exceptions are locking reads and the MySQL
4337
HANDLER command where the user can move the cursor with PREV or NEXT
4338
even after a unique search. */
4340
if (!unique_search_from_clust_index
4341
|| prebuilt->select_lock_type != LOCK_NONE) {
4343
/* Inside an update always store the cursor position */
4345
btr_pcur_store_position(pcur, &mtr);
4353
/* Reset the old and new "did semi-consistent read" flags. */
4354
if (UNIV_UNLIKELY(prebuilt->row_read_type
4355
== ROW_READ_DID_SEMI_CONSISTENT)) {
4356
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4358
did_semi_consistent_read = FALSE;
4359
prebuilt->new_rec_locks = 0;
4361
/*-------------------------------------------------------------*/
4362
/* PHASE 5: Move the cursor to the next index record */
4364
if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4365
/* We must commit mtr if we are moving to the next
4366
non-clustered index record, because we could break the
4367
latching order if we would access a different clustered
4368
index page right away without releasing the previous. */
4370
btr_pcur_store_position(pcur, &mtr);
4373
mtr_has_extra_clust_latch = FALSE;
4376
if (sel_restore_position_for_mysql(&same_user_rec,
4378
pcur, moves_up, &mtr)) {
4379
#ifdef UNIV_SEARCH_DEBUG
4381
#endif /* UNIV_SEARCH_DEBUG */
4388
if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4390
btr_pcur_store_position(pcur, &mtr);
4392
if (match_mode != 0) {
4393
err = DB_RECORD_NOT_FOUND;
4395
err = DB_END_OF_INDEX;
4401
if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4406
#ifdef UNIV_SEARCH_DEBUG
4408
#endif /* UNIV_SEARCH_DEBUG */
4413
/* Reset the old and new "did semi-consistent read" flags. */
4414
if (UNIV_UNLIKELY(prebuilt->row_read_type
4415
== ROW_READ_DID_SEMI_CONSISTENT)) {
4416
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4418
did_semi_consistent_read = FALSE;
4420
/*-------------------------------------------------------------*/
4422
btr_pcur_store_position(pcur, &mtr);
4425
mtr_has_extra_clust_latch = FALSE;
4427
trx->error_state = err;
4429
/* The following is a patch for MySQL */
4431
que_thr_stop_for_mysql(thr);
4433
thr->lock_state = QUE_THR_LOCK_ROW;
4435
if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
4436
/* It was a lock wait, and it ended */
4438
thr->lock_state = QUE_THR_LOCK_NOLOCK;
4441
sel_restore_position_for_mysql(&same_user_rec,
4442
BTR_SEARCH_LEAF, pcur,
4445
if ((srv_locks_unsafe_for_binlog
4446
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4447
&& !same_user_rec) {
4449
/* Since we were not able to restore the cursor
4450
on the same user record, we cannot use
4451
row_unlock_for_mysql() to unlock any records, and
4452
we must thus reset the new rec lock info. Since
4453
in lock0lock.c we have blocked the inheriting of gap
4454
X-locks, we actually do not have any new record locks
4457
Note that if we were able to restore on the 'same'
4458
user record, it is still possible that we were actually
4459
waiting on a delete-marked record, and meanwhile
4460
it was removed by purge and inserted again by some
4461
other user. But that is no problem, because in
4462
rec_loop we will again try to set a lock, and
4463
new_rec_lock_info in trx will be right at the end. */
4465
prebuilt->new_rec_locks = 0;
4468
mode = pcur->search_mode;
4473
thr->lock_state = QUE_THR_LOCK_NOLOCK;
4475
#ifdef UNIV_SEARCH_DEBUG
4476
/* fputs("Using ", stderr);
4477
dict_index_name_print(stderr, index);
4478
fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4479
#endif /* UNIV_SEARCH_DEBUG */
4483
/*-------------------------------------------------------------*/
4484
que_thr_stop_for_mysql_no_error(thr, trx);
4488
if (prebuilt->n_fetch_cached > 0) {
4489
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
4494
#ifdef UNIV_SEARCH_DEBUG
4495
/* fputs("Using ", stderr);
4496
dict_index_name_print(stderr, index);
4497
fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4498
#endif /* UNIV_SEARCH_DEBUG */
4499
if (err == DB_SUCCESS) {
4505
if (UNIV_LIKELY_NULL(heap)) {
4506
mem_heap_free(heap);
4509
/* Set or reset the "did semi-consistent read" flag on return.
4510
The flag did_semi_consistent_read is set if and only if
4511
the record being returned was fetched with a semi-consistent read. */
4512
ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
4513
|| !did_semi_consistent_read);
4515
if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
4516
if (UNIV_UNLIKELY(did_semi_consistent_read)) {
4517
prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
4519
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4525
/*******************************************************************//**
4526
Checks if MySQL at the moment is allowed for this table to retrieve a
4527
consistent read result, or store it to the query cache.
4528
@return TRUE if storing or retrieving from the query cache is permitted */
4531
row_search_check_if_query_cache_permitted(
4532
/*======================================*/
4533
trx_t* trx, /*!< in: transaction object */
4534
const char* norm_name) /*!< in: concatenation of database name,
4535
'/' char, table name */
4537
dict_table_t* table;
4540
table = dict_table_get(norm_name, FALSE);
4542
if (table == NULL) {
4547
mutex_enter(&kernel_mutex);
4549
/* Start the transaction if it is not started yet */
4551
trx_start_if_not_started_low(trx);
4553
/* If there are locks on the table or some trx has invalidated the
4554
cache up to our trx id, then ret = FALSE.
4555
We do not check what type locks there are on the table, though only
4556
IX type locks actually would require ret = FALSE. */
4558
if (UT_LIST_GET_LEN(table->locks) == 0
4559
&& ut_dulint_cmp(trx->id,
4560
table->query_cache_inv_trx_id) >= 0) {
4564
/* If the isolation level is high, assign a read view for the
4565
transaction if it does not yet have one */
4567
if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
4568
&& !trx->read_view) {
4570
trx->read_view = read_view_open_now(
4571
trx->id, trx->global_read_view_heap);
4572
trx->global_read_view = trx->read_view;
4576
mutex_exit(&kernel_mutex);
4581
/*******************************************************************//**
4582
Read the AUTOINC column from the current row. If the value is less than
4583
0 and the type is not unsigned then we reset the value to 0.
4584
@return value read from the column */
4587
row_search_autoinc_read_column(
4588
/*===========================*/
4589
dict_index_t* index, /*!< in: index to read from */
4590
const rec_t* rec, /*!< in: current rec */
4591
ulint col_no, /*!< in: column number */
4592
ibool unsigned_type) /*!< in: signed or unsigned flag */
4597
mem_heap_t* heap = NULL;
4598
ulint offsets_[REC_OFFS_NORMAL_SIZE];
4599
ulint* offsets = offsets_;
4601
rec_offs_init(offsets_);
4603
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4605
data = rec_get_nth_field(rec, offsets, col_no, &len);
4607
ut_a(len != UNIV_SQL_NULL);
4608
ut_a(len <= sizeof value);
4610
/* we assume AUTOINC value cannot be negative */
4611
value = mach_read_int_type(data, len, unsigned_type);
4613
if (UNIV_LIKELY_NULL(heap)) {
4614
mem_heap_free(heap);
4617
if (!unsigned_type && (ib_int64_t) value < 0) {
4624
/*******************************************************************//**
4626
@return current rec or NULL */
4629
row_search_autoinc_get_rec(
4630
/*=======================*/
4631
btr_pcur_t* pcur, /*!< in: the current cursor */
4632
mtr_t* mtr) /*!< in: mini transaction */
4635
const rec_t* rec = btr_pcur_get_rec(pcur);
4637
if (page_rec_is_user_rec(rec)) {
4640
} while (btr_pcur_move_to_prev(pcur, mtr));
4645
/*******************************************************************//**
4646
Read the max AUTOINC value from an index.
4647
@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
4648
column name can't be found in index */
4651
row_search_max_autoinc(
4652
/*===================*/
4653
dict_index_t* index, /*!< in: index to search */
4654
const char* col_name, /*!< in: name of autoinc column */
4655
ib_uint64_t* value) /*!< out: AUTOINC value read */
4659
dict_field_t* dfield = NULL;
4660
ulint error = DB_SUCCESS;
4662
n_cols = dict_index_get_n_ordering_defined_by_user(index);
4664
/* Search the index for the AUTOINC column name */
4665
for (i = 0; i < n_cols; ++i) {
4666
dfield = dict_index_get_nth_field(index, i);
4668
if (strcmp(col_name, dfield->name) == 0) {
4675
/* Must find the AUTOINC column name */
4676
if (i < n_cols && dfield) {
4682
/* Open at the high/right end (FALSE), and INIT
4684
btr_pcur_open_at_index_side(
4685
FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
4687
if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
4690
rec = row_search_autoinc_get_rec(&pcur, &mtr);
4693
ibool unsigned_type = (
4694
dfield->col->prtype & DATA_UNSIGNED);
4696
*value = row_search_autoinc_read_column(
4697
index, rec, i, unsigned_type);
4701
btr_pcur_close(&pcur);
4705
error = DB_RECORD_NOT_FOUND;