/*****************************************************************************
Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/

/***************************************************//**
Created 12/19/1997 Heikki Tuuri
*******************************************************/

#include "dict0dict.h"
#include "dict0boot.h"
#include "mach0data.h"
#include "lock0lock.h"
#include "eval0eval.h"
#include "pars0pars.h"
#include "row0mysql.h"
#include "read0read.h"
#include "ha_prototypes.h"

/* Maximum number of rows to prefetch; MySQL interface has another parameter */
#define SEL_MAX_N_PREFETCH 16

/* Number of rows fetched, after which to start prefetching; MySQL interface
has another parameter */
#define SEL_PREFETCH_LIMIT 1

/* When a select has accessed about this many pages, it returns control back
to que_run_threads: this is to allow canceling runaway queries */
#define SEL_COST_LIMIT 100

/* Flags for search shortcut */
#define SEL_EXHAUSTED 1
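/* SEL_EXHAUSTED and the other SEL_ shortcut codes are used as the return
values of row_sel_try_search_shortcut() below. */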
/********************************************************************//**
Returns TRUE if the user-defined column in a secondary index record
is alphabetically the same as the corresponding BLOB column in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the columns are equal */
row_sel_sec_rec_is_for_blob(
/*========================*/
ulint mtype, /*!< in: main type */
ulint prtype, /*!< in: precise type */
ulint mbminmaxlen, /*!< in: minimum and maximum length of
a multi-byte character */
const byte* clust_field, /*!< in: the locally stored part of
the clustered index column, including
the BLOB pointer; the clustered
index record must be covered by
a lock or a page latch to protect it
against deletion (rollback or purge) */
ulint clust_len, /*!< in: length of clust_field */
const byte* sec_field, /*!< in: column in secondary index */
ulint sec_len, /*!< in: length of sec_field */
ulint zip_size) /*!< in: compressed page size, or 0 */
byte buf[DICT_MAX_INDEX_COL_LEN];
len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
clust_field, clust_len);
if (UNIV_UNLIKELY(len == 0)) {
/* The BLOB was being deleted as the server crashed.
There should not be any secondary index records
referring to this clustered index record, because
btr_free_externally_stored_field() is called after all
secondary index entries of the row have been purged. */
len = dtype_get_at_most_n_mbchars(prtype, mbminmaxlen,
sec_len, len, (const char*) buf);
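/* cmp_data_data() returns 0 when the two values compare equal, so the
negation below yields TRUE exactly when the columns match. */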
return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
/********************************************************************//**
Returns TRUE if the user-defined column values in a secondary index record
are alphabetically the same as the corresponding columns in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation!
@return TRUE if the secondary record is equal to the corresponding
fields in the clustered record, when compared with collation;
FALSE if not equal or if the clustered record has been marked for deletion */
row_sel_sec_rec_is_for_clust_rec(
/*=============================*/
const rec_t* sec_rec, /*!< in: secondary index record */
dict_index_t* sec_index, /*!< in: secondary index */
const rec_t* clust_rec, /*!< in: clustered index record;
must be protected by a lock or
a page latch against deletion
in rollback or purge */
dict_index_t* clust_index) /*!< in: clustered index */
const byte* sec_field;
const byte* clust_field;
mem_heap_t* heap = NULL;
ulint clust_offsets_[REC_OFFS_NORMAL_SIZE];
ulint sec_offsets_[REC_OFFS_SMALL_SIZE];
ulint* clust_offs = clust_offsets_;
ulint* sec_offs = sec_offsets_;
ibool is_equal = TRUE;
rec_offs_init(clust_offsets_);
rec_offs_init(sec_offsets_);
if (rec_get_deleted_flag(clust_rec,
dict_table_is_comp(clust_index->table))) {
/* The clustered index record is delete-marked;
it is not visible in the read view. Besides,
if there are any externally stored columns,
some of them may have already been purged. */
clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
ULINT_UNDEFINED, &heap);
sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
ULINT_UNDEFINED, &heap);
n = dict_index_get_n_ordering_defined_by_user(sec_index);
for (i = 0; i < n; i++) {
const dict_field_t* ifield;
const dict_col_t* col;
ifield = dict_index_get_nth_field(sec_index, i);
col = dict_field_get_col(ifield);
clust_pos = dict_col_get_clust_pos(col, clust_index);
clust_field = rec_get_nth_field(
clust_rec, clust_offs, clust_pos, &clust_len);
sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {
if (rec_offs_nth_extern(clust_offs, clust_pos)) {
len -= BTR_EXTERN_FIELD_REF_SIZE;
len = dtype_get_at_most_n_mbchars(
col->prtype, col->mbminmaxlen,
ifield->prefix_len, len, (char*) clust_field);
if (rec_offs_nth_extern(clust_offs, clust_pos)
if (!row_sel_sec_rec_is_for_blob(
col->mtype, col->prtype,
clust_field, clust_len,
clust_index->table))) {
if (0 != cmp_data_data(col->mtype, col->prtype,
sec_field, sec_len)) {
if (UNIV_LIKELY_NULL(heap)) {
/*********************************************************************//**
Creates a select node struct.
@return own: select node struct */
mem_heap_t* heap) /*!< in: memory heap where created */
node = mem_heap_alloc(heap, sizeof(sel_node_t));
node->common.type = QUE_NODE_SELECT;
node->state = SEL_NODE_OPEN;
/*********************************************************************//**
Frees the memory private to a select node when a query graph is freed;
it does not free the heap where the node was originally created. */
sel_node_free_private(
/*==================*/
sel_node_t* node) /*!< in: select node struct */
if (node->plans != NULL) {
for (i = 0; i < node->n_tables; i++) {
plan = sel_node_get_nth_plan(node, i);
btr_pcur_close(&(plan->pcur));
btr_pcur_close(&(plan->clust_pcur));
if (plan->old_vers_heap) {
mem_heap_free(plan->old_vers_heap);
/*********************************************************************//**
Evaluates the values in a select list. If there are aggregate functions,
their argument value is added to the aggregate total. */
sel_eval_select_list(
/*=================*/
sel_node_t* node) /*!< in: select node */
exp = node->select_list;
exp = que_node_get_next(exp);
/*********************************************************************//**
Assigns the values in the select list to the possible into-variables in
SELECT ... INTO ... */
sel_assign_into_var_values(
/*=======================*/
sym_node_t* var, /*!< in: first variable in a list of variables */
sel_node_t* node) /*!< in: select node */
exp = node->select_list;
eval_node_copy_val(var->alias, exp);
exp = que_node_get_next(exp);
var = que_node_get_next(var);
/*********************************************************************//**
Resets the aggregate value totals in the select list of an aggregate type
query. */
sel_reset_aggregate_vals(
/*=====================*/
sel_node_t* node) /*!< in: select node */
func_node_t* func_node;
ut_ad(node->is_aggregate);
func_node = node->select_list;
eval_node_set_int_val(func_node, 0);
func_node = que_node_get_next(func_node);
node->aggregate_already_fetched = FALSE;
/*********************************************************************//**
Copies the input variable values when an explicit cursor is opened. */
row_sel_copy_input_variable_vals(
/*=============================*/
sel_node_t* node) /*!< in: select node */
var = UT_LIST_GET_FIRST(node->copy_variables);
eval_node_copy_val(var, var->alias);
var->indirection = NULL;
var = UT_LIST_GET_NEXT(col_var_list, var);
/*********************************************************************//**
Fetches the column values from a record. */
row_sel_fetch_columns(
/*==================*/
dict_index_t* index, /*!< in: record index */
const rec_t* rec, /*!< in: record in a clustered or non-clustered
index; must be protected by a page latch */
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
sym_node_t* column) /*!< in: first column in a column list, or
ut_ad(rec_offs_validate(rec, index, offsets));
if (dict_index_is_clust(index)) {
index_type = SYM_CLUST_FIELD_NO;
index_type = SYM_SEC_FIELD_NO;
mem_heap_t* heap = NULL;
field_no = column->field_nos[index_type];
if (field_no != ULINT_UNDEFINED) {
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
/* Copy an externally stored field to the
temporary heap, if possible. */
heap = mem_heap_create(1);
data = btr_rec_copy_externally_stored_field(
dict_table_zip_size(index->table),
field_no, &len, heap);
/* data == NULL means that the
externally stored field was not
written yet. This record
should only be seen by
recv_recovery_rollback_active() or any
TRX_ISO_READ_UNCOMMITTED
transactions. The InnoDB SQL parser
(the sole caller of this function)
does not implement READ UNCOMMITTED,
and it is not involved during rollback. */
ut_a(len != UNIV_SQL_NULL);
data = rec_get_nth_field(rec, offsets,
needs_copy = column->copy_val;
eval_node_copy_and_alloc_val(column, data,
val = que_node_get_val(column);
dfield_set_data(val, data, len);
if (UNIV_LIKELY_NULL(heap)) {
column = UT_LIST_GET_NEXT(col_var_list, column);
/*********************************************************************//**
Allocates a prefetch buffer for a column the first time prefetch is done. */
sel_col_prefetch_buf_alloc(
/*=======================*/
sym_node_t* column) /*!< in: symbol table node for a column */
ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
* sizeof(sel_buf_t));
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
sel_buf = column->prefetch_buf + i;
sel_buf->data = NULL;
sel_buf->val_buf_size = 0;
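/* An unused slot is marked by data == NULL and val_buf_size == 0;
sel_col_prefetch_buf_free() below only frees slots with val_buf_size > 0. */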
/*********************************************************************//**
Frees a prefetch buffer for a column, including the dynamically allocated
memory for data stored there. */
sel_col_prefetch_buf_free(
/*======================*/
sel_buf_t* prefetch_buf) /*!< in, own: prefetch buffer */
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
sel_buf = prefetch_buf + i;
if (sel_buf->val_buf_size > 0) {
mem_free(sel_buf->data);
/*********************************************************************//**
Pops the column values for a prefetched, cached row from the column prefetch
buffers and places them in the val fields in the column nodes. */
sel_pop_prefetched_row(
/*===================*/
plan_t* plan) /*!< in: plan node for a table */
ut_ad(plan->n_rows_prefetched > 0);
column = UT_LIST_GET_FIRST(plan->columns);
val = que_node_get_val(column);
if (!column->copy_val) {
/* We did not really push any value for the
ut_ad(!column->prefetch_buf);
ut_ad(que_node_get_val_buf_size(column) == 0);
ut_d(dfield_set_null(val));
ut_ad(column->prefetch_buf);
ut_ad(!dfield_is_ext(val));
sel_buf = column->prefetch_buf + plan->first_prefetched;
data = sel_buf->data;
val_buf_size = sel_buf->val_buf_size;
/* We must keep track of the allocated memory for
column values to be able to free it later: therefore
we swap the values for sel_buf and val */
sel_buf->data = dfield_get_data(val);
sel_buf->len = dfield_get_len(val);
sel_buf->val_buf_size = que_node_get_val_buf_size(column);
dfield_set_data(val, data, len);
que_node_set_val_buf_size(column, val_buf_size);
column = UT_LIST_GET_NEXT(col_var_list, column);
plan->n_rows_prefetched--;
plan->first_prefetched++;
/*********************************************************************//**
Pushes the column values for a prefetched, cached row to the column prefetch
buffers from the val fields in the column nodes. */
sel_push_prefetched_row(
/*====================*/
plan_t* plan) /*!< in: plan node for a table */
if (plan->n_rows_prefetched == 0) {
plan->first_prefetched = 0;
pos = plan->n_rows_prefetched;
/* We have the convention that pushing new rows starts only
after the prefetch stack has been emptied: */
ut_ad(plan->first_prefetched == 0);
plan->n_rows_prefetched++;
ut_ad(pos < SEL_MAX_N_PREFETCH);
column = UT_LIST_GET_FIRST(plan->columns);
if (!column->copy_val) {
/* It makes no sense to push pointers to database
page fields when we do not keep a latch on the page! */
if (!column->prefetch_buf) {
/* Allocate a new prefetch buffer */
sel_col_prefetch_buf_alloc(column);
sel_buf = column->prefetch_buf + pos;
val = que_node_get_val(column);
data = dfield_get_data(val);
len = dfield_get_len(val);
val_buf_size = que_node_get_val_buf_size(column);
/* We must keep track of the allocated memory for
column values to be able to free it later: therefore
we swap the values for sel_buf and val */
dfield_set_data(val, sel_buf->data, sel_buf->len);
que_node_set_val_buf_size(column, sel_buf->val_buf_size);
sel_buf->data = data;
sel_buf->val_buf_size = val_buf_size;
column = UT_LIST_GET_NEXT(col_var_list, column);
/*********************************************************************//**
Builds a previous version of a clustered index record for a consistent read
@return DB_SUCCESS or error code */
row_sel_build_prev_vers(
/*====================*/
read_view_t* read_view, /*!< in: read view */
dict_index_t* index, /*!< in: clustered index */
rec_t* rec, /*!< in: record in a clustered index */
ulint** offsets, /*!< in/out: offsets returned by
rec_get_offsets(rec, plan->index) */
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
the offsets are allocated */
mem_heap_t** old_vers_heap, /*!< out: old version heap to use */
rec_t** old_vers, /*!< out: old version, or NULL if the
record does not exist in the view:
i.e., it was freshly inserted
mtr_t* mtr) /*!< in: mtr */
if (*old_vers_heap) {
mem_heap_empty(*old_vers_heap);
*old_vers_heap = mem_heap_create(512);
err = row_vers_build_for_consistent_read(
rec, mtr, index, offsets, read_view, offset_heap,
*old_vers_heap, old_vers);
/*********************************************************************//**
Builds the last committed version of a clustered index record for a
semi-consistent read.
@return DB_SUCCESS or error code */
row_sel_build_committed_vers_for_mysql(
/*===================================*/
dict_index_t* clust_index, /*!< in: clustered index */
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
const rec_t* rec, /*!< in: record in a clustered index */
ulint** offsets, /*!< in/out: offsets returned by
rec_get_offsets(rec, clust_index) */
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
the offsets are allocated */
const rec_t** old_vers, /*!< out: old version, or NULL if the
record does not exist in the view:
i.e., it was freshly inserted
mtr_t* mtr) /*!< in: mtr */
if (prebuilt->old_vers_heap) {
mem_heap_empty(prebuilt->old_vers_heap);
prebuilt->old_vers_heap = mem_heap_create(200);
err = row_vers_build_for_semi_consistent_read(
rec, mtr, clust_index, offsets, offset_heap,
prebuilt->old_vers_heap, old_vers);
/*********************************************************************//**
Tests the conditions which determine when the index segment we are searching
through has been exhausted.
@return TRUE if row passed the tests */
row_sel_test_end_conds(
/*===================*/
plan_t* plan) /*!< in: plan for the table; the column values must
already have been retrieved and the right sides of
comparisons evaluated */
/* All conditions in end_conds are comparisons of a column to an
cond = UT_LIST_GET_FIRST(plan->end_conds);
/* Evaluate the left side of the comparison, i.e., get the
column value if there is an indirection */
eval_sym(cond->args);
/* Do the comparison */
if (!eval_cmp(cond)) {
cond = UT_LIST_GET_NEXT(cond_list, cond);
/*********************************************************************//**
Tests the other conditions.
@return TRUE if row passed the tests */
row_sel_test_other_conds(
/*=====================*/
plan_t* plan) /*!< in: plan for the table; the column values must
already have been retrieved */
cond = UT_LIST_GET_FIRST(plan->other_conds);
if (!eval_node_get_ibool_val(cond)) {
cond = UT_LIST_GET_NEXT(cond_list, cond);
/*********************************************************************//**
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking.
@return DB_SUCCESS or error code */
row_sel_get_clust_rec(
/*==================*/
sel_node_t* node, /*!< in: select_node */
plan_t* plan, /*!< in: plan node for table */
rec_t* rec, /*!< in: record in a non-clustered index */
que_thr_t* thr, /*!< in: query thread */
rec_t** out_rec,/*!< out: clustered record or an old version of
it, NULL if the old version did not exist
in the read view, i.e., it was a fresh
mtr_t* mtr) /*!< in: mtr used to get access to the
non-clustered record; the same mtr is used to
access the clustered index */
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
rec_offs_init(offsets_);
offsets = rec_get_offsets(rec,
btr_pcur_get_btr_cur(&plan->pcur)->index,
offsets, ULINT_UNDEFINED, &heap);
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
index = dict_table_get_first_index(plan->table);
btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
BTR_SEARCH_LEAF, &plan->clust_pcur,
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
/* Note: only if the search ends up on a non-infimum record is the
low_match value the real match to the search tuple */
if (!page_rec_is_user_rec(clust_rec)
|| btr_pcur_get_low_match(&(plan->clust_pcur))
< dict_index_get_n_unique(index)) {
ut_a(rec_get_deleted_flag(rec,
dict_table_is_comp(plan->table)));
ut_a(node->read_view);
/* In a rare case it is possible that no clust rec is found
for a delete-marked secondary index record: if in row0umod.c
in row_undo_mod_remove_clust_low() we have already removed
the clust rec, while purge is still cleaning and removing
secondary index records associated with earlier versions of
the clustered index record. In that case we know that the
clustered index record did not exist in the read view of
offsets = rec_get_offsets(clust_rec, index, offsets,
ULINT_UNDEFINED, &heap);
if (!node->read_view) {
/* Try to place a lock on the index record */
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using READ COMMITTED isolation level
we lock only the record, i.e., next-key locking is
trx = thr_get_trx(thr);
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
lock_type = LOCK_REC_NOT_GAP;
lock_type = LOCK_ORDINARY;
err = lock_clust_rec_read_check_and_lock(
0, btr_pcur_get_block(&plan->clust_pcur),
clust_rec, index, offsets,
node->row_lock_mode, lock_type, thr);
case DB_SUCCESS_LOCKED_REC:
/* Declare the variable uninitialized in Valgrind.
It should be set to DB_SUCCESS at func_exit. */
UNIV_MEM_INVALID(&err, sizeof err);
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
err = row_sel_build_prev_vers(
node->read_view, index, clust_rec,
&offsets, &heap, &plan->old_vers_heap,
if (err != DB_SUCCESS) {
clust_rec = old_vers;
if (clust_rec == NULL) {
/* If we had to go to an earlier version of the row or the
secondary index record is delete marked, then it may be that
the secondary index record corresponding to clust_rec
(or old_vers) is not rec; in that case we must ignore
such a row because in our snapshot rec would not have existed.
Remember that from rec we cannot see directly which transaction
id corresponds to it: we have to go to the clustered index
record. A query where we want to fetch all rows where
the secondary index value is in some interval would return
a wrong result if we did not drop rows which we come to
visit through secondary index records that would not really
exist in our snapshot. */
|| rec_get_deleted_flag(rec, dict_table_is_comp(
&& !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
/* Fetch the columns needed in test conditions. The clustered
index record is protected by a page latch that was acquired
when plan->clust_pcur was positioned. The latch will not be
released until mtr_commit(mtr). */
ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
row_sel_fetch_columns(index, clust_rec, offsets,
UT_LIST_GET_FIRST(plan->columns));
*out_rec = clust_rec;
if (UNIV_LIKELY_NULL(heap)) {
/*********************************************************************//**
Sets a lock on a record.
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
const buf_block_t* block, /*!< in: buffer block of rec */
const rec_t* rec, /*!< in: record */
dict_index_t* index, /*!< in: index */
const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
ulint mode, /*!< in: lock mode */
ulint type, /*!< in: LOCK_ORDINARY, LOCK_GAP, or
que_thr_t* thr) /*!< in: query thread */
trx = thr_get_trx(thr);
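/* Sanity check: if the transaction already holds a very large number of
record locks and the buffer pool is running out of space, refuse to add
more locks and report the lock table as full. */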
if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
if (buf_LRU_buf_pool_running_out()) {
return(DB_LOCK_TABLE_FULL);
if (dict_index_is_clust(index)) {
err = lock_clust_rec_read_check_and_lock(
0, block, rec, index, offsets, mode, type, thr);
err = lock_sec_rec_read_check_and_lock(
0, block, rec, index, offsets, mode, type, thr);
/*********************************************************************//**
Opens a pcur to a table index. */
plan_t* plan, /*!< in: table plan */
ibool search_latch_locked,
/*!< in: TRUE if the thread currently
has the search latch locked in
mtr_t* mtr) /*!< in: mtr */
dict_index_t* index;
ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
if (search_latch_locked) {
has_search_latch = RW_S_LATCH;
index = plan->index;
/* Calculate the value of the search tuple: the exact match columns
get their expressions evaluated when we evaluate the right sides of
cond = UT_LIST_GET_FIRST(plan->end_conds);
eval_exp(que_node_get_next(cond->args));
cond = UT_LIST_GET_NEXT(cond_list, cond);
n_fields = dtuple_get_n_fields(plan->tuple);
if (plan->n_exact_match < n_fields) {
/* There is a non-exact match field which must be
evaluated separately */
eval_exp(plan->tuple_exps[n_fields - 1]);
for (i = 0; i < n_fields; i++) {
exp = plan->tuple_exps[i];
dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
que_node_get_val(exp));
/* Open pcur to the index */
btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
BTR_SEARCH_LEAF, &plan->pcur,
has_search_latch, mtr);
/* Open the cursor to the start or the end of the index
btr_pcur_open_at_index_side(plan->asc, index, BTR_SEARCH_LEAF,
&(plan->pcur), FALSE, mtr);
ut_ad(plan->n_rows_prefetched == 0);
ut_ad(plan->n_rows_fetched == 0);
ut_ad(plan->cursor_at_end == FALSE);
plan->pcur_is_open = TRUE;
/*********************************************************************//**
Restores a stored pcur position to a table index.
@return TRUE if the cursor should be moved to the next record after we
return from this function (moved to the previous, in the case of a
descending cursor) without processing again the current cursor
row_sel_restore_pcur_pos(
/*=====================*/
plan_t* plan, /*!< in: table plan */
mtr_t* mtr) /*!< in: mtr */
ibool equal_position;
ulint relative_position;
ut_ad(!plan->cursor_at_end);
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
equal_position = btr_pcur_restore_position(BTR_SEARCH_LEAF,
&(plan->pcur), mtr);
/* If the cursor is traveling upwards, and relative_position is
(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
yet on the successor of the page infimum;
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
first record GREATER than the predecessor of a page supremum; we have
not yet processed the cursor record: no need to move the cursor to the
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
last record LESS or EQUAL to the old stored user record; (a) if
equal_position is FALSE, this means that the cursor is now on a record
less than the old user record, and we must move to the next record;
(b) if equal_position is TRUE, then if
plan->stored_cursor_rec_processed is TRUE, we must move to the next
record, else there is no need to move the cursor. */
if (relative_position == BTR_PCUR_ON) {
if (equal_position) {
return(plan->stored_cursor_rec_processed);
ut_ad(relative_position == BTR_PCUR_AFTER
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
/* If the cursor is traveling downwards, and relative_position is
(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
the last record LESS than the successor of a page infimum; we have not
processed the cursor record: no need to move the cursor;
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
first record GREATER than the predecessor of a page supremum; we have
processed the cursor record: we should move the cursor to the previous
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
last record LESS or EQUAL to the old stored user record; (a) if
equal_position is FALSE, this means that the cursor is now on a record
less than the old user record, and we need not move to the previous
record; (b) if equal_position is TRUE, then if
plan->stored_cursor_rec_processed is TRUE, we must move to the previous
record, else there is no need to move the cursor. */
if (relative_position == BTR_PCUR_BEFORE
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
if (relative_position == BTR_PCUR_ON) {
if (equal_position) {
return(plan->stored_cursor_rec_processed);
ut_ad(relative_position == BTR_PCUR_AFTER
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
/*********************************************************************//**
Resets a plan cursor to a closed state. */
plan_t* plan) /*!< in: plan */
plan->pcur_is_open = FALSE;
plan->cursor_at_end = FALSE;
plan->n_rows_fetched = 0;
plan->n_rows_prefetched = 0;
/*********************************************************************//**
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always).
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
row_sel_try_search_shortcut(
/*========================*/
sel_node_t* node, /*!< in: select node for a consistent read */
plan_t* plan, /*!< in: plan for a unique search in clustered
mtr_t* mtr) /*!< in: mtr */
dict_index_t* index;
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
rec_offs_init(offsets_);
index = plan->index;
ut_ad(node->read_view);
ut_ad(plan->unique_search);
ut_ad(!plan->must_get_clust);
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */
row_sel_open_pcur(plan, TRUE, mtr);
rec = btr_pcur_get_rec(&(plan->pcur));
if (!page_rec_is_user_rec(rec)) {
ut_ad(plan->mode == PAGE_CUR_GE);
/* As the cursor is now placed on a user record after a search with
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
fields in the user record matched to the search tuple */
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
return(SEL_EXHAUSTED);
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
if (dict_index_is_clust(index)) {
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
/* Test the deleted flag. */
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
ret = SEL_EXHAUSTED;
/* Fetch the columns needed in test conditions. The index
record is protected by a page latch that was acquired when
plan->pcur was positioned. The latch will not be released
until mtr_commit(mtr). */
row_sel_fetch_columns(index, rec, offsets,
UT_LIST_GET_FIRST(plan->columns));
/* Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
ret = SEL_EXHAUSTED;
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
plan->n_rows_fetched++;
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
/*********************************************************************//**
Performs a select step.
@return DB_SUCCESS or error code */
sel_node_t* node, /*!< in: select node */
que_thr_t* thr) /*!< in: query thread */
dict_index_t* index;
ibool search_latch_locked;
ibool consistent_read;
/* The following flag becomes TRUE when we are doing a
consistent read from a non-clustered index and we must look
at the clustered index to find out the previous delete mark
state of the non-clustered record: */
ibool cons_read_requires_clust_rec = FALSE;
ulint cost_counter = 0;
ibool cursor_just_opened;
ibool must_go_to_next;
ibool mtr_has_extra_clust_latch = FALSE;
/* TRUE if the search was made using
a non-clustered index, and we had to
access the clustered record: now &mtr
contains a clustered index latch, and
&mtr must be committed before we move
to the next non-clustered record */
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets = offsets_;
rec_offs_init(offsets_);
ut_ad(thr->run_node == node);
search_latch_locked = FALSE;
if (node->read_view) {
/* In consistent reads, we try to make do with the hash index and
avoid the buffer page get. This is to reduce memory bus
load resulting from semaphore operations. The search latch
will be s-locked when we access an index with a unique search
condition, but not locked when we access an index with a
less selective search condition. */
consistent_read = TRUE;
consistent_read = FALSE;
This is the outer major loop in calculating a join. We come here when
node->fetch_table changes, and after adding a row to aggregate totals
and, of course, when this function is called. */
ut_ad(mtr_has_extra_clust_latch == FALSE);
plan = sel_node_get_nth_plan(node, node->fetch_table);
index = plan->index;
if (plan->n_rows_prefetched > 0) {
sel_pop_prefetched_row(plan);
goto next_table_no_mtr;
if (plan->cursor_at_end) {
/* The cursor has already reached the result set end: no more
rows to process for this table cursor, as also the prefetch
ut_ad(plan->pcur_is_open);
goto table_exhausted_no_mtr;
/* Open a cursor to index, or restore an open cursor position */
if (consistent_read && plan->unique_search && !plan->pcur_is_open
&& !plan->must_get_clust
&& !plan->table->big_rows) {
if (!search_latch_locked) {
rw_lock_s_lock(&btr_search_latch);
search_latch_locked = TRUE;
} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
/* There is an x-latch request waiting: release the
s-latch for a moment; as an s-latch here is often
kept for some 10 searches before being released,
a waiting x-latch request would block other threads
from acquiring an s-latch for a long time, lowering
performance significantly in multiprocessors. */
rw_lock_s_unlock(&btr_search_latch);
rw_lock_s_lock(&btr_search_latch);
found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
if (found_flag == SEL_FOUND) {
} else if (found_flag == SEL_EXHAUSTED) {
goto table_exhausted;
ut_ad(found_flag == SEL_RETRY);
plan_reset_cursor(plan);
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
search_latch_locked = FALSE;
if (!plan->pcur_is_open) {
/* Evaluate the expressions to build the search tuple and
row_sel_open_pcur(plan, search_latch_locked, &mtr);
cursor_just_opened = TRUE;
/* A new search was made: increment the cost counter */
/* Restore pcur position to the index */
must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
cursor_just_opened = FALSE;
if (must_go_to_next) {
/* We have already processed the cursor record: move
In this loop we use pcur and try to fetch a qualifying row, and
also fill the prefetch buffer for this table if n_rows_fetched has
exceeded a threshold. While we are inside this loop, the following
(1) &mtr is started,
(2) pcur is positioned and open.
NOTE that if cursor_just_opened is TRUE here, it means that we came
to this point right after row_sel_open_pcur. */
ut_ad(mtr_has_extra_clust_latch == FALSE);
rec = btr_pcur_get_rec(&(plan->pcur));
/* PHASE 1: Set a lock if specified */
if (!node->asc && cursor_just_opened
&& !page_rec_is_supremum(rec)) {
/* When we open a cursor for a descending search, we must set
a next-key lock on the successor record: otherwise it would
be possible to insert new records next to the cursor position,
and it might be that these new records should appear in the
search result set, resulting in the phantom problem. */
if (!consistent_read) {
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using READ COMMITTED isolation
level, we lock only the record, i.e., next-key
locking is not used. */
rec_t* next_rec = page_rec_get_next(rec);
trx = thr_get_trx(thr);
offsets = rec_get_offsets(next_rec, index, offsets,
ULINT_UNDEFINED, &heap);
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level
<= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(next_rec)) {
lock_type = LOCK_REC_NOT_GAP;
lock_type = LOCK_ORDINARY;
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
next_rec, index, offsets,
node->row_lock_mode,
case DB_SUCCESS_LOCKED_REC:
/* Note that in this case we will store in pcur
the PREDECESSOR of the record we are waiting
goto lock_wait_or_error;
if (page_rec_is_infimum(rec)) {
/* The infimum record on a page cannot be in the result set,
and neither can a record lock be placed on it: we skip such
a record. We also increment the cost counter as we may have
processed yet another page of index. */
if (!consistent_read) {
/* Try to place a lock on the index record */
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using READ COMMITTED isolation level,
we lock only the record, i.e., next-key locking is
offsets = rec_get_offsets(rec, index, offsets,
ULINT_UNDEFINED, &heap);
trx = thr_get_trx(thr);
if (srv_locks_unsafe_for_binlog
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
if (page_rec_is_supremum(rec)) {
lock_type = LOCK_REC_NOT_GAP;
lock_type = LOCK_ORDINARY;
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
rec, index, offsets,
node->row_lock_mode, lock_type, thr);
case DB_SUCCESS_LOCKED_REC:
goto lock_wait_or_error;
if (page_rec_is_supremum(rec)) {
/* A page supremum record cannot be in the result set: skip
it now when we have placed a possible lock on it */
ut_ad(page_rec_is_user_rec(rec));
if (cost_counter > SEL_COST_LIMIT) {
/* Now that we have placed the necessary locks, we can stop
for a while and store the cursor position; NOTE that if we
would store the cursor position BEFORE placing a record lock,
it might happen that the cursor would jump over some records
that another transaction could meanwhile insert adjacent to
the cursor: this would result in the phantom problem. */
goto stop_for_a_while;
/* PHASE 2: Check a mixed index mix id if needed */
if (plan->unique_search && cursor_just_opened) {
ut_ad(plan->mode == PAGE_CUR_GE);
/* As the cursor is now placed on a user record after a search
with the mode PAGE_CUR_GE, the up_match field in the cursor
tells how many fields in the user record matched to the search
if (btr_pcur_get_up_match(&(plan->pcur))
< plan->n_exact_match) {
goto table_exhausted;
/* Ok, no need to test end_conds or mix id */
/* We are ready to look at a possible new index entry in the result
set: the cursor is now placed on a user record */
/* PHASE 3: Get previous version in a consistent read */
cons_read_requires_clust_rec = FALSE;
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
if (consistent_read) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
if (dict_index_is_clust(index)) {
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
err = row_sel_build_prev_vers(
node->read_view, index, rec,
&offsets, &heap, &plan->old_vers_heap,
if (err != DB_SUCCESS) {
goto lock_wait_or_error;
if (old_vers == NULL) {
/* The record does not exist
in our read view. Skip it, but
first attempt to determine
whether the index segment we
are searching through has been
offsets = rec_get_offsets(
rec, index, offsets,
ULINT_UNDEFINED, &heap);
/* Fetch the columns needed in
test conditions. The clustered
index record is protected by a
page latch that was acquired
by row_sel_open_pcur() or
row_sel_restore_pcur_pos().
The latch will not be released
until mtr_commit(mtr). */
row_sel_fetch_columns(
index, rec, offsets,
if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
} else if (!lock_sec_rec_cons_read_sees(rec,
cons_read_requires_clust_rec = TRUE;
/* PHASE 4: Test search end conditions and deleted flag */
/* Fetch the columns needed in test conditions. The record is
protected by a page latch that was acquired by
row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
will not be released until mtr_commit(mtr). */
row_sel_fetch_columns(index, rec, offsets,
UT_LIST_GET_FIRST(plan->columns));
/* Test the selection end conditions: these can only contain columns
which already are found in the index, even though the index might be
if (plan->unique_search && cursor_just_opened) {
/* No test necessary: the test was already made above */
} else if (!row_sel_test_end_conds(plan)) {
goto table_exhausted;
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
&& !cons_read_requires_clust_rec) {
/* The record is delete marked: we can skip it if this is
not a consistent read which might see an earlier version
of a non-clustered index record */
if (plan->unique_search) {
goto table_exhausted;
/* PHASE 5: Get the clustered index record, if needed and if we did
not do the search using the clustered index */
if (plan->must_get_clust || cons_read_requires_clust_rec) {
/* It was a non-clustered index and we must fetch also the
clustered index record */
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
mtr_has_extra_clust_latch = TRUE;
case DB_SUCCESS_LOCKED_REC:
goto lock_wait_or_error;
/* Retrieving the clustered record required a search:
increment the cost counter */
if (clust_rec == NULL) {
/* The record did not exist in the read view */
ut_ad(consistent_read);
if (rec_get_deleted_flag(clust_rec,
dict_table_is_comp(plan->table))) {
/* The record is delete marked: we can skip it */
if (node->can_get_updated) {
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
/* PHASE 6: Test the rest of search conditions */
if (!row_sel_test_other_conds(plan)) {
if (plan->unique_search) {
goto table_exhausted;
/* PHASE 7: We found a new qualifying row for the current table; push
the row if prefetch is on, or move to the next table in the join */
plan->n_rows_fetched++;
ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
|| plan->unique_search || plan->no_prefetch
|| plan->table->big_rows) {
/* No prefetch in operation: go to the next table */
sel_push_prefetched_row(plan);
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
/* The prefetch buffer is now full */
sel_pop_prefetched_row(plan);
ut_ad(!search_latch_locked);
if (mtr_has_extra_clust_latch) {
/* We must commit &mtr if we are moving to the next
non-clustered index record, because we could break the
latching order if we would access a different clustered
index page right away without releasing the previous. */
goto commit_mtr_for_a_while;
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
goto table_exhausted;
cursor_just_opened = FALSE;
/* END OF RECORD LOOP
------------------ */
/* We found a record which satisfies the conditions: we can move to
the next table or return a row in the result set */
ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
if (plan->unique_search && !node->can_get_updated) {
plan->cursor_at_end = TRUE;
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = TRUE;
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_has_extra_clust_latch = FALSE;
/* If we use 'goto' to this label, it means that the row was popped
from the prefetched rows stack, and &mtr is already committed */
if (node->fetch_table + 1 == node->n_tables) {
sel_eval_select_list(node);
if (node->is_aggregate) {
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
node->fetch_table++;
/* When we move to the next table, we first reset the plan cursor:
we do not care about resetting it when we backtrack from a table */
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
/* The table cursor pcur reached the result set end: backtrack to the
previous table in the join if we do not have cached prefetched rows */
plan->cursor_at_end = TRUE;
mtr_has_extra_clust_latch = FALSE;
if (plan->n_rows_prefetched > 0) {
/* The table became exhausted during a prefetch */
sel_pop_prefetched_row(plan);
goto next_table_no_mtr;
table_exhausted_no_mtr:
if (node->fetch_table == 0) {
if (node->is_aggregate && !node->aggregate_already_fetched) {
node->aggregate_already_fetched = TRUE;
sel_assign_into_var_values(node->into_list, node);
thr->run_node = que_node_get_parent(node);
node->state = SEL_NODE_NO_MORE_ROWS;
thr->run_node = que_node_get_parent(node);
node->fetch_table--;
/* Return control for a while to que_run_threads, so that runaway
queries can be canceled. NOTE that when we come here, we must, in a
locking read, have placed the necessary (possibly waiting request)
record lock on the cursor record or its successor: when we reposition
the cursor, this record lock guarantees that nobody can meanwhile have
inserted new records which should have appeared in the result set,
which would result in the phantom problem. */
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
commit_mtr_for_a_while:
/* Stores the cursor position and commits &mtr; this is used if
&mtr may contain latches which would break the latching order if
&mtr would not be committed and the latches released. */
plan->stored_cursor_rec_processed = TRUE;
ut_ad(!search_latch_locked);
btr_pcur_store_position(&(plan->pcur), &mtr);
mtr_has_extra_clust_latch = FALSE;
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
/* See the note at stop_for_a_while: the same holds for this case */
ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
ut_ad(!search_latch_locked);
plan->stored_cursor_rec_processed = FALSE;
btr_pcur_store_position(&(plan->pcur), &mtr);
#ifdef UNIV_SYNC_DEBUG
ut_ad(sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
if (search_latch_locked) {
rw_lock_s_unlock(&btr_search_latch);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
/**********************************************************************//**
Performs a select step. This is a high-level function used in SQL execution
@return query thread to run next or NULL */
que_thr_t* thr) /*!< in: query thread */
sym_node_t* table_node;
node = thr->run_node;
ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
/* If this is the first time this node is executed (or when execution
resumes after a wait for a table intention lock), set intention locks
on the tables, or assign a read view */
if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
node->state = SEL_NODE_OPEN;
if (node->state == SEL_NODE_OPEN) {
/* It may be that the current session has not yet started
its transaction, or it has been committed: */
trx_start_if_not_started(thr_get_trx(thr));
plan_reset_cursor(sel_node_get_nth_plan(node, 0));
if (node->consistent_read) {
/* Assign a read view for the query */
node->read_view = trx_assign_read_view(
if (node->set_x_locks) {
i_lock_mode = LOCK_IX;
i_lock_mode = LOCK_IS;
table_node = node->table_list;
while (table_node) {
err = lock_table(0, table_node->table,
if (err != DB_SUCCESS) {
thr_get_trx(thr)->error_state = err;
table_node = que_node_get_next(table_node);
/* If this is an explicit cursor, copy stored procedure
variable values, so that the values cannot change between
fetches (currently, we copy them also for non-explicit
if (node->explicit_cursor
&& UT_LIST_GET_FIRST(node->copy_variables)) {
row_sel_copy_input_variable_vals(node);
node->state = SEL_NODE_FETCH;
node->fetch_table = 0;
if (node->is_aggregate) {
/* Reset the aggregate total values */
sel_reset_aggregate_vals(node);
err = row_sel(node, thr);
/* NOTE! if queries are parallelized, the following assignment may
have problems; the assignment should be made only if thr is the
only top-level thr in the graph: */
thr->graph->last_sel_node = node;
if (err != DB_SUCCESS) {
thr_get_trx(thr)->error_state = err;
/**********************************************************************//**
Performs a fetch for a cursor.
@return query thread to run next or NULL */
que_thr_t* thr) /*!< in: query thread */
sel_node_t* sel_node;
node = thr->run_node;
sel_node = node->cursor_def;
ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
if (thr->prev_node != que_node_get_parent(node)) {
if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
if (node->into_list) {
sel_assign_into_var_values(node->into_list,
void* ret = (*node->func->func)(
sel_node, node->func->arg);
= SEL_NODE_NO_MORE_ROWS;
thr->run_node = que_node_get_parent(node);
/* Make the fetch node the parent of the cursor definition for
the time of the fetch, so that execution knows to return to this
fetch node after a row has been selected or we know that there is
sel_node->common.parent = node;
if (sel_node->state == SEL_NODE_CLOSED) {
"InnoDB: Error: fetch called on a closed cursor\n");
thr_get_trx(thr)->error_state = DB_ERROR;
thr->run_node = sel_node;
/****************************************************************//**
Sample callback function for fetch that prints each row.
@return always returns non-NULL */
void* row, /*!< in: sel_node_t* */
void* user_arg) /*!< in: not used */
sel_node_t* node = row;
UT_NOT_USED(user_arg);
fprintf(stderr, "row_fetch_print: row %p\n", row);
exp = node->select_list;
dfield_t* dfield = que_node_get_val(exp);
const dtype_t* type = dfield_get_type(dfield);
fprintf(stderr, " column %lu:\n", (ulong)i);
if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
ut_print_buf(stderr, dfield_get_data(dfield),
dfield_get_len(dfield));
fputs(" <NULL>;\n", stderr);
exp = que_node_get_next(exp);
/***********************************************************//**
Prints a row in a select result.
@return query thread to run next or NULL */
que_thr_t* thr) /*!< in: query thread */
row_printf_node_t* node;
sel_node_t* sel_node;
node = thr->run_node;
sel_node = node->sel_node;
ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
if (thr->prev_node == que_node_get_parent(node)) {
/* Reset the cursor */
sel_node->state = SEL_NODE_OPEN;
/* Fetch next row to print */
thr->run_node = sel_node;
if (sel_node->state != SEL_NODE_FETCH) {
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
/* No more rows to print */
thr->run_node = que_node_get_parent(node);
arg = sel_node->select_list;
dfield_print_also_hex(que_node_get_val(arg));
fputs(" ::: ", stderr);
arg = que_node_get_next(arg);
/* Fetch next row to print */
thr->run_node = sel_node;
/****************************************************************//**
Converts a key value stored in MySQL format to an Innobase dtuple. The last
field of the key value may be just a prefix of a fixed length field: hence
the parameter key_len. But currently we do not allow search keys where the
last field is only a prefix of the full key field length, and we print a
warning if such a key appears. A counterpart of this function is
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
row_sel_convert_mysql_key_to_innobase(
/*==================================*/
dtuple_t* tuple, /*!< in/out: tuple where to build;
NOTE: we assume that the type info
in the tuple is already according
byte* buf, /*!< in: buffer to use in field
ulint buf_len, /*!< in: buffer length */
dict_index_t* index, /*!< in: index of the key value */
const byte* key_ptr, /*!< in: MySQL key value */
ulint key_len, /*!< in: MySQL key value length */
trx_t* trx) /*!< in: transaction */
byte* original_buf = buf;
const byte* original_key_ptr = key_ptr;
dict_field_t* field;
ulint data_field_len;
const byte* key_end;
/* For documentation of the key value storage format in MySQL, see
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
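/* Roughly, each key field in the MySQL key buffer is laid out as
[1 byte SQL NULL marker, present only for nullable columns]
[2 bytes little-endian data length, for BLOB/VARCHAR-style fields]
[the data bytes, padded up to the fixed or prefix length];
the individual cases are handled one by one in the loop below. */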
key_end = key_ptr + key_len;
/* Permit us to access any field in the tuple (ULINT_MAX): */
dtuple_set_n_fields(tuple, ULINT_MAX);
dfield = dtuple_get_nth_field(tuple, 0);
field = dict_index_get_nth_field(index, 0);
if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
/* A special case: we are looking for a position in the
generated clustered index which InnoDB automatically added
to a table with no primary key: the first and the only
ordering column is ROW_ID which InnoDB stored to the key_ptr
ut_a(key_len == DATA_ROW_ID_LEN);
dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
dtuple_set_n_fields(tuple, 1);
while (key_ptr < key_end) {
ulint type = dfield_get_type(dfield)->mtype;
ut_a(field->col->mtype == type);
if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
/* The first byte in the field tells if this is
an SQL NULL value */
if (*key_ptr != 0) {
dfield_set_null(dfield);
/* Calculate data length and data field total length */
if (type == DATA_BLOB) {
/* The key field is a column prefix of a BLOB or
ut_a(field->prefix_len > 0);
/* MySQL stores the actual data length in the first 2
bytes after the optional SQL NULL marker byte. The
storage format is little-endian, that is, the most
significant byte at a higher address. In UTF-8, MySQL
seems to reserve field->prefix_len bytes for
storing this field in the key value buffer, even
though the actual value only takes data_len bytes
data_len = key_ptr[data_offset]
+ 256 * key_ptr[data_offset + 1];
data_field_len = data_offset + 2 + field->prefix_len;
/* Now that we know the length, we store the column
value as if it were a fixed char field */
} else if (field->prefix_len > 0) {
/* Looks like MySQL pads unused end bytes in the
prefix with space. Therefore, also in UTF-8, it is ok
to compare with a prefix containing full prefix_len
bytes, and no need to take at most prefix_len / 3
UTF-8 characters from the start.
If the prefix is used as the upper end of a LIKE
'abc%' query, then MySQL pads the end with chars
0xff. TODO: in that case, does it do any harm to compare
with the full prefix_len bytes? How do characters
0xff in UTF-8 behave? */
data_len = field->prefix_len;
data_field_len = data_offset + data_len;
data_len = dfield_get_type(dfield)->len;
data_field_len = data_offset + data_len;
(dtype_get_mysql_type(dfield_get_type(dfield))
== DATA_MYSQL_TRUE_VARCHAR)
&& UNIV_LIKELY(type != DATA_INT)) {
/* In the MySQL key value format, a true VARCHAR is
always preceded by 2 bytes of a length field.
dfield_get_type(dfield)->len returns the maximum
'payload' len in bytes. That does not include the
2 bytes that tell the actual data length.
We added the check != DATA_INT to make sure we do
not treat MySQL ENUM or SET as a true VARCHAR! */
data_field_len += 2;
/* Storing may use at most data_len bytes of buf */
if (UNIV_LIKELY(!is_null)) {
2419
row_mysql_store_col_in_innobase_format(
2421
FALSE, /* MySQL key value format col */
2422
key_ptr + data_offset, data_len,
2423
dict_table_is_comp(index->table));
2427
key_ptr += data_field_len;
2429
if (UNIV_UNLIKELY(key_ptr > key_end)) {
2430
/* The last field in key was not a complete key field
2433
Print a warning about this! HA_READ_PREFIX_LAST does
2434
not currently work in InnoDB with partial-field key
2435
value prefixes. Since MySQL currently uses a padding
2436
trick to calculate LIKE 'abc%' type queries there
2437
should never be partial-field prefixes in searches. */
2439
ut_print_timestamp(stderr);
2441
fputs(" InnoDB: Warning: using a partial-field"
2442
" key prefix in search.\n"
2443
"InnoDB: ", stderr);
2444
dict_index_name_print(stderr, trx, index);
2445
fprintf(stderr, ". Last data field length %lu bytes,\n"
2446
"InnoDB: key ptr now exceeds"
2447
" key end by %lu bytes.\n"
2448
"InnoDB: Key value in the MySQL format:\n",
2449
(ulong) data_field_len,
2450
(ulong) (key_ptr - key_end));
2452
ut_print_buf(stderr, original_key_ptr, key_len);
2456
ulint len = dfield_get_len(dfield);
2457
dfield_set_len(dfield, len
2458
- (ulint) (key_ptr - key_end));
2467
ut_a(buf <= original_buf + buf_len);
2469
/* We set the length of tuple to n_fields: we assume that the memory
2470
area allocated for it is big enough (usually bigger than n_fields). */
2472
dtuple_set_n_fields(tuple, n_fields);
2475
/**************************************************************//**
2476
Stores the row id to the prebuilt struct. */
2479
row_sel_store_row_id_to_prebuilt(
2480
/*=============================*/
2481
row_prebuilt_t* prebuilt, /*!< in/out: prebuilt */
2482
const rec_t* index_rec, /*!< in: record */
2483
const dict_index_t* index, /*!< in: index of the record */
2484
const ulint* offsets) /*!< in: rec_get_offsets
2485
(index_rec, index) */
2490
ut_ad(rec_offs_validate(index_rec, index, offsets));
2492
data = rec_get_nth_field(
2494
dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2496
if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
2498
"InnoDB: Error: Row id field is"
2499
" wrong length %lu in ", (ulong) len);
2500
dict_index_name_print(stderr, prebuilt->trx, index);
2501
fprintf(stderr, "\n"
2502
"InnoDB: Field number %lu, record:\n",
2503
(ulong) dict_index_get_sys_col_pos(index,
2505
rec_print_new(stderr, index_rec, offsets);
2510
ut_memcpy(prebuilt->row_id, data, len);
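/* Note (added for clarity, not in the original source): DATA_ROW_ID_LEN
is 6 bytes; the row id is the 48-bit value that InnoDB generates for
the hidden clustered index of a table that has no user-defined
primary key. */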
2513
/**************************************************************//**
2514
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2515
function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
2518
row_sel_field_store_in_mysql_format(
2519
/*================================*/
2520
byte* dest, /*!< in/out: buffer where to store; NOTE
2521
that BLOBs are not in themselves
2522
stored here: the caller must allocate
2523
and copy the BLOB into buffer before,
2524
and pass the pointer to the BLOB in
2526
const mysql_row_templ_t* templ,
2527
/*!< in: MySQL column template.
2528
Its following fields are referenced:
2529
type, is_unsigned, mysql_col_len,
2530
mbminlen, mbmaxlen */
2531
const byte* data, /*!< in: data to store */
2532
ulint len) /*!< in: length of the data */
2536
ut_ad(len != UNIV_SQL_NULL);
2537
UNIV_MEM_ASSERT_RW(data, len);
2539
switch (templ->type) {
2540
const byte* field_end;
2543
/* Convert integer data from Innobase to a little-endian
2544
format, sign bit restored to normal */
2557
if (!templ->is_unsigned) {
2558
dest[len - 1] = (byte) (dest[len - 1] ^ 128);
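/* Worked example (added for clarity, not in the original source): the
comment above says integer data is converted to little-endian with the
sign bit restored. A signed 4-byte INT with value -1 is stored by
InnoDB as 7F FF FF FF (big-endian, sign bit flipped so that memcmp()
order equals numeric order). Once the bytes have been reversed into
dest, dest holds FF FF FF 7F, and XOR-ing dest[len - 1] with 128 above
restores FF FF FF FF, the little-endian two's complement value that
MySQL expects. */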
2561
ut_ad(templ->mysql_col_len == len);
2567
field_end = dest + templ->mysql_col_len;
2569
if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2570
/* This is a >= 5.0.3 type true VARCHAR. Store the
2571
length of the data to the first byte or the first
2572
two bytes of dest. */
2574
dest = row_mysql_store_true_var_len(
2575
dest, len, templ->mysql_length_bytes);
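/* Illustrative sketch (added for clarity, not in the original source):
for a VARCHAR column whose maximum byte length exceeds 255, MySQL uses
templ->mysql_length_bytes == 2, so storing a value of len == 5 writes
the bytes 0x05 0x00 at the start of dest, and the pointer assigned
back to dest points just past those length bytes, which is where the
ut_memcpy() below copies the actual data. A short VARCHAR uses a
single length byte instead. */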
2578
/* Copy the actual data */
2579
ut_memcpy(dest, data, len);
2581
/* Pad with trailing spaces. We pad with spaces also the
2582
unused end of a >= 5.0.3 true VARCHAR column, just in case
2583
MySQL expects its contents to be deterministic. */
2587
ut_ad(templ->mbminlen <= templ->mbmaxlen);
2589
/* We treat some Unicode charset strings specially. */
2590
switch (templ->mbminlen) {
2592
/* InnoDB should never have stripped partial
2593
UTF-32 characters. */
2597
/* A space char is two bytes,
2598
0x0020 in UCS2 and UTF-16 */
2600
if (UNIV_UNLIKELY(len & 1)) {
2601
/* A 0x20 has been stripped from the column.
2604
if (pad < field_end) {
2610
row_mysql_pad_col(templ->mbminlen, pad, field_end - pad);
2614
/* Store a pointer to the BLOB buffer to dest: the BLOB was
2615
already copied to the buffer in row_sel_store_mysql_rec */
2617
row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2622
memcpy(dest, data, len);
2624
ut_ad(templ->mysql_col_len >= len);
2625
ut_ad(templ->mbmaxlen >= templ->mbminlen);
2627
ut_ad(templ->mbmaxlen > templ->mbminlen
2628
|| templ->mysql_col_len == len);
2629
/* The following assertion would fail for old tables
2630
containing UTF-8 ENUM columns due to Bug #9526. */
2631
ut_ad(!templ->mbmaxlen
2632
|| !(templ->mysql_col_len % templ->mbmaxlen));
2633
ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
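/* Worked example (added for clarity, not in the original source): a
CHAR(10) column in a utf8 table has mbminlen == 1, mbmaxlen == 3 and
mysql_col_len == 30; a value of 10 ASCII characters is stored with
len == 10, and the assertions hold: 30 % 3 == 0 and 10 * 3 >= 30. */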
2635
if (templ->mbminlen == 1 && templ->mbmaxlen != 1) {
2636
/* Pad with spaces. This undoes the stripping
2637
done in row0mysql.c, function
2638
row_mysql_store_col_in_innobase_format(). */
2640
memset(dest + len, 0x20, templ->mysql_col_len - len);
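/* Example (added for clarity, not in the original source): continuing
the utf8 CHAR(10) case above, if the stored value is "abc" (len == 3),
the remaining 30 - 3 == 27 bytes of the MySQL buffer are filled with
0x20 here, undoing the trailing-space stripping that was done when the
row was stored. */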
2646
case DATA_SYS_CHILD:
2648
/* These column types should never be shipped to MySQL. */
2652
case DATA_FIXBINARY:
2656
/* Above are the valid column types for MySQL data. */
2657
#endif /* UNIV_DEBUG */
2658
ut_ad(templ->mysql_col_len == len);
2659
memcpy(dest, data, len);
2663
/**************************************************************//**
2664
Convert a row in the Innobase format to a row in the MySQL format.
2665
Note that the template in prebuilt may advise us to copy only a few
2666
columns to mysql_rec, other columns are left blank. All columns may not
2667
be needed in the query.
2668
@return TRUE on success, FALSE if not all columns could be retrieved */
2671
__attribute__((warn_unused_result))
2674
row_sel_store_mysql_rec(
2675
/*====================*/
2676
byte* mysql_rec, /*!< out: row in the MySQL format */
2677
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2678
const rec_t* rec, /*!< in: Innobase record in the index
2679
which was described in prebuilt's
2680
template; must be protected by
2682
const ulint* offsets) /*!< in: array returned by
2683
rec_get_offsets() */
2685
mysql_row_templ_t* templ;
2686
mem_heap_t* extern_field_heap = NULL;
2692
ut_ad(prebuilt->mysql_template);
2693
ut_ad(prebuilt->default_rec);
2694
ut_ad(rec_offs_validate(rec, NULL, offsets));
2696
if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2697
mem_heap_free(prebuilt->blob_heap);
2698
prebuilt->blob_heap = NULL;
2701
for (i = 0; i < prebuilt->n_template ; i++) {
2703
templ = prebuilt->mysql_template + i;
2705
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
2706
templ->rec_field_no))) {
2708
/* Copy an externally stored field to the temporary
2711
ut_a(!prebuilt->trx->has_search_latch);
2713
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2714
if (prebuilt->blob_heap == NULL) {
2715
prebuilt->blob_heap = mem_heap_create(
2719
heap = prebuilt->blob_heap;
2722
= mem_heap_create(UNIV_PAGE_SIZE);
2724
heap = extern_field_heap;
2727
/* NOTE: if we are retrieving a big BLOB, we may
2728
already run out of memory in the next call, which
2731
data = btr_rec_copy_externally_stored_field(
2733
dict_table_zip_size(prebuilt->table),
2734
templ->rec_field_no, &len, heap);
2736
if (UNIV_UNLIKELY(!data)) {
2737
/* The externally stored field
2738
was not written yet. This
2739
record should only be seen by
2740
recv_recovery_rollback_active()
2741
or any TRX_ISO_READ_UNCOMMITTED
2744
if (extern_field_heap) {
2745
mem_heap_free(extern_field_heap);
2751
if (UNIV_UNLIKELY(!data)) {
2752
/* The externally stored field
2753
was not written yet. This
2754
record should only be seen by
2755
recv_recovery_rollback_active()
2756
or any TRX_ISO_READ_UNCOMMITTED
2759
if (extern_field_heap) {
2760
mem_heap_free(extern_field_heap);
2766
ut_a(len != UNIV_SQL_NULL);
2768
/* Field is stored in the row. */
2770
data = rec_get_nth_field(rec, offsets,
2771
templ->rec_field_no, &len);
2773
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
2774
&& len != UNIV_SQL_NULL) {
2776
/* It is a BLOB field locally stored in the
2777
InnoDB record: we MUST copy its contents to
2778
prebuilt->blob_heap here because later code
2779
assumes all BLOB values have been copied to a
2782
if (prebuilt->blob_heap == NULL) {
2783
prebuilt->blob_heap = mem_heap_create(
2787
data = memcpy(mem_heap_alloc(
2788
prebuilt->blob_heap, len),
2793
if (len != UNIV_SQL_NULL) {
2794
row_sel_field_store_in_mysql_format(
2795
mysql_rec + templ->mysql_col_offset,
2799
if (extern_field_heap) {
2800
mem_heap_free(extern_field_heap);
2801
extern_field_heap = NULL;
2804
if (templ->mysql_null_bit_mask) {
2805
/* It is a nullable column with a non-NULL
2807
mysql_rec[templ->mysql_null_byte_offset]
2808
&= ~(byte) templ->mysql_null_bit_mask;
2811
/* MySQL assumes that the field for an SQL
2812
NULL value is set to the default value. */
2814
UNIV_MEM_ASSERT_RW(prebuilt->default_rec
2815
+ templ->mysql_col_offset,
2816
templ->mysql_col_len);
2817
mysql_rec[templ->mysql_null_byte_offset]
2818
|= (byte) templ->mysql_null_bit_mask;
2819
memcpy(mysql_rec + templ->mysql_col_offset,
2820
(const byte*) prebuilt->default_rec
2821
+ templ->mysql_col_offset,
2822
templ->mysql_col_len);
2829
/*********************************************************************//**
2830
Builds a previous version of a clustered index record for a consistent read
2831
@return DB_SUCCESS or error code */
2834
row_sel_build_prev_vers_for_mysql(
2835
/*==============================*/
2836
read_view_t* read_view, /*!< in: read view */
2837
dict_index_t* clust_index, /*!< in: clustered index */
2838
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
2839
const rec_t* rec, /*!< in: record in a clustered index */
2840
ulint** offsets, /*!< in/out: offsets returned by
2841
rec_get_offsets(rec, clust_index) */
2842
mem_heap_t** offset_heap, /*!< in/out: memory heap from which
2843
the offsets are allocated */
2844
rec_t** old_vers, /*!< out: old version, or NULL if the
2845
record does not exist in the view:
2846
i.e., it was freshly inserted
2848
mtr_t* mtr) /*!< in: mtr */
2852
if (prebuilt->old_vers_heap) {
2853
mem_heap_empty(prebuilt->old_vers_heap);
2855
prebuilt->old_vers_heap = mem_heap_create(200);
2858
err = row_vers_build_for_consistent_read(
2859
rec, mtr, clust_index, offsets, read_view, offset_heap,
2860
prebuilt->old_vers_heap, old_vers);
2864
/*********************************************************************//**
2865
Retrieves the clustered index record corresponding to a record in a
2866
non-clustered index. Does the necessary locking. Used in the MySQL
2868
@return DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
2871
row_sel_get_clust_rec_for_mysql(
2872
/*============================*/
2873
row_prebuilt_t* prebuilt,/*!< in: prebuilt struct in the handle */
2874
dict_index_t* sec_index,/*!< in: secondary index where rec resides */
2875
const rec_t* rec, /*!< in: record in a non-clustered index; if
2876
this is a locking read, then rec is not
2877
allowed to be delete-marked, and that would
2878
not make sense either */
2879
que_thr_t* thr, /*!< in: query thread */
2880
const rec_t** out_rec,/*!< out: clustered record or an old version of
2881
it, NULL if the old version did not exist
2882
in the read view, i.e., it was a fresh
2884
ulint** offsets,/*!< in: offsets returned by
2885
rec_get_offsets(rec, sec_index);
2886
out: offsets returned by
2887
rec_get_offsets(out_rec, clust_index) */
2888
mem_heap_t** offset_heap,/*!< in/out: memory heap from which
2889
the offsets are allocated */
2890
mtr_t* mtr) /*!< in: mtr used to get access to the
2891
non-clustered record; the same mtr is used to
2892
access the clustered index */
2894
dict_index_t* clust_index;
2895
const rec_t* clust_rec;
2901
trx = thr_get_trx(thr);
2903
row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
2904
sec_index, *offsets, trx);
2906
clust_index = dict_table_get_first_index(sec_index->table);
2908
btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
2909
PAGE_CUR_LE, BTR_SEARCH_LEAF,
2910
prebuilt->clust_pcur, 0, mtr);
2912
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
2914
prebuilt->clust_pcur->trx_if_known = trx;
2916
/* Note: only if the search ends up on a non-infimum record is the
2917
low_match value the real match to the search tuple */
2919
if (!page_rec_is_user_rec(clust_rec)
2920
|| btr_pcur_get_low_match(prebuilt->clust_pcur)
2921
< dict_index_get_n_unique(clust_index)) {
2923
/* In a rare case it is possible that no clust rec is found
2924
for a delete-marked secondary index record: if in row0umod.c
2925
in row_undo_mod_remove_clust_low() we have already removed
2926
the clust rec, while purge is still cleaning and removing
2927
secondary index records associated with earlier versions of
2928
the clustered index record. In that case we know that the
2929
clustered index record did not exist in the read view of
2932
if (!rec_get_deleted_flag(rec,
2933
dict_table_is_comp(sec_index->table))
2934
|| prebuilt->select_lock_type != LOCK_NONE) {
2935
ut_print_timestamp(stderr);
2936
fputs(" InnoDB: error clustered record"
2937
" for sec rec not found\n"
2938
"InnoDB: ", stderr);
2939
dict_index_name_print(stderr, trx, sec_index);
2941
"InnoDB: sec index record ", stderr);
2942
rec_print(stderr, rec, sec_index);
2944
"InnoDB: clust index record ", stderr);
2945
rec_print(stderr, clust_rec, clust_index);
2947
trx_print(stderr, trx, 600);
2950
"InnoDB: Submit a detailed bug report"
2951
" to http://bugs.mysql.com\n", stderr);
2960
*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
2961
ULINT_UNDEFINED, offset_heap);
2963
if (prebuilt->select_lock_type != LOCK_NONE) {
2964
/* Try to place a lock on the index record; we are searching
2965
the clust rec with a unique condition, hence
2966
we set a LOCK_REC_NOT_GAP type lock */
2968
err = lock_clust_rec_read_check_and_lock(
2969
0, btr_pcur_get_block(prebuilt->clust_pcur),
2970
clust_rec, clust_index, *offsets,
2971
prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
2974
case DB_SUCCESS_LOCKED_REC:
2980
/* This is a non-locking consistent read: if necessary, fetch
2981
a previous version of the record */
2985
/* If the isolation level allows reading of uncommitted data,
2986
then we never look for an earlier version */
2988
if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
2989
&& !lock_clust_rec_cons_read_sees(
2990
clust_rec, clust_index, *offsets,
2993
/* The following call returns 'offsets' associated with
2995
err = row_sel_build_prev_vers_for_mysql(
2996
trx->read_view, clust_index, prebuilt,
2997
clust_rec, offsets, offset_heap, &old_vers,
3000
if (err != DB_SUCCESS || old_vers == NULL) {
3005
clust_rec = old_vers;
3008
/* If we had to go to an earlier version of row or the
3009
secondary index record is delete marked, then it may be that
3010
the secondary index record corresponding to clust_rec
3011
(or old_vers) is not rec; in that case we must ignore
3012
such row because in our snapshot rec would not have existed.
3013
Remember that from rec we cannot see directly which transaction
3014
id corresponds to it: we have to go to the clustered index
3015
record. A query where we want to fetch all rows where
3016
the secondary index value is in some interval would return
3017
a wrong result if we would not drop rows which we come to
3018
visit through secondary index records that would not really
3019
exist in our snapshot. */
3023
|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
3024
|| rec_get_deleted_flag(rec, dict_table_is_comp(
3026
&& !row_sel_sec_rec_is_for_clust_rec(
3027
rec, sec_index, clust_rec, clust_index)) {
3029
#ifdef UNIV_SEARCH_DEBUG
3031
ut_a(clust_rec == NULL
3032
|| row_sel_sec_rec_is_for_clust_rec(
3033
rec, sec_index, clust_rec, clust_index));
3041
*out_rec = clust_rec;
3043
if (prebuilt->select_lock_type != LOCK_NONE) {
3044
/* We may use the cursor in update or in unlock_row():
3045
store its position */
3047
btr_pcur_store_position(prebuilt->clust_pcur, mtr);
3054
/********************************************************************//**
3055
Restores cursor position after it has been stored. We have to take into
3056
account that the record the cursor was positioned on may have been deleted.
3057
Then we may have to move the cursor one step up or down.
3058
@return TRUE if we may need to process the record the cursor is now
3059
positioned on (i.e. we should not go to the next record yet) */
3062
sel_restore_position_for_mysql(
3063
/*===========================*/
3064
ibool* same_user_rec, /*!< out: TRUE if we were able to restore
3065
the cursor on a user record with the
3066
same ordering prefix in the
3068
ulint latch_mode, /*!< in: latch mode wished in
3070
btr_pcur_t* pcur, /*!< in: cursor whose position
3072
ibool moves_up, /*!< in: TRUE if the cursor moves up
3074
mtr_t* mtr) /*!< in: mtr; CAUTION: may commit
3078
ulint relative_position;
3080
relative_position = pcur->rel_pos;
3082
success = btr_pcur_restore_position(latch_mode, pcur, mtr);
3084
*same_user_rec = success;
3086
if (relative_position == BTR_PCUR_ON) {
3092
btr_pcur_move_to_next(pcur, mtr);
3098
if (relative_position == BTR_PCUR_AFTER
3099
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3105
if (btr_pcur_is_on_user_rec(pcur)) {
3106
btr_pcur_move_to_prev(pcur, mtr);
3112
ut_ad(relative_position == BTR_PCUR_BEFORE
3113
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3115
if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
3116
btr_pcur_move_to_next(pcur, mtr);
3122
/********************************************************************//**
3123
Pops a cached row for MySQL from the fetch cache. */
3126
row_sel_pop_cached_row_for_mysql(
3127
/*=============================*/
3128
byte* buf, /*!< in/out: buffer where to copy the
3130
row_prebuilt_t* prebuilt) /*!< in: prebuilt struct */
3133
mysql_row_templ_t* templ;
3135
ut_ad(prebuilt->n_fetch_cached > 0);
3136
ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3138
if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3139
/* Copy cache record field by field, don't touch fields that
3140
are not covered by current key */
3141
cached_rec = prebuilt->fetch_cache[
3142
prebuilt->fetch_cache_first];
3144
for (i = 0; i < prebuilt->n_template; i++) {
3145
templ = prebuilt->mysql_template + i;
3146
#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
3147
UNIV_MEM_ASSERT_RW(cached_rec
3148
+ templ->mysql_col_offset,
3149
templ->mysql_col_len);
3151
ut_memcpy(buf + templ->mysql_col_offset,
3152
cached_rec + templ->mysql_col_offset,
3153
templ->mysql_col_len);
3154
/* Copy NULL bit of the current field from cached_rec
3156
if (templ->mysql_null_bit_mask) {
3157
buf[templ->mysql_null_byte_offset]
3158
^= (buf[templ->mysql_null_byte_offset]
3159
^ cached_rec[templ->mysql_null_byte_offset])
3160
& (byte)templ->mysql_null_bit_mask;
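/* The statement above is the bit-merge idiom b ^= (b ^ c) & m: it
copies exactly the bits selected by mask m from c into b and leaves
the other bits of b unchanged. Example (added for clarity): with
b = 0x0A, c = 0x06 and m = 0x04 the result is b = 0x0E, i.e. only the
NULL bit of this column is taken from cached_rec. */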
3165
#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
3166
UNIV_MEM_ASSERT_RW(prebuilt->fetch_cache
3167
[prebuilt->fetch_cache_first],
3168
prebuilt->mysql_prefix_len);
3171
prebuilt->fetch_cache[prebuilt->fetch_cache_first],
3172
prebuilt->mysql_prefix_len);
3174
prebuilt->n_fetch_cached--;
3175
prebuilt->fetch_cache_first++;
3177
if (prebuilt->n_fetch_cached == 0) {
3178
prebuilt->fetch_cache_first = 0;
3182
/********************************************************************//**
3183
Pushes a row for MySQL to the fetch cache.
3184
@return TRUE on success, FALSE if the record contains incomplete BLOBs */
3187
__attribute__((warn_unused_result))
3190
row_sel_push_cache_row_for_mysql(
3191
/*=============================*/
3192
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct */
3193
const rec_t* rec, /*!< in: record to push; must
3194
be protected by a page latch */
3195
const ulint* offsets) /*!<in: rec_get_offsets() */
3200
ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3201
ut_ad(rec_offs_validate(rec, NULL, offsets));
3202
ut_a(!prebuilt->templ_contains_blob);
3204
if (prebuilt->fetch_cache[0] == NULL) {
/* Allocate memory for the fetch cache */

for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {

/* A user has reported memory corruption in these
buffers in Linux. Put magic numbers there to help
to track a possible bug. */

buf = mem_alloc(prebuilt->mysql_row_len + 8);

prebuilt->fetch_cache[i] = buf + 4;

mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
ROW_PREBUILT_FETCH_MAGIC_N);
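/* Resulting buffer layout (sketch added for clarity, not in the
original source):

buf[0..3]                                 ROW_PREBUILT_FETCH_MAGIC_N
buf[4..4 + mysql_row_len - 1]             the cached row in MySQL format
buf[4 + mysql_row_len..7 + mysql_row_len] ROW_PREBUILT_FETCH_MAGIC_N

prebuilt->fetch_cache[i] points at buf + 4, so an overrun in either
direction clobbers one of the magic numbers and can be detected
later. */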
ut_ad(prebuilt->fetch_cache_first == 0);
3224
UNIV_MEM_INVALID(prebuilt->fetch_cache[prebuilt->n_fetch_cached],
3225
prebuilt->mysql_row_len);
3227
if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
3228
prebuilt->fetch_cache[
3229
prebuilt->n_fetch_cached],
3230
prebuilt, rec, offsets))) {
3234
prebuilt->n_fetch_cached++;
3238
/*********************************************************************//**
3239
Tries to do a shortcut to fetch a clustered index record with a unique key,
3240
using the hash index if possible (not always). We assume that the search
3241
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
3242
btr search latch has been locked in S-mode.
3243
@return SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3246
row_sel_try_search_shortcut_for_mysql(
3247
/*==================================*/
3248
const rec_t** out_rec,/*!< out: record if found */
3249
row_prebuilt_t* prebuilt,/*!< in: prebuilt struct */
3250
ulint** offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
3251
mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
3252
mtr_t* mtr) /*!< in: started mtr */
3254
dict_index_t* index = prebuilt->index;
3255
const dtuple_t* search_tuple = prebuilt->search_tuple;
3256
btr_pcur_t* pcur = prebuilt->pcur;
3257
trx_t* trx = prebuilt->trx;
3260
ut_ad(dict_index_is_clust(index));
3261
ut_ad(!prebuilt->templ_contains_blob);
3263
#ifndef UNIV_SEARCH_DEBUG
3264
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3265
BTR_SEARCH_LEAF, pcur,
3268
#else /* UNIV_SEARCH_DEBUG */
3269
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3270
BTR_SEARCH_LEAF, pcur,
3273
#endif /* UNIV_SEARCH_DEBUG */
3274
rec = btr_pcur_get_rec(pcur);
3276
if (!page_rec_is_user_rec(rec)) {
3281
/* As the cursor is now placed on a user record after a search with
3282
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3283
fields in the user record matched to the search tuple */
3285
if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3287
return(SEL_EXHAUSTED);
3290
/* This is a non-locking consistent read: if necessary, fetch
3291
a previous version of the record */
3293
*offsets = rec_get_offsets(rec, index, *offsets,
3294
ULINT_UNDEFINED, heap);
3296
if (!lock_clust_rec_cons_read_sees(rec, index,
3297
*offsets, trx->read_view)) {
3302
if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3304
return(SEL_EXHAUSTED);
3312
/********************************************************************//**
3313
Searches for rows in the database. This is used in the interface to
3314
MySQL. This function opens a cursor, and also implements fetch next
3315
and fetch prev. NOTE that if we do a search with a full key value
3316
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3317
position, and fetch next or fetch prev must not be tried on the cursor!
3318
@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
3319
DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
3322
row_search_for_mysql(
3323
/*=================*/
3324
byte* buf, /*!< in/out: buffer for the fetched
3325
row in the MySQL format */
3326
ulint mode, /*!< in: search mode PAGE_CUR_L, ... */
3327
row_prebuilt_t* prebuilt, /*!< in: prebuilt struct for the
3328
table handle; this contains the info
3329
of search_tuple, index; if search
3330
tuple contains 0 fields then we
3331
position the cursor at the start or
3332
the end of the index, depending on
3334
ulint match_mode, /*!< in: 0 or ROW_SEL_EXACT or
3335
ROW_SEL_EXACT_PREFIX */
3336
ulint direction) /*!< in: 0 or ROW_SEL_NEXT or
3337
ROW_SEL_PREV; NOTE: if this is != 0,
3338
then prebuilt must have a pcur
3339
with stored position! In opening of a
3340
cursor 'direction' should be 0. */
3342
dict_index_t* index = prebuilt->index;
3343
ibool comp = dict_table_is_comp(index->table);
3344
const dtuple_t* search_tuple = prebuilt->search_tuple;
3345
btr_pcur_t* pcur = prebuilt->pcur;
3346
trx_t* trx = prebuilt->trx;
3347
dict_index_t* clust_index;
3350
const rec_t* result_rec;
3351
const rec_t* clust_rec;
3352
ulint err = DB_SUCCESS;
3353
ibool unique_search = FALSE;
3354
ibool unique_search_from_clust_index = FALSE;
3355
ibool mtr_has_extra_clust_latch = FALSE;
3356
ibool moves_up = FALSE;
3357
ibool set_also_gap_locks = TRUE;
3358
/* if the query is a plain locking SELECT, and the isolation level
3359
is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3360
ibool did_semi_consistent_read = FALSE;
3361
/* if the returned record was locked and we did a semi-consistent
3362
read (fetch the newest committed version), then this is set to
3364
#ifdef UNIV_SEARCH_DEBUG
3366
#endif /* UNIV_SEARCH_DEBUG */
3368
ibool same_user_rec;
3370
mem_heap_t* heap = NULL;
3371
ulint offsets_[REC_OFFS_NORMAL_SIZE];
3372
ulint* offsets = offsets_;
3373
ibool table_lock_waited = FALSE;
3375
rec_offs_init(offsets_);
3377
ut_ad(index && pcur && search_tuple);
3378
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3380
if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3381
ut_print_timestamp(stderr);
3382
fprintf(stderr, " InnoDB: Error:\n"
3383
"InnoDB: MySQL is trying to use a table handle"
3384
" but the .ibd file for\n"
3385
"InnoDB: table %s does not exist.\n"
3386
"InnoDB: Have you deleted the .ibd file"
3387
" from the database directory under\n"
3388
"InnoDB: the MySQL datadir, or have you used"
3389
" DISCARD TABLESPACE?\n"
3390
"InnoDB: Look from\n"
3391
"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
3392
"InnoDB: how you can resolve the problem.\n",
3393
prebuilt->table->name);
3398
if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
3400
return(DB_MISSING_HISTORY);
3403
if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3405
"InnoDB: Error: trying to free a corrupt\n"
3406
"InnoDB: table handle. Magic n %lu, table name ",
3407
(ulong) prebuilt->magic_n);
3408
ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3411
mem_analyze_corruption(prebuilt);
3417
fprintf(stderr, "Match mode %lu\n search tuple ",
3418
(ulong) match_mode);
3419
dtuple_print(search_tuple);
3420
fprintf(stderr, "N tables locked %lu\n",
3421
(ulong) trx->mysql_n_tables_locked);
3423
/*-------------------------------------------------------------*/
3424
/* PHASE 0: Release a possible s-latch we are holding on the
3425
adaptive hash index latch if there is someone waiting behind */
3427
if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
3428
&& trx->has_search_latch) {
3430
/* There is an x-latch request on the adaptive hash index:
3431
release the s-latch to reduce starvation and wait for
3432
BTR_SEA_TIMEOUT rounds before trying to keep it again over
3435
rw_lock_s_unlock(&btr_search_latch);
3436
trx->has_search_latch = FALSE;
3438
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3441
/* Reset the new record lock info if srv_locks_unsafe_for_binlog
3442
is set or the session is using a READ COMMITTED isolation level. Then
3443
we are able to remove the record locks set here on an individual
3445
prebuilt->new_rec_locks = 0;
3447
/*-------------------------------------------------------------*/
3448
/* PHASE 1: Try to pop the row from the prefetch cache */
3450
if (UNIV_UNLIKELY(direction == 0)) {
3451
trx->op_info = "starting index read";
3453
prebuilt->n_rows_fetched = 0;
3454
prebuilt->n_fetch_cached = 0;
3455
prebuilt->fetch_cache_first = 0;
3457
if (prebuilt->sel_graph == NULL) {
3458
/* Build a dummy select query graph */
3459
row_prebuild_sel_graph(prebuilt);
3462
trx->op_info = "fetching rows";
3464
if (prebuilt->n_rows_fetched == 0) {
3465
prebuilt->fetch_direction = direction;
3468
if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3469
if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3471
/* TODO: scrollable cursor: restore cursor to
3472
the place of the latest returned row,
3473
or better: prevent caching for a scroll
3477
prebuilt->n_rows_fetched = 0;
3478
prebuilt->n_fetch_cached = 0;
3479
prebuilt->fetch_cache_first = 0;
3481
} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3482
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3484
prebuilt->n_rows_fetched++;
3491
if (prebuilt->fetch_cache_first > 0
3492
&& prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3494
/* The previous returned row was popped from the fetch
3495
cache, but the cache was not full at the time of the
3496
popping: no more rows can exist in the result set */
3498
err = DB_RECORD_NOT_FOUND;
3502
prebuilt->n_rows_fetched++;
3504
if (prebuilt->n_rows_fetched > 1000000000) {
3505
/* Prevent wrap-over */
3506
prebuilt->n_rows_fetched = 500000000;
3509
mode = pcur->search_mode;
3512
/* In a search where at most one record in the index may match, we
3513
can use a LOCK_REC_NOT_GAP type record lock when locking a
3514
non-delete-marked matching record.
3516
Note that in a unique secondary index there may be different
3517
delete-marked versions of a record where only the primary key
3518
values differ: thus in a secondary index we must use next-key
3519
locks when locking delete-marked records. */
3521
if (match_mode == ROW_SEL_EXACT
3522
&& dict_index_is_unique(index)
3523
&& dtuple_get_n_fields(search_tuple)
3524
== dict_index_get_n_unique(index)
3525
&& (dict_index_is_clust(index)
3526
|| !dtuple_contains_null(search_tuple))) {
3528
/* Note above that a UNIQUE secondary index can contain many
3529
rows with the same key value if one of the columns is the SQL
3530
null. A clustered index under MySQL can never contain null
3531
columns because we demand that all the columns in primary key
3534
unique_search = TRUE;
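/* Example (added for clarity, not in the original source): a query
like SELECT ... WHERE pk_col = 5 on the clustered index, or an
equality condition on all columns of a unique secondary index with no
NULLs in the search tuple, satisfies the test above, so a
LOCK_REC_NOT_GAP lock on the matching non-delete-marked record is
sufficient and no gap needs to be locked. */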
3536
/* Even if the condition is unique, MySQL seems to try to
3537
retrieve also a second row if a primary key contains more than
3540
if (UNIV_UNLIKELY(direction != 0)) {
3542
err = DB_RECORD_NOT_FOUND;
3549
/*-------------------------------------------------------------*/
3550
/* PHASE 2: Try fast adaptive hash index search if possible */
3552
/* Next test if this is the special case where we can use the fast
3553
adaptive hash index to try the search. Since we must release the
3554
search system latch when we retrieve an externally stored field, we
3555
cannot use the adaptive hash index in a search in the case the row
3556
may be long and there may be externally stored fields */
3558
if (UNIV_UNLIKELY(direction == 0)
3560
&& dict_index_is_clust(index)
3561
&& !prebuilt->templ_contains_blob
3562
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3566
unique_search_from_clust_index = TRUE;
3568
if (trx->mysql_n_tables_locked == 0
3569
&& prebuilt->select_lock_type == LOCK_NONE
3570
&& trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3571
&& trx->read_view) {
3573
/* This is a SELECT query done as a consistent read,
3574
and the read view has already been allocated:
3575
let us try a search shortcut through the hash
3577
NOTE that we must also test that
3578
mysql_n_tables_locked == 0, because this might
3579
also be INSERT INTO ... SELECT ... or
3580
CREATE TABLE ... SELECT ... . Our algorithm is
3581
NOT prepared for inserts interleaved with the SELECT,
3582
and if we try that, we can deadlock on the adaptive
3583
hash index semaphore! */
3585
#ifndef UNIV_SEARCH_DEBUG
3586
if (!trx->has_search_latch) {
3587
rw_lock_s_lock(&btr_search_latch);
3588
trx->has_search_latch = TRUE;
3591
switch (row_sel_try_search_shortcut_for_mysql(
3592
&rec, prebuilt, &offsets, &heap,
3595
#ifdef UNIV_SEARCH_DEBUG
3596
ut_a(0 == cmp_dtuple_rec(search_tuple,
3599
/* At this point, rec is protected by
3600
a page latch that was acquired by
3601
row_sel_try_search_shortcut_for_mysql().
3602
The latch will not be released until
3603
mtr_commit(&mtr). */
3604
ut_ad(!rec_get_deleted_flag(rec, comp));
3606
if (!row_sel_store_mysql_rec(buf, prebuilt,
3608
/* Only fresh inserts may contain
3609
incomplete externally stored
3610
columns. Pretend that such
3611
records do not exist. Such
3612
records may only be accessed
3613
at the READ UNCOMMITTED
3614
isolation level or when
3615
rolling back a recovered
3616
transaction. Rollback happens
3617
at a lower level, not here. */
3618
ut_a(trx->isolation_level
3619
== TRX_ISO_READ_UNCOMMITTED);
3621
/* Proceed as in case SEL_RETRY. */
3627
/* ut_print_name(stderr, index->name);
3628
fputs(" shortcut\n", stderr); */
3633
goto release_search_latch_if_needed;
3638
/* ut_print_name(stderr, index->name);
3639
fputs(" record not found 2\n", stderr); */
3641
err = DB_RECORD_NOT_FOUND;
3642
release_search_latch_if_needed:
3643
if (trx->search_latch_timeout > 0
3644
&& trx->has_search_latch) {
3646
trx->search_latch_timeout--;
3648
rw_lock_s_unlock(&btr_search_latch);
3649
trx->has_search_latch = FALSE;
3652
/* NOTE that we do NOT store the cursor
3668
/*-------------------------------------------------------------*/
3669
/* PHASE 3: Open or restore index cursor position */
3671
if (trx->has_search_latch) {
3672
rw_lock_s_unlock(&btr_search_latch);
3673
trx->has_search_latch = FALSE;
3676
ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
3677
ut_ad(trx->conc_state == TRX_NOT_STARTED
3678
|| trx->conc_state == TRX_ACTIVE);
3679
ut_ad(prebuilt->sql_stat_start
3680
|| prebuilt->select_lock_type != LOCK_NONE
3683
ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
3684
ut_ad(trx->conc_state == TRX_NOT_STARTED
3685
|| trx->conc_state == TRX_ACTIVE);
3686
ut_ad(prebuilt->sql_stat_start
3687
|| prebuilt->select_lock_type != LOCK_NONE
3690
trx_start_if_not_started(trx);
3692
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3693
&& prebuilt->select_lock_type != LOCK_NONE
3694
&& trx->mysql_thd != NULL
3695
&& thd_is_select(trx->mysql_thd)) {
3696
/* It is a plain locking SELECT and the isolation
3697
level is low: do not lock gaps */
3699
set_also_gap_locks = FALSE;
3702
/* Note that if the search mode was GE or G, then the cursor
3703
naturally moves upward (in fetch next) in alphabetical order,
3704
otherwise downward */
3706
if (UNIV_UNLIKELY(direction == 0)) {
3707
if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3710
} else if (direction == ROW_SEL_NEXT) {
3714
thr = que_fork_get_first_thr(prebuilt->sel_graph);
3716
que_thr_move_to_run_state_for_mysql(thr, trx);
3718
clust_index = dict_table_get_first_index(index->table);
3720
/* Do some start-of-statement preparations */
3722
if (!prebuilt->sql_stat_start) {
3723
/* No need to set an intention lock or assign a read view */
3725
if (trx->read_view == NULL
3726
&& prebuilt->select_lock_type == LOCK_NONE) {
3728
fputs("InnoDB: Error: MySQL is trying to"
3729
" perform a consistent read\n"
3730
"InnoDB: but the read view is not assigned!\n",
3732
trx_print(stderr, trx, 600);
3733
fputc('\n', stderr);
3736
} else if (prebuilt->select_lock_type == LOCK_NONE) {
3737
/* This is a consistent read */
3738
/* Assign a read view for the query */
3740
trx_assign_read_view(trx);
3741
prebuilt->sql_stat_start = FALSE;
3744
err = lock_table(0, index->table,
3745
prebuilt->select_lock_type == LOCK_S
3746
? LOCK_IS : LOCK_IX, thr);
3748
if (err != DB_SUCCESS) {
3750
table_lock_waited = TRUE;
3751
goto lock_table_wait;
3753
prebuilt->sql_stat_start = FALSE;
3756
/* Open or restore index cursor position */
3758
if (UNIV_LIKELY(direction != 0)) {
3759
ibool need_to_process = sel_restore_position_for_mysql(
3760
&same_user_rec, BTR_SEARCH_LEAF,
3761
pcur, moves_up, &mtr);
3763
if (UNIV_UNLIKELY(need_to_process)) {
3764
if (UNIV_UNLIKELY(prebuilt->row_read_type
3765
== ROW_READ_DID_SEMI_CONSISTENT)) {
3766
/* We did a semi-consistent read,
3767
but the record was removed in
3769
prebuilt->row_read_type
3770
= ROW_READ_TRY_SEMI_CONSISTENT;
3772
} else if (UNIV_LIKELY(prebuilt->row_read_type
3773
!= ROW_READ_DID_SEMI_CONSISTENT)) {
3775
/* The cursor was positioned on the record
3776
that we returned previously. If we need
3777
to repeat a semi-consistent read as a
3778
pessimistic locking read, the record
3779
cannot be skipped. */
3784
} else if (dtuple_get_n_fields(search_tuple) > 0) {
3786
btr_pcur_open_with_no_init(index, search_tuple, mode,
3790
pcur->trx_if_known = trx;
3792
rec = btr_pcur_get_rec(pcur);
3795
&& !page_rec_is_supremum(rec)
3796
&& set_also_gap_locks
3797
&& !(srv_locks_unsafe_for_binlog
3798
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3799
&& prebuilt->select_lock_type != LOCK_NONE) {
3801
/* Try to place a gap lock on the next index record
3802
to prevent phantoms in ORDER BY ... DESC queries */
3803
const rec_t* next = page_rec_get_next_const(rec);
3805
offsets = rec_get_offsets(next, index, offsets,
3806
ULINT_UNDEFINED, &heap);
3807
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3808
next, index, offsets,
3809
prebuilt->select_lock_type,
3813
case DB_SUCCESS_LOCKED_REC:
3818
goto lock_wait_or_error;
3822
if (mode == PAGE_CUR_G) {
3823
btr_pcur_open_at_index_side(
3824
TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3826
} else if (mode == PAGE_CUR_L) {
3827
btr_pcur_open_at_index_side(
3828
FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3834
/*-------------------------------------------------------------*/
3835
/* PHASE 4: Look for matching records in a loop */
3837
rec = btr_pcur_get_rec(pcur);
3838
ut_ad(!!page_rec_is_comp(rec) == comp);
3839
#ifdef UNIV_SEARCH_DEBUG
3841
fputs("Using ", stderr);
3842
dict_index_name_print(stderr, index);
3843
fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3844
page_get_page_no(page_align(rec)));
3847
#endif /* UNIV_SEARCH_DEBUG */
3849
if (page_rec_is_infimum(rec)) {
3851
/* The infimum record on a page cannot be in the result set,
3852
and neither can a record lock be placed on it: we skip such
3858
if (page_rec_is_supremum(rec)) {
3860
if (set_also_gap_locks
3861
&& !(srv_locks_unsafe_for_binlog
3862
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
3863
&& prebuilt->select_lock_type != LOCK_NONE) {
3865
/* Try to place a lock on the index record */
3867
/* If innodb_locks_unsafe_for_binlog option is used
3868
or this session is using a READ COMMITTED isolation
3869
level we do not lock gaps. Supremum record is really
3870
a gap and therefore we do not set locks there. */
3872
offsets = rec_get_offsets(rec, index, offsets,
3873
ULINT_UNDEFINED, &heap);
3874
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3875
rec, index, offsets,
3876
prebuilt->select_lock_type,
3877
LOCK_ORDINARY, thr);
3880
case DB_SUCCESS_LOCKED_REC:
3885
goto lock_wait_or_error;
3888
/* A page supremum record cannot be in the result set: skip
3889
it now that we have placed a possible lock on it */
3894
/*-------------------------------------------------------------*/
3895
/* Do sanity checks in case our cursor has bumped into page
3899
next_offs = rec_get_next_offs(rec, TRUE);
3900
if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3905
next_offs = rec_get_next_offs(rec, FALSE);
3906
if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
3912
if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
3915
if (srv_force_recovery == 0 || moves_up == FALSE) {
3916
ut_print_timestamp(stderr);
3917
buf_page_print(page_align(rec), 0);
3919
"\nInnoDB: rec address %p,"
3920
" buf block fix count %lu\n",
3921
(void*) rec, (ulong)
3922
btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
3923
->page.buf_fix_count);
3925
"InnoDB: Index corruption: rec offs %lu"
3926
" next offs %lu, page no %lu,\n"
3928
(ulong) page_offset(rec),
3930
(ulong) page_get_page_no(page_align(rec)));
3931
dict_index_name_print(stderr, trx, index);
3932
fputs(". Run CHECK TABLE. You may need to\n"
3933
"InnoDB: restore from a backup, or"
3934
" dump + drop + reimport the table.\n",
3937
err = DB_CORRUPTION;
3939
goto lock_wait_or_error;
3941
/* The user may be dumping a corrupt table. Jump
3942
over the corruption to recover as much as possible. */
3945
"InnoDB: Index corruption: rec offs %lu"
3946
" next offs %lu, page no %lu,\n"
3948
(ulong) page_offset(rec),
3950
(ulong) page_get_page_no(page_align(rec)));
3951
dict_index_name_print(stderr, trx, index);
3952
fputs(". We try to skip the rest of the page.\n",
3955
btr_pcur_move_to_last_on_page(pcur, &mtr);
3960
/*-------------------------------------------------------------*/
3962
/* Calculate the 'offsets' associated with 'rec' */
3964
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3966
if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
3967
if (!rec_validate(rec, offsets)
3968
|| !btr_index_rec_validate(rec, index, FALSE)) {
3970
"InnoDB: Index corruption: rec offs %lu"
3971
" next offs %lu, page no %lu,\n"
3973
(ulong) page_offset(rec),
3975
(ulong) page_get_page_no(page_align(rec)));
3976
dict_index_name_print(stderr, trx, index);
3977
fputs(". We try to skip the record.\n",
3984
/* Note that we cannot trust the up_match value in the cursor at this
3985
place because we can arrive here after moving the cursor! Thus
3986
we have to recompare rec and search_tuple to determine if they
3989
if (match_mode == ROW_SEL_EXACT) {
3990
/* Test if the index record matches completely to search_tuple
3991
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
3993
/* fputs("Comparing rec and search tuple\n", stderr); */
3995
if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
3997
if (set_also_gap_locks
3998
&& !(srv_locks_unsafe_for_binlog
3999
|| trx->isolation_level
4000
<= TRX_ISO_READ_COMMITTED)
4001
&& prebuilt->select_lock_type != LOCK_NONE) {
4003
/* Try to place a gap lock on the index
4004
record only if innodb_locks_unsafe_for_binlog
4005
option is not set or this session is not
4006
using a READ COMMITTED isolation level. */
4008
err = sel_set_rec_lock(
4009
btr_pcur_get_block(pcur),
4010
rec, index, offsets,
4011
prebuilt->select_lock_type, LOCK_GAP,
4015
case DB_SUCCESS_LOCKED_REC:
4019
goto lock_wait_or_error;
4023
btr_pcur_store_position(pcur, &mtr);
4025
err = DB_RECORD_NOT_FOUND;
4026
/* ut_print_name(stderr, index->name);
4027
fputs(" record not found 3\n", stderr); */
4032
} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4034
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4036
if (set_also_gap_locks
4037
&& !(srv_locks_unsafe_for_binlog
4038
|| trx->isolation_level
4039
<= TRX_ISO_READ_COMMITTED)
4040
&& prebuilt->select_lock_type != LOCK_NONE) {
4042
/* Try to place a gap lock on the index
4043
record only if innodb_locks_unsafe_for_binlog
4044
option is not set or this session is not
4045
using a READ COMMITTED isolation level. */
4047
err = sel_set_rec_lock(
4048
btr_pcur_get_block(pcur),
4049
rec, index, offsets,
4050
prebuilt->select_lock_type, LOCK_GAP,
4054
case DB_SUCCESS_LOCKED_REC:
4058
goto lock_wait_or_error;
4062
btr_pcur_store_position(pcur, &mtr);
4064
err = DB_RECORD_NOT_FOUND;
4065
/* ut_print_name(stderr, index->name);
4066
fputs(" record not found 4\n", stderr); */
4072
/* We are ready to look at a possible new index entry in the result
4073
set: the cursor is now placed on a user record */
4075
if (prebuilt->select_lock_type != LOCK_NONE) {
4076
/* Try to place a lock on the index record; note that delete
4077
marked records are a special case in a unique search. If there
4078
is a non-delete marked record, then it is enough to lock its
4079
existence with LOCK_REC_NOT_GAP. */
4081
/* If innodb_locks_unsafe_for_binlog option is used
4082
or this session is using a READ COMMITTED isolation
4083
level we lock only the record, i.e., next-key locking is
4088
if (!set_also_gap_locks
4089
|| srv_locks_unsafe_for_binlog
4090
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED
4092
&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
4096
lock_type = LOCK_ORDINARY;
4099
/* If we are doing a 'greater or equal than a primary key
4100
value' search from a clustered index, and we find a record
4101
that has that exact primary key value, then there is no need
4102
to lock the gap before the record, because no insert in the
4103
gap can be in our search range. That is, no phantom row can
4106
An example: if col1 is the primary key, the search is WHERE
4107
col1 >= 100, and we find a record where col1 = 100, then no
4108
need to lock the gap before that record. */
4110
if (index == clust_index
4111
&& mode == PAGE_CUR_GE
4113
&& dtuple_get_n_fields_cmp(search_tuple)
4114
== dict_index_get_n_unique(index)
4115
&& 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4117
lock_type = LOCK_REC_NOT_GAP;
4120
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4121
rec, index, offsets,
4122
prebuilt->select_lock_type,
4126
const rec_t* old_vers;
4127
case DB_SUCCESS_LOCKED_REC:
4128
if (srv_locks_unsafe_for_binlog
4129
|| trx->isolation_level
4130
<= TRX_ISO_READ_COMMITTED) {
4131
/* Note that a record of
4132
prebuilt->index was locked. */
4133
prebuilt->new_rec_locks = 1;
4139
/* Never unlock rows that were part of a conflict. */
4140
prebuilt->new_rec_locks = 0;
4142
if (UNIV_LIKELY(prebuilt->row_read_type
4143
!= ROW_READ_TRY_SEMI_CONSISTENT)
4145
|| index != clust_index) {
4147
goto lock_wait_or_error;
4150
/* The following call returns 'offsets'
4151
associated with 'old_vers' */
4152
err = row_sel_build_committed_vers_for_mysql(
4153
clust_index, prebuilt, rec,
4154
&offsets, &heap, &old_vers, &mtr);
4157
case DB_SUCCESS_LOCKED_REC:
4162
goto lock_wait_or_error;
4165
mutex_enter(&kernel_mutex);
4166
if (trx->was_chosen_as_deadlock_victim) {
4167
mutex_exit(&kernel_mutex);
4170
goto lock_wait_or_error;
4172
if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4173
lock_cancel_waiting_and_release(
4176
mutex_exit(&kernel_mutex);
4178
/* The lock was granted while we were
4179
searching for the last committed version.
4180
Do a normal locking read. */
4182
offsets = rec_get_offsets(rec, index, offsets,
4188
mutex_exit(&kernel_mutex);
4190
if (old_vers == NULL) {
4191
/* The row was not yet committed */
4196
did_semi_consistent_read = TRUE;
4201
goto lock_wait_or_error;
4204
/* This is a non-locking consistent read: if necessary, fetch
4205
a previous version of the record */
4207
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4209
/* Do nothing: we let a non-locking SELECT read the
4210
latest version of the record */
4212
} else if (index == clust_index) {
4214
/* Fetch a previous version of the row if the current
4215
one is not visible in the snapshot; if we have a very
4216
high force recovery level set, we try to avoid crashes
4217
by skipping this lookup */
4219
if (UNIV_LIKELY(srv_force_recovery < 5)
4220
&& !lock_clust_rec_cons_read_sees(
4221
rec, index, offsets, trx->read_view)) {
4224
/* The following call returns 'offsets'
4225
associated with 'old_vers' */
4226
err = row_sel_build_prev_vers_for_mysql(
4227
trx->read_view, clust_index,
4228
prebuilt, rec, &offsets, &heap,
4232
case DB_SUCCESS_LOCKED_REC:
4236
goto lock_wait_or_error;
4239
if (old_vers == NULL) {
4240
/* The row did not exist yet in
4249
/* We are looking into a non-clustered index,
4250
and to get the right version of the record we
4251
have to look also into the clustered index: this
4252
is necessary, because we can only get the undo
4253
information via the clustered index record. */
4255
ut_ad(index != clust_index);
4256
ut_ad(!dict_index_is_clust(index));
4257
if (!lock_sec_rec_cons_read_sees(
4258
rec, trx->read_view)) {
4259
goto requires_clust_rec;
4264
/* NOTE that at this point rec can be an old version of a clustered
4265
index record built for a consistent read. We cannot assume after this
4266
point that rec is on a buffer pool page. Functions like
4267
page_rec_is_comp() cannot be used! */
4269
if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4271
/* The record is delete-marked: we can skip it */
4273
if ((srv_locks_unsafe_for_binlog
4274
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4275
&& prebuilt->select_lock_type != LOCK_NONE
4276
&& !did_semi_consistent_read) {
4278
/* No need to keep a lock on a delete-marked record
4279
if we do not want to use next-key locking. */
4281
row_unlock_for_mysql(prebuilt, TRUE);
4284
/* This is an optimization to skip setting the next key lock
4285
on the record that follows this delete-marked record. This
4286
optimization works because of the unique search criteria
4287
which precludes the presence of a range lock between this
4288
delete marked record and the record following it.
4290
For now this is applicable only to clustered indexes while
4291
doing a unique search. There is scope for further optimization
4292
applicable to unique secondary indexes. Current behaviour is
4293
to widen the scope of a lock on an already delete marked record
4294
if the same record is deleted twice by the same transaction */
4295
if (index == clust_index && unique_search) {
4296
err = DB_RECORD_NOT_FOUND;
4304
/* Get the clustered index record if needed, if we did not do the
4305
search using the clustered index. */
4307
if (index != clust_index && prebuilt->need_to_access_clustered) {
4310
/* We use a 'goto' to the preceding label if a consistent
4311
read of a secondary index record requires us to look up old
4312
versions of the associated clustered index record. */
4314
ut_ad(rec_offs_validate(rec, index, offsets));
4316
/* It was a non-clustered index and we must fetch also the
4317
clustered index record */
4319
mtr_has_extra_clust_latch = TRUE;
4321
/* The following call returns 'offsets' associated with
4322
'clust_rec'. Note that 'clust_rec' can be an old version
4323
built for a consistent read. */
4325
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4327
&offsets, &heap, &mtr);
4330
if (clust_rec == NULL) {
4331
/* The record did not exist in the read view */
4332
ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4337
case DB_SUCCESS_LOCKED_REC:
4338
ut_a(clust_rec != NULL);
4339
if (srv_locks_unsafe_for_binlog
4340
|| trx->isolation_level
4341
<= TRX_ISO_READ_COMMITTED) {
4342
/* Note that the clustered index record
4344
prebuilt->new_rec_locks = 2;
4349
goto lock_wait_or_error;
4352
if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4354
/* The record is delete marked: we can skip it */
4356
if ((srv_locks_unsafe_for_binlog
4357
|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
4358
&& prebuilt->select_lock_type != LOCK_NONE) {
4360
/* No need to keep a lock on a delete-marked
4361
record if we do not want to use next-key
4364
row_unlock_for_mysql(prebuilt, TRUE);
4370
if (prebuilt->need_to_access_clustered) {
4372
result_rec = clust_rec;
4374
ut_ad(rec_offs_validate(result_rec, clust_index,
4377
/* We used 'offsets' for the clust rec, recalculate
4379
offsets = rec_get_offsets(rec, index, offsets,
4380
ULINT_UNDEFINED, &heap);
4384
/* result_rec can legitimately be delete-marked
4385
now that it has been established that it points to a
4386
clustered index record that exists in the read view. */
4389
ut_ad(!rec_get_deleted_flag(rec, comp));
4392
/* We found a qualifying record 'result_rec'. At this point,
4393
'offsets' are associated with 'result_rec'. */
4395
ut_ad(rec_offs_validate(result_rec,
4396
result_rec != rec ? clust_index : index,
4399
/* At this point, the clustered index record is protected
4400
by a page latch that was acquired when pcur was positioned.
4401
The latch will not be released until mtr_commit(&mtr). */
4403
if ((match_mode == ROW_SEL_EXACT
4404
|| prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4405
&& prebuilt->select_lock_type == LOCK_NONE
4406
&& !prebuilt->templ_contains_blob
4407
&& !prebuilt->clust_index_was_generated
4408
&& prebuilt->template_type
4409
!= ROW_MYSQL_DUMMY_TEMPLATE) {
4411
/* Inside an update, for example, we do not cache rows,
4412
since we may use the cursor position to do the actual
4413
update, that is why we require ...lock_type == LOCK_NONE.
4414
Since we keep space in prebuilt only for the BLOBs of
4415
a single row, we cannot cache rows in the case there
4416
are BLOBs in the fields to be fetched. In HANDLER we do
4417
not cache rows because there the cursor is a scrollable
4420
if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4422
/* Only fresh inserts may contain incomplete
4423
externally stored columns. Pretend that such
4424
records do not exist. Such records may only be
4425
accessed at the READ UNCOMMITTED isolation
4426
level or when rolling back a recovered
4427
transaction. Rollback happens at a lower
4429
ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
4430
} else if (prebuilt->n_fetch_cached
4431
== MYSQL_FETCH_CACHE_SIZE) {
4438
if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
4439
memcpy(buf + 4, result_rec
4440
- rec_offs_extra_size(offsets),
4441
rec_offs_size(offsets));
4442
mach_write_to_4(buf,
4443
rec_offs_extra_size(offsets) + 4);
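/* Layout of buf in the ROW_MYSQL_DUMMY_TEMPLATE case (sketch added for
clarity, not in the original source):

buf[0..3]   rec_offs_extra_size(offsets) + 4, i.e. the offset of the
            record origin within buf
buf[4..]    the raw physical record, extra (header) bytes first and
            then the data bytes

so the caller can find the record origin at buf plus the value stored
in the first 4 bytes. */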
4445
if (!row_sel_store_mysql_rec(buf, prebuilt,
4446
result_rec, offsets)) {
4447
/* Only fresh inserts may contain
4448
incomplete externally stored
4449
columns. Pretend that such records do
4450
not exist. Such records may only be
4451
accessed at the READ UNCOMMITTED
4452
isolation level or when rolling back a
4453
recovered transaction. Rollback
4454
happens at a lower level, not here. */
4455
ut_a(trx->isolation_level
4456
== TRX_ISO_READ_UNCOMMITTED);
4461
if (prebuilt->clust_index_was_generated) {
4462
if (result_rec != rec) {
4463
offsets = rec_get_offsets(
4464
rec, index, offsets, ULINT_UNDEFINED,
4467
row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4472
/* From this point on, 'offsets' are invalid. */
4475
/* We have an optimization to save CPU time: if this is a consistent
4476
read on a unique condition on the clustered index, then we do not
4477
store the pcur position, because any fetch next or prev will anyway
4478
return 'end of file'. Exceptions are locking reads and the MySQL
4479
HANDLER command where the user can move the cursor with PREV or NEXT
4480
even after a unique search. */
4482
if (!unique_search_from_clust_index
4483
|| prebuilt->select_lock_type != LOCK_NONE) {
4485
/* Inside an update always store the cursor position */
4487
btr_pcur_store_position(pcur, &mtr);
4495
/* Reset the old and new "did semi-consistent read" flags. */
4496
if (UNIV_UNLIKELY(prebuilt->row_read_type
4497
== ROW_READ_DID_SEMI_CONSISTENT)) {
4498
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4500
did_semi_consistent_read = FALSE;
4501
prebuilt->new_rec_locks = 0;
4503
/*-------------------------------------------------------------*/
4504
/* PHASE 5: Move the cursor to the next index record */
4506
if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4507
/* We must commit mtr if we are moving to the next
4508
non-clustered index record, because we could break the
4509
latching order if we would access a different clustered
4510
index page right away without releasing the previous. */
4512
btr_pcur_store_position(pcur, &mtr);
4515
mtr_has_extra_clust_latch = FALSE;
4518
if (sel_restore_position_for_mysql(&same_user_rec,
4520
pcur, moves_up, &mtr)) {
4521
#ifdef UNIV_SEARCH_DEBUG
4523
#endif /* UNIV_SEARCH_DEBUG */
4530
if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4532
btr_pcur_store_position(pcur, &mtr);
4534
if (match_mode != 0) {
4535
err = DB_RECORD_NOT_FOUND;
4537
err = DB_END_OF_INDEX;
4543
if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4548
#ifdef UNIV_SEARCH_DEBUG
4550
#endif /* UNIV_SEARCH_DEBUG */

	goto rec_loop;

lock_wait_or_error:
	/* Reset the old and new "did semi-consistent read" flags. */
	if (UNIV_UNLIKELY(prebuilt->row_read_type
			  == ROW_READ_DID_SEMI_CONSISTENT)) {
		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
	}
	did_semi_consistent_read = FALSE;

	/*-------------------------------------------------------------*/

	btr_pcur_store_position(pcur, &mtr);

lock_table_wait:
	mtr_commit(&mtr);
	mtr_has_extra_clust_latch = FALSE;

	trx->error_state = err;

	/* The following is a patch for MySQL */

	que_thr_stop_for_mysql(thr);

	thr->lock_state = QUE_THR_LOCK_ROW;

	if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
		/* It was a lock wait, and it ended */

		thr->lock_state = QUE_THR_LOCK_NOLOCK;
		mtr_start(&mtr);

		/* Table lock waited, go try to obtain table lock
		again */
		if (table_lock_waited) {
			table_lock_waited = FALSE;

			goto wait_table_again;
		}

		sel_restore_position_for_mysql(&same_user_rec,
					       BTR_SEARCH_LEAF, pcur,
					       moves_up, &mtr);

		if ((srv_locks_unsafe_for_binlog
		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
		    && !same_user_rec) {

			/* Since we were not able to restore the cursor
			on the same user record, we cannot use
			row_unlock_for_mysql() to unlock any records, and
			we must thus reset the new rec lock info. Since
			in lock0lock.c we have blocked the inheriting of gap
			X-locks, we actually do not have any new record locks
			set in this case.

			Note that if we were able to restore on the 'same'
			user record, it is still possible that we were actually
			waiting on a delete-marked record, and meanwhile
			it was removed by purge and inserted again by some
			other user. But that is no problem, because in
			rec_loop we will again try to set a lock, and
			new_rec_lock_info in trx will be right at the end. */

			prebuilt->new_rec_locks = 0;
		}

		mode = pcur->search_mode;

		goto rec_loop;
	}

	thr->lock_state = QUE_THR_LOCK_NOLOCK;

#ifdef UNIV_SEARCH_DEBUG
	/* fputs("Using ", stderr);
	dict_index_name_print(stderr, index);
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
#endif /* UNIV_SEARCH_DEBUG */
	goto func_exit;

normal_return:
	/*-------------------------------------------------------------*/
	que_thr_stop_for_mysql_no_error(thr, trx);

	mtr_commit(&mtr);

	if (prebuilt->n_fetch_cached > 0) {
		row_sel_pop_cached_row_for_mysql(buf, prebuilt);

		err = DB_SUCCESS;
	}

#ifdef UNIV_SEARCH_DEBUG
	/* fputs("Using ", stderr);
	dict_index_name_print(stderr, index);
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
#endif /* UNIV_SEARCH_DEBUG */
4650
if (err == DB_SUCCESS) {
4656
if (UNIV_LIKELY_NULL(heap)) {
4657
mem_heap_free(heap);
4660
/* Set or reset the "did semi-consistent read" flag on return.
4661
The flag did_semi_consistent_read is set if and only if
4662
the record being returned was fetched with a semi-consistent read. */
4663
ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
4664
|| !did_semi_consistent_read);
4666
if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
4667
if (UNIV_UNLIKELY(did_semi_consistent_read)) {
4668
prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
4670
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
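
/* The sketch below is not part of the original source; it is kept inside an
#if 0 block purely to illustrate how a caller typically drives
row_search_for_mysql(): direction == 0 on the first fetch and ROW_SEL_NEXT
(or ROW_SEL_PREV) afterwards, with DB_RECORD_NOT_FOUND / DB_END_OF_INDEX
signalling that the scan is exhausted. The function and variable names in
the sketch are invented for the example; the real callers live in the MySQL
handler layer (ha_innodb.cc). */
#if 0
static
ulint
example_fetch_all_rows(
/*===================*/
	byte*		mysql_rec_buf,	/* out: row in MySQL format */
	row_prebuilt_t*	prebuilt)	/* in/out: prebuilt struct */
{
	ulint	err;
	ulint	direction = 0;	/* 0 means: position the cursor first */

	for (;;) {
		err = row_search_for_mysql(mysql_rec_buf, PAGE_CUR_G,
					   prebuilt, 0 /* no exact match */,
					   direction);
		if (err != DB_SUCCESS) {
			/* DB_RECORD_NOT_FOUND and DB_END_OF_INDEX mean
			the scan ended normally; other codes are errors
			(lock wait timeout, deadlock, ...). */
			break;
		}

		/* ... use the row now stored in mysql_rec_buf ... */

		direction = ROW_SEL_NEXT;	/* continue the scan */
	}

	return(err);
}
#endif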
/*******************************************************************//**
Checks if MySQL at the moment is allowed for this table to retrieve a
consistent read result, or store it to the query cache.
@return TRUE if storing or retrieving from the query cache is permitted */
UNIV_INTERN
ibool
row_search_check_if_query_cache_permitted(
/*======================================*/
	trx_t*		trx,		/*!< in: transaction object */
	const char*	norm_name)	/*!< in: concatenation of database name,
					'/' char, table name */
{
	dict_table_t*	table;
	ibool		ret	= FALSE;

	table = dict_table_get(norm_name, FALSE);

	if (table == NULL) {

		return(FALSE);
	}

	mutex_enter(&kernel_mutex);

	/* Start the transaction if it is not started yet */

	trx_start_if_not_started_low(trx);

	/* If there are locks on the table or some trx has invalidated the
	cache up to our trx id, then ret = FALSE.
	We do not check what type locks there are on the table, though only
	IX type locks actually would require ret = FALSE. */

	if (UT_LIST_GET_LEN(table->locks) == 0
	    && trx->id >= table->query_cache_inv_trx_id) {

		ret = TRUE;

		/* If the isolation level is high, assign a read view for the
		transaction if it does not yet have one */

		if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
		    && !trx->read_view) {

			trx->read_view = read_view_open_now(
				trx->id, trx->global_read_view_heap);
			trx->global_read_view = trx->read_view;
		}
	}

	mutex_exit(&kernel_mutex);

	return(ret);
}
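
/* Illustrative sketch, not part of the original source: the check above is
consulted by the server layer before it stores a result in, or serves a
result from, the MySQL query cache. The wrapper name, the buffer size and
the use of ut_snprintf() to build the "<database>/<table>" name are
assumptions made only for this example. */
#if 0
static
ibool
example_query_caching_permitted(
/*============================*/
	trx_t*		trx,		/* in: transaction */
	const char*	db_name,	/* in: database name */
	const char*	table_name)	/* in: table name */
{
	char	norm_name[1000];

	/* row_search_check_if_query_cache_permitted() expects the
	concatenation "<database>/<table>". */
	ut_snprintf(norm_name, sizeof(norm_name), "%s/%s",
		    db_name, table_name);

	return(row_search_check_if_query_cache_permitted(trx, norm_name));
}
#endif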
/*******************************************************************//**
Read the AUTOINC column from the current row. If the value is less than
0 and the type is not unsigned then we reset the value to 0.
@return value read from the column */
static
ib_uint64_t
row_search_autoinc_read_column(
/*===========================*/
	dict_index_t*	index,		/*!< in: index to read from */
	const rec_t*	rec,		/*!< in: current rec */
	ulint		col_no,		/*!< in: column number */
	ulint		mtype,		/*!< in: column main type */
	ibool		unsigned_type)	/*!< in: signed or unsigned flag */
{
	ulint		len;
	const byte*	data;
	ib_uint64_t	value;
	mem_heap_t*	heap = NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;

	rec_offs_init(offsets_);

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	data = rec_get_nth_field(rec, offsets, col_no, &len);

	ut_a(len != UNIV_SQL_NULL);

	switch (mtype) {
	case DATA_INT:
		ut_a(len <= sizeof value);
		value = mach_read_int_type(data, len, unsigned_type);
		break;

	case DATA_FLOAT:
		ut_a(len == sizeof(float));
		value = (ib_uint64_t) mach_float_read(data);
		break;

	case DATA_DOUBLE:
		ut_a(len == sizeof(double));
		value = (ib_uint64_t) mach_double_read(data);
		break;

	default:
		ut_error;
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (!unsigned_type && (ib_int64_t) value < 0) {
		value = 0;
	}

	return(value);
}
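
/* Illustrative fragment, not part of the original source: it only shows the
effect of the final clamp above. Reading a signed DATA_INT auto-increment
column that happens to contain a negative value yields 0, while an unsigned
column keeps its raw value. The column number used below is made up for the
example. */
#if 0
	{
		ib_uint64_t	v;

		/* signed INT column: a stored value of, say, -5 comes
		back as 0 because of the (ib_int64_t) value < 0 clamp */
		v = row_search_autoinc_read_column(index, rec,
						   0 /* col_no */,
						   DATA_INT,
						   FALSE /* signed */);
		ut_ad(v == 0 || (ib_int64_t) v > 0);
	}
#endif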
/*******************************************************************//**
Get the last row.
@return current rec or NULL */
static
const rec_t*
row_search_autoinc_get_rec(
/*=======================*/
	btr_pcur_t*	pcur,		/*!< in: the current cursor */
	mtr_t*		mtr)		/*!< in: mini transaction */
{
	do {
		const rec_t* rec = btr_pcur_get_rec(pcur);

		if (page_rec_is_user_rec(rec)) {
			return(rec);
		}
	} while (btr_pcur_move_to_prev(pcur, mtr));

	return(NULL);
}
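
/* Illustrative sketch, not part of the original source: it shows how the
value produced by row_search_max_autoinc() (defined below) is typically
installed into the table's in-memory AUTOINC counter. The use of the
dict_table_autoinc_*() calls and the "+ 1" step follow the usual handler-
layer pattern, but both are assumptions of this example, as is the column
name "id". */
#if 0
	{
		ib_uint64_t	max_value = 0;
		ulint		err;

		err = row_search_max_autoinc(index, "id", &max_value);

		if (err == DB_SUCCESS) {
			dict_table_autoinc_lock(table);
			dict_table_autoinc_initialize(table, max_value + 1);
			dict_table_autoinc_unlock(table);
		}
	}
#endif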
/*******************************************************************//**
Read the max AUTOINC value from an index.
@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
column name can't be found in index */
UNIV_INTERN
ulint
row_search_max_autoinc(
/*===================*/
	dict_index_t*	index,		/*!< in: index to search */
	const char*	col_name,	/*!< in: name of autoinc column */
	ib_uint64_t*	value)		/*!< out: AUTOINC value read */
{
	ulint		i;
	ulint		n_cols;
	dict_field_t*	dfield = NULL;
	ulint		error = DB_SUCCESS;

	n_cols = dict_index_get_n_ordering_defined_by_user(index);

	/* Search the index for the AUTOINC column name */
	for (i = 0; i < n_cols; ++i) {
		dfield = dict_index_get_nth_field(index, i);

		if (strcmp(col_name, dfield->name) == 0) {
			break;
		}
	}

	*value = 0;

	/* Must find the AUTOINC column name */
	if (i < n_cols && dfield) {
		mtr_t		mtr;
		btr_pcur_t	pcur;

		mtr_start(&mtr);

		/* Open at the high/right end (FALSE), and INIT
		cursor (TRUE) */
		btr_pcur_open_at_index_side(
			FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);

		if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
			const rec_t*	rec;

			rec = row_search_autoinc_get_rec(&pcur, &mtr);

			if (rec != NULL) {
				ibool unsigned_type = (
					dfield->col->prtype & DATA_UNSIGNED);

				*value = row_search_autoinc_read_column(
					index, rec, i,
					dfield->col->mtype, unsigned_type);
			}
		}

		btr_pcur_close(&pcur);

		mtr_commit(&mtr);
	} else {
		error = DB_RECORD_NOT_FOUND;