/*******************************************************
Created 12/19/1997 Heikki Tuuri
*******************************************************/

#include "row0sel.h"
#include "dict0dict.h"
#include "dict0boot.h"
#include "mach0data.h"
#include "lock0lock.h"
#include "eval0eval.h"
#include "pars0pars.h"
#include "row0mysql.h"
#include "read0read.h"

/* Maximum number of rows to prefetch; MySQL interface has another parameter */
#define SEL_MAX_N_PREFETCH	16

/* Number of rows fetched, after which to start prefetching; MySQL interface
has another parameter */
#define SEL_PREFETCH_LIMIT	1

/* When a select has accessed about this many pages, it returns control back
to que_run_threads: this is to allow canceling runaway queries */
#define SEL_COST_LIMIT	100

/* Flags for search shortcut */
#define SEL_FOUND	0
#define SEL_EXHAUSTED	1
#define SEL_RETRY	2
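/* Note on how these limits interact (a summary of the fetch loop further
below): prefetching for a table starts only once more than
SEL_PREFETCH_LIMIT rows have been fetched from it, after which up to
SEL_MAX_N_PREFETCH rows may be cached per mini-transaction; independently,
cost_counter is compared to SEL_COST_LIMIT so that a long scan periodically
returns control to que_run_threads. */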
/************************************************************************
Returns TRUE if the user-defined column in a secondary index record
is alphabetically the same as the corresponding BLOB column in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation! */
static
ibool
row_sel_sec_rec_is_for_blob(
/*========================*/
					/* out: TRUE if the columns
					are equal */
	ulint		mtype,		/* in: main type */
	ulint		prtype,		/* in: precise type */
	ulint		mbminlen,	/* in: minimum length of a
					multi-byte character */
	ulint		mbmaxlen,	/* in: maximum length of a
					multi-byte character */
	const byte*	clust_field,	/* in: the locally stored part of
					the clustered index column, including
					the BLOB pointer; the clustered
					index record must be covered by
					a lock or a page latch to protect it
					against deletion (rollback or purge) */
	ulint		clust_len,	/* in: length of clust_field */
	const byte*	sec_field,	/* in: column in secondary index */
	ulint		sec_len,	/* in: length of sec_field */
	ulint		zip_size)	/* in: compressed page size, or 0 */
{
	ulint	len;
	byte	buf[DICT_MAX_INDEX_COL_LEN];

	len = btr_copy_externally_stored_field_prefix(buf, sizeof buf,
						      zip_size,
						      clust_field, clust_len);

	len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
					  sec_len, len, (const char*) buf);

	return(!cmp_data_data(mtype, prtype, buf, len, sec_field, sec_len));
}
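/* Sketch of the intended use, as suggested by the code above: the caller
passes in clust_field the locally stored prefix of the externally stored
column. btr_copy_externally_stored_field_prefix() copies at most
sizeof(buf) bytes of that prefix, dtype_get_at_most_n_mbchars() then clamps
the copy to whole characters not exceeding the secondary column length, and
cmp_data_data() performs the collation-aware comparison. */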
/************************************************************************
Returns TRUE if the user-defined column values in a secondary index record
are alphabetically the same as the corresponding columns in the clustered
index record.
NOTE: the comparison is NOT done as a binary comparison, but character
fields are compared with collation! */
static
ibool
row_sel_sec_rec_is_for_clust_rec(
/*=============================*/
					/* out: TRUE if the secondary
					record is equal to the corresponding
					fields in the clustered record,
					when compared with collation */
	const rec_t*	sec_rec,	/* in: secondary index record */
	dict_index_t*	sec_index,	/* in: secondary index */
	const rec_t*	clust_rec,	/* in: clustered index record;
					must be protected by a lock or
					a page latch against deletion
					in rollback or purge */
	dict_index_t*	clust_index)	/* in: clustered index */
{
	const byte*	sec_field;
	ulint		sec_len;
	const byte*	clust_field;
	ulint		n;
	ulint		i;
	mem_heap_t*	heap		= NULL;
	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
	ulint*		clust_offs	= clust_offsets_;
	ulint*		sec_offs	= sec_offsets_;
	ibool		is_equal	= TRUE;

	rec_offs_init(clust_offsets_);
	rec_offs_init(sec_offsets_);

	if (rec_get_deleted_flag(clust_rec,
				 dict_table_is_comp(clust_index->table))) {

		/* The clustered index record is delete-marked;
		it is not visible in the read view. Besides,
		if there are any externally stored columns,
		some of them may have already been purged. */

		return(FALSE);
	}

	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
				     ULINT_UNDEFINED, &heap);
	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
				   ULINT_UNDEFINED, &heap);

	n = dict_index_get_n_ordering_defined_by_user(sec_index);

	for (i = 0; i < n; i++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			clust_pos;
		ulint			clust_len;
		ulint			len;

		ifield = dict_index_get_nth_field(sec_index, i);
		col = dict_field_get_col(ifield);
		clust_pos = dict_col_get_clust_pos(col, clust_index);

		clust_field = rec_get_nth_field(
			clust_rec, clust_offs, clust_pos, &clust_len);
		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);

		len = clust_len;

		if (ifield->prefix_len > 0 && len != UNIV_SQL_NULL) {

			if (rec_offs_nth_extern(clust_offs, clust_pos)) {
				len -= BTR_EXTERN_FIELD_REF_SIZE;
			}

			len = dtype_get_at_most_n_mbchars(
				col->prtype, col->mbminlen, col->mbmaxlen,
				ifield->prefix_len, len, (char*) clust_field);

			if (rec_offs_nth_extern(clust_offs, clust_pos)
			    && len < sec_len) {
				if (!row_sel_sec_rec_is_for_blob(
					    col->mtype, col->prtype,
					    col->mbminlen, col->mbmaxlen,
					    clust_field, clust_len,
					    sec_field, sec_len,
					    dict_table_zip_size(
						    clust_index->table))) {
					goto inequal;
				}

				continue;
			}
		}

		if (0 != cmp_data_data(col->mtype, col->prtype,
				       clust_field, len,
				       sec_field, sec_len)) {
inequal:
			is_equal = FALSE;
			goto func_exit;
		}
	}

func_exit:
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(is_equal);
}
/*************************************************************************
Creates a select node struct. */

				/* out, own: select node struct */
	mem_heap_t*	heap)	/* in: memory heap where created */
	node = mem_heap_alloc(heap, sizeof(sel_node_t));
	node->common.type = QUE_NODE_SELECT;
	node->state = SEL_NODE_OPEN;

	node->select_will_do_update = FALSE;
	node->latch_mode = BTR_SEARCH_LEAF;

/*************************************************************************
Frees the memory private to a select node when a query graph is freed,
does not free the heap where the node was originally created. */
sel_node_free_private(
/*==================*/
	sel_node_t*	node)	/* in: select node struct */
	if (node->plans != NULL) {
		for (i = 0; i < node->n_tables; i++) {
			plan = sel_node_get_nth_plan(node, i);

			btr_pcur_close(&(plan->pcur));
			btr_pcur_close(&(plan->clust_pcur));

			if (plan->old_vers_heap) {
				mem_heap_free(plan->old_vers_heap);

/*************************************************************************
Evaluates the values in a select list. If there are aggregate functions,
their argument value is added to the aggregate total. */
sel_eval_select_list(
/*=================*/
	sel_node_t*	node)	/* in: select node */
	exp = node->select_list;

		exp = que_node_get_next(exp);

/*************************************************************************
Assigns the values in the select list to the possible into-variables in
SELECT ... INTO ... */
sel_assign_into_var_values(
/*=======================*/
	sym_node_t*	var,	/* in: first variable in a list of variables */
	sel_node_t*	node)	/* in: select node */
	exp = node->select_list;

		eval_node_copy_val(var->alias, exp);

		exp = que_node_get_next(exp);
		var = que_node_get_next(var);

/*************************************************************************
Resets the aggregate value totals in the select list of an aggregate type
query. */
sel_reset_aggregate_vals(
/*=====================*/
	sel_node_t*	node)	/* in: select node */
	func_node_t*	func_node;

	ut_ad(node->is_aggregate);

	func_node = node->select_list;

		eval_node_set_int_val(func_node, 0);

		func_node = que_node_get_next(func_node);

	node->aggregate_already_fetched = FALSE;
/*************************************************************************
330
Copies the input variable values when an explicit cursor is opened. */
333
row_sel_copy_input_variable_vals(
334
/*=============================*/
335
sel_node_t* node) /* in: select node */
339
var = UT_LIST_GET_FIRST(node->copy_variables);
342
eval_node_copy_val(var, var->alias);
344
var->indirection = NULL;
346
var = UT_LIST_GET_NEXT(col_var_list, var);
350
/*************************************************************************
351
Fetches the column values from a record. */
354
row_sel_fetch_columns(
355
/*==================*/
356
dict_index_t* index, /* in: record index */
357
const rec_t* rec, /* in: record in a clustered or non-clustered
358
index; must be protected by a page latch */
359
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
360
sym_node_t* column) /* in: first column in a column list, or
369
ut_ad(rec_offs_validate(rec, index, offsets));
371
if (dict_index_is_clust(index)) {
372
index_type = SYM_CLUST_FIELD_NO;
374
index_type = SYM_SEC_FIELD_NO;
378
mem_heap_t* heap = NULL;
381
field_no = column->field_nos[index_type];
383
if (field_no != ULINT_UNDEFINED) {
385
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
388
/* Copy an externally stored field to the
391
heap = mem_heap_create(1);
393
data = btr_rec_copy_externally_stored_field(
395
dict_table_zip_size(index->table),
396
field_no, &len, heap);
398
ut_a(len != UNIV_SQL_NULL);
402
data = rec_get_nth_field(rec, offsets,
405
if (len == UNIV_SQL_NULL) {
409
needs_copy = column->copy_val;
413
eval_node_copy_and_alloc_val(column, data,
416
val = que_node_get_val(column);
417
dfield_set_data(val, data, len);
420
if (UNIV_LIKELY_NULL(heap)) {
425
column = UT_LIST_GET_NEXT(col_var_list, column);
429
/*************************************************************************
430
Allocates a prefetch buffer for a column when prefetch is first time done. */
433
sel_col_prefetch_buf_alloc(
434
/*=======================*/
435
sym_node_t* column) /* in: symbol table node for a column */
440
ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
442
column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
443
* sizeof(sel_buf_t));
444
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
445
sel_buf = column->prefetch_buf + i;
447
sel_buf->data = NULL;
449
sel_buf->val_buf_size = 0;
453
/*************************************************************************
454
Frees a prefetch buffer for a column, including the dynamically allocated
455
memory for data stored there. */
458
sel_col_prefetch_buf_free(
459
/*======================*/
460
sel_buf_t* prefetch_buf) /* in, own: prefetch buffer */
465
for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
466
sel_buf = prefetch_buf + i;
468
if (sel_buf->val_buf_size > 0) {
470
mem_free(sel_buf->data);
475
/*************************************************************************
476
Pops the column values for a prefetched, cached row from the column prefetch
477
buffers and places them to the val fields in the column nodes. */
480
sel_pop_prefetched_row(
481
/*===================*/
482
plan_t* plan) /* in: plan node for a table */
491
ut_ad(plan->n_rows_prefetched > 0);
493
column = UT_LIST_GET_FIRST(plan->columns);
496
val = que_node_get_val(column);
498
if (!column->copy_val) {
499
/* We did not really push any value for the
502
ut_ad(!column->prefetch_buf);
503
ut_ad(que_node_get_val_buf_size(column) == 0);
504
ut_d(dfield_set_null(val));
509
ut_ad(column->prefetch_buf);
510
ut_ad(!dfield_is_ext(val));
512
sel_buf = column->prefetch_buf + plan->first_prefetched;
514
data = sel_buf->data;
516
val_buf_size = sel_buf->val_buf_size;
518
/* We must keep track of the allocated memory for
519
column values to be able to free it later: therefore
520
we swap the values for sel_buf and val */
522
sel_buf->data = dfield_get_data(val);
523
sel_buf->len = dfield_get_len(val);
524
sel_buf->val_buf_size = que_node_get_val_buf_size(column);
526
dfield_set_data(val, data, len);
527
que_node_set_val_buf_size(column, val_buf_size);
529
column = UT_LIST_GET_NEXT(col_var_list, column);
532
plan->n_rows_prefetched--;
534
plan->first_prefetched++;
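	/* Descriptive note: popping a prefetched row does not copy the
	column value; the data pointer, length and buffer size are swapped
	between the sel_buf_t slot and the column's val field, so ownership
	of any dynamically allocated buffer moves with the pointer and can
	later be released by sel_col_prefetch_buf_free(). */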
537
/*************************************************************************
538
Pushes the column values for a prefetched, cached row to the column prefetch
539
buffers from the val fields in the column nodes. */
542
sel_push_prefetched_row(
543
/*====================*/
544
plan_t* plan) /* in: plan node for a table */
554
if (plan->n_rows_prefetched == 0) {
556
plan->first_prefetched = 0;
558
pos = plan->n_rows_prefetched;
560
/* We have the convention that pushing new rows starts only
561
after the prefetch stack has been emptied: */
563
ut_ad(plan->first_prefetched == 0);
566
plan->n_rows_prefetched++;
568
ut_ad(pos < SEL_MAX_N_PREFETCH);
570
column = UT_LIST_GET_FIRST(plan->columns);
573
if (!column->copy_val) {
574
/* There is no sense to push pointers to database
575
page fields when we do not keep latch on the page! */
580
if (!column->prefetch_buf) {
581
/* Allocate a new prefetch buffer */
583
sel_col_prefetch_buf_alloc(column);
586
sel_buf = column->prefetch_buf + pos;
588
val = que_node_get_val(column);
590
data = dfield_get_data(val);
591
len = dfield_get_len(val);
592
val_buf_size = que_node_get_val_buf_size(column);
594
/* We must keep track of the allocated memory for
595
column values to be able to free it later: therefore
596
we swap the values for sel_buf and val */
598
dfield_set_data(val, sel_buf->data, sel_buf->len);
599
que_node_set_val_buf_size(column, sel_buf->val_buf_size);
601
sel_buf->data = data;
603
sel_buf->val_buf_size = val_buf_size;
605
column = UT_LIST_GET_NEXT(col_var_list, column);
609
/*************************************************************************
610
Builds a previous version of a clustered index record for a consistent read */
613
row_sel_build_prev_vers(
614
/*====================*/
615
/* out: DB_SUCCESS or error code */
616
read_view_t* read_view, /* in: read view */
617
dict_index_t* index, /* in: plan node for table */
618
rec_t* rec, /* in: record in a clustered index */
619
ulint** offsets, /* in/out: offsets returned by
620
rec_get_offsets(rec, plan->index) */
621
mem_heap_t** offset_heap, /* in/out: memory heap from which
622
the offsets are allocated */
623
mem_heap_t** old_vers_heap, /* out: old version heap to use */
624
rec_t** old_vers, /* out: old version, or NULL if the
625
record does not exist in the view:
626
i.e., it was freshly inserted
628
mtr_t* mtr) /* in: mtr */
632
if (*old_vers_heap) {
633
mem_heap_empty(*old_vers_heap);
635
*old_vers_heap = mem_heap_create(512);
638
err = row_vers_build_for_consistent_read(
639
rec, mtr, index, offsets, read_view, offset_heap,
640
*old_vers_heap, old_vers);
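	/* Call pattern used later in this file: a consistent read first
	checks lock_clust_rec_cons_read_sees(); only when the record version
	is too new does it call row_sel_build_prev_vers() to obtain the
	version visible in the read view, which may be NULL if the row was
	inserted after the read view was created. */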
644
/*************************************************************************
645
Builds the last committed version of a clustered index record for a
646
semi-consistent read. */
649
row_sel_build_committed_vers_for_mysql(
650
/*===================================*/
651
/* out: DB_SUCCESS or error code */
652
dict_index_t* clust_index, /* in: clustered index */
653
row_prebuilt_t* prebuilt, /* in: prebuilt struct */
654
const rec_t* rec, /* in: record in a clustered index */
655
ulint** offsets, /* in/out: offsets returned by
656
rec_get_offsets(rec, clust_index) */
657
mem_heap_t** offset_heap, /* in/out: memory heap from which
658
the offsets are allocated */
659
const rec_t** old_vers, /* out: old version, or NULL if the
660
record does not exist in the view:
661
i.e., it was freshly inserted
663
mtr_t* mtr) /* in: mtr */
667
if (prebuilt->old_vers_heap) {
668
mem_heap_empty(prebuilt->old_vers_heap);
670
prebuilt->old_vers_heap = mem_heap_create(200);
673
err = row_vers_build_for_semi_consistent_read(
674
rec, mtr, clust_index, offsets, offset_heap,
675
prebuilt->old_vers_heap, old_vers);
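	/* Note (assumption beyond this fragment): unlike
	row_sel_build_prev_vers(), this variant uses no read view; it
	rebuilds the last committed version so that a semi-consistent read
	(e.g. an UPDATE under READ COMMITTED or with
	innodb_locks_unsafe_for_binlog) can skip rows locked by other
	transactions instead of waiting for them. */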
679
/*************************************************************************
680
Tests the conditions which determine when the index segment we are searching
681
through has been exhausted. */
684
row_sel_test_end_conds(
685
/*===================*/
686
/* out: TRUE if row passed the tests */
687
plan_t* plan) /* in: plan for the table; the column values must
688
already have been retrieved and the right sides of
689
comparisons evaluated */
693
/* All conditions in end_conds are comparisons of a column to an
696
cond = UT_LIST_GET_FIRST(plan->end_conds);
699
/* Evaluate the left side of the comparison, i.e., get the
700
column value if there is an indirection */
702
eval_sym(cond->args);
704
/* Do the comparison */
706
if (!eval_cmp(cond)) {
711
cond = UT_LIST_GET_NEXT(cond_list, cond);
717
/*************************************************************************
718
Tests the other conditions. */
721
row_sel_test_other_conds(
722
/*=====================*/
723
/* out: TRUE if row passed the tests */
724
plan_t* plan) /* in: plan for the table; the column values must
725
already have been retrieved */
729
cond = UT_LIST_GET_FIRST(plan->other_conds);
734
if (!eval_node_get_ibool_val(cond)) {
739
cond = UT_LIST_GET_NEXT(cond_list, cond);
745
/*************************************************************************
746
Retrieves the clustered index record corresponding to a record in a
747
non-clustered index. Does the necessary locking. */
750
row_sel_get_clust_rec(
751
/*==================*/
752
/* out: DB_SUCCESS or error code */
753
sel_node_t* node, /* in: select_node */
754
plan_t* plan, /* in: plan node for table */
755
rec_t* rec, /* in: record in a non-clustered index */
756
que_thr_t* thr, /* in: query thread */
757
rec_t** out_rec,/* out: clustered record or an old version of
758
it, NULL if the old version did not exist
759
in the read view, i.e., it was a fresh
761
mtr_t* mtr) /* in: mtr used to get access to the
762
non-clustered record; the same mtr is used to
763
access the clustered index */
769
mem_heap_t* heap = NULL;
770
ulint offsets_[REC_OFFS_NORMAL_SIZE];
771
ulint* offsets = offsets_;
772
rec_offs_init(offsets_);
776
offsets = rec_get_offsets(rec,
777
btr_pcur_get_btr_cur(&plan->pcur)->index,
778
offsets, ULINT_UNDEFINED, &heap);
780
row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
782
index = dict_table_get_first_index(plan->table);
784
btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
785
node->latch_mode, &(plan->clust_pcur),
788
clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
790
/* Note: only if the search ends up on a non-infimum record is the
791
low_match value the real match to the search tuple */
793
if (!page_rec_is_user_rec(clust_rec)
794
|| btr_pcur_get_low_match(&(plan->clust_pcur))
795
< dict_index_get_n_unique(index)) {
797
ut_a(rec_get_deleted_flag(rec,
798
dict_table_is_comp(plan->table)));
799
ut_a(node->read_view);
801
/* In a rare case it is possible that no clust rec is found
802
for a delete-marked secondary index record: if in row0umod.c
803
in row_undo_mod_remove_clust_low() we have already removed
804
the clust rec, while purge is still cleaning and removing
805
secondary index records associated with earlier versions of
806
the clustered index record. In that case we know that the
807
clustered index record did not exist in the read view of trx.
813
offsets = rec_get_offsets(clust_rec, index, offsets,
814
ULINT_UNDEFINED, &heap);
816
if (!node->read_view) {
817
/* Try to place a lock on the index record */
819
/* If innodb_locks_unsafe_for_binlog option is used
820
or this session is using READ COMMITTED isolation level
821
we lock only the record, i.e., next-key locking is
826
trx = thr_get_trx(thr);
828
if (srv_locks_unsafe_for_binlog
829
|| trx->isolation_level == TRX_ISO_READ_COMMITTED) {
830
lock_type = LOCK_REC_NOT_GAP;
832
lock_type = LOCK_ORDINARY;
835
err = lock_clust_rec_read_check_and_lock(
836
0, btr_pcur_get_block(&plan->clust_pcur),
837
clust_rec, index, offsets,
838
node->row_lock_mode, lock_type, thr);
840
if (err != DB_SUCCESS) {
845
/* This is a non-locking consistent read: if necessary, fetch
846
a previous version of the record */
850
if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
853
err = row_sel_build_prev_vers(
854
node->read_view, index, clust_rec,
855
&offsets, &heap, &plan->old_vers_heap,
858
if (err != DB_SUCCESS) {
863
clust_rec = old_vers;
865
if (clust_rec == NULL) {
870
/* If we had to go to an earlier version of row or the
871
secondary index record is delete marked, then it may be that
872
the secondary index record corresponding to clust_rec
873
(or old_vers) is not rec; in that case we must ignore
874
such row because in our snapshot rec would not have existed.
875
Remember that from rec we cannot see directly which transaction
876
id corresponds to it: we have to go to the clustered index
877
record. A query where we want to fetch all rows where
878
the secondary index value is in some interval would return
879
a wrong result if we would not drop rows which we come to
880
visit through secondary index records that would not really
881
exist in our snapshot. */
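	/* For example: if another transaction updated the indexed column
	from 'a' to 'b' and committed after our read view was created, the
	secondary index contains both a delete-marked entry for 'a' and a
	new entry for 'b'. When we reach the 'b' entry, the clustered record
	version visible to us still holds 'a', so the check below
	(row_sel_sec_rec_is_for_clust_rec) fails and the row is skipped; it
	is instead returned via the old 'a' entry. */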
884
|| rec_get_deleted_flag(rec, dict_table_is_comp(
886
&& !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
892
/* Fetch the columns needed in test conditions. The clustered
893
index record is protected by a page latch that was acquired
894
when plan->clust_pcur was positioned. The latch will not be
895
released until mtr_commit(mtr). */
897
row_sel_fetch_columns(index, clust_rec, offsets,
898
UT_LIST_GET_FIRST(plan->columns));
899
*out_rec = clust_rec;
903
if (UNIV_LIKELY_NULL(heap)) {
909
/*************************************************************************
910
Sets a lock on a record. */
915
/* out: DB_SUCCESS or error code */
916
const buf_block_t* block, /* in: buffer block of rec */
917
const rec_t* rec, /* in: record */
918
dict_index_t* index, /* in: index */
919
const ulint* offsets,/* in: rec_get_offsets(rec, index) */
920
ulint mode, /* in: lock mode */
921
ulint type, /* in: LOCK_ORDINARY, LOCK_GAP, or
923
que_thr_t* thr) /* in: query thread */
928
trx = thr_get_trx(thr);
930
if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
931
if (buf_LRU_buf_pool_running_out()) {
933
return(DB_LOCK_TABLE_FULL);
937
if (dict_index_is_clust(index)) {
938
err = lock_clust_rec_read_check_and_lock(
939
0, block, rec, index, offsets, mode, type, thr);
941
err = lock_sec_rec_read_check_and_lock(
942
0, block, rec, index, offsets, mode, type, thr);
948
/*************************************************************************
949
Opens a pcur to a table index. */
954
sel_node_t* node, /* in: select node */
955
plan_t* plan, /* in: table plan */
956
ibool search_latch_locked,
957
/* in: TRUE if the thread currently
958
has the search latch locked in
960
mtr_t* mtr) /* in: mtr */
966
ulint has_search_latch = 0; /* RW_S_LATCH or 0 */
969
if (search_latch_locked) {
970
has_search_latch = RW_S_LATCH;
975
/* Calculate the value of the search tuple: the exact match columns
976
get their expressions evaluated when we evaluate the right sides of
979
cond = UT_LIST_GET_FIRST(plan->end_conds);
982
eval_exp(que_node_get_next(cond->args));
984
cond = UT_LIST_GET_NEXT(cond_list, cond);
988
n_fields = dtuple_get_n_fields(plan->tuple);
990
if (plan->n_exact_match < n_fields) {
991
/* There is a non-exact match field which must be
992
evaluated separately */
994
eval_exp(plan->tuple_exps[n_fields - 1]);
997
for (i = 0; i < n_fields; i++) {
998
exp = plan->tuple_exps[i];
1000
dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
1001
que_node_get_val(exp));
1004
/* Open pcur to the index */
1006
btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
1007
node->latch_mode, &(plan->pcur),
1008
has_search_latch, mtr);
1010
/* Open the cursor to the start or the end of the index
1013
btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
1014
&(plan->pcur), FALSE, mtr);
1017
ut_ad(plan->n_rows_prefetched == 0);
1018
ut_ad(plan->n_rows_fetched == 0);
1019
ut_ad(plan->cursor_at_end == FALSE);
1021
plan->pcur_is_open = TRUE;
1024
/*************************************************************************
1025
Restores a stored pcur position to a table index. */
1028
row_sel_restore_pcur_pos(
1029
/*=====================*/
1030
/* out: TRUE if the cursor should be moved to
1031
the next record after we return from this
1032
function (moved to the previous, in the case
1033
of a descending cursor) without processing
1034
again the current cursor record */
1035
sel_node_t* node, /* in: select node */
1036
plan_t* plan, /* in: table plan */
1037
mtr_t* mtr) /* in: mtr */
1039
ibool equal_position;
1040
ulint relative_position;
1042
ut_ad(!plan->cursor_at_end);
1044
relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
1046
equal_position = btr_pcur_restore_position(node->latch_mode,
1047
&(plan->pcur), mtr);
1049
/* If the cursor is traveling upwards, and relative_position is
1051
(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
1052
yet on the successor of the page infimum;
1053
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1054
first record GREATER than the predecessor of a page supremum; we have
1055
not yet processed the cursor record: no need to move the cursor to the
1057
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1058
last record LESS or EQUAL to the old stored user record; (a) if
1059
equal_position is FALSE, this means that the cursor is now on a record
1060
less than the old user record, and we must move to the next record;
1061
(b) if equal_position is TRUE, then if
1062
plan->stored_cursor_rec_processed is TRUE, we must move to the next
1063
record, else there is no need to move the cursor. */
1066
if (relative_position == BTR_PCUR_ON) {
1068
if (equal_position) {
1070
return(plan->stored_cursor_rec_processed);
1076
ut_ad(relative_position == BTR_PCUR_AFTER
1077
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1082
/* If the cursor is traveling downwards, and relative_position is
1084
(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
1085
the last record LESS than the successor of a page infimum; we have not
1086
processed the cursor record: no need to move the cursor;
1087
(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1088
first record GREATER than the predecessor of a page supremum; we have
1089
processed the cursor record: we should move the cursor to the previous
1091
(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1092
last record LESS or EQUAL to the old stored user record; (a) if
1093
equal_position is FALSE, this means that the cursor is now on a record
1094
less than the old user record, and we need not move to the previous
1095
record; (b) if equal_position is TRUE, then if
1096
plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1097
record, else there is no need to move the cursor. */
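	/* The two cases described above can be summarized as follows
	(the return value is TRUE when the caller must move past the
	restored position):

		stored position       ascending cursor    descending cursor
		BTR_PCUR_BEFORE       (not allowed)       do not move
		BTR_PCUR_AFTER        do not move         move
		BTR_PCUR_ON, !equal   move                do not move
		BTR_PCUR_ON, equal    move iff processed  move iff processed */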
1099
if (relative_position == BTR_PCUR_BEFORE
1100
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1105
if (relative_position == BTR_PCUR_ON) {
1107
if (equal_position) {
1109
return(plan->stored_cursor_rec_processed);
1115
ut_ad(relative_position == BTR_PCUR_AFTER
1116
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1121
/*************************************************************************
1122
Resets a plan cursor to a closed state. */
1127
plan_t* plan) /* in: plan */
1129
plan->pcur_is_open = FALSE;
1130
plan->cursor_at_end = FALSE;
1131
plan->n_rows_fetched = 0;
1132
plan->n_rows_prefetched = 0;
1135
/*************************************************************************
1136
Tries to do a shortcut to fetch a clustered index record with a unique key,
1137
using the hash index if possible (not always). */
1140
row_sel_try_search_shortcut(
1141
/*========================*/
1142
/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1143
sel_node_t* node, /* in: select node for a consistent read */
1144
plan_t* plan, /* in: plan for a unique search in clustered
1146
mtr_t* mtr) /* in: mtr */
1148
dict_index_t* index;
1150
mem_heap_t* heap = NULL;
1151
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1152
ulint* offsets = offsets_;
1154
rec_offs_init(offsets_);
1156
index = plan->index;
1158
ut_ad(node->read_view);
1159
ut_ad(plan->unique_search);
1160
ut_ad(!plan->must_get_clust);
1161
#ifdef UNIV_SYNC_DEBUG
1162
ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1163
#endif /* UNIV_SYNC_DEBUG */
1165
row_sel_open_pcur(node, plan, TRUE, mtr);
1167
rec = btr_pcur_get_rec(&(plan->pcur));
1169
if (!page_rec_is_user_rec(rec)) {
1174
ut_ad(plan->mode == PAGE_CUR_GE);
1176
/* As the cursor is now placed on a user record after a search with
1177
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1178
fields in the user record matched to the search tuple */
1180
if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1182
return(SEL_EXHAUSTED);
1185
/* This is a non-locking consistent read: if necessary, fetch
1186
a previous version of the record */
1188
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1190
if (dict_index_is_clust(index)) {
1191
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1196
} else if (!lock_sec_rec_cons_read_sees(rec, node->read_view)) {
1202
/* Test the deleted flag. */
1204
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1206
ret = SEL_EXHAUSTED;
1210
/* Fetch the columns needed in test conditions. The index
1211
record is protected by a page latch that was acquired when
1212
plan->pcur was positioned. The latch will not be released
1213
until mtr_commit(mtr). */
1215
row_sel_fetch_columns(index, rec, offsets,
1216
UT_LIST_GET_FIRST(plan->columns));
1218
/* Test the rest of search conditions */
1220
if (!row_sel_test_other_conds(plan)) {
1222
ret = SEL_EXHAUSTED;
1226
ut_ad(plan->pcur.latch_mode == node->latch_mode);
1228
plan->n_rows_fetched++;
1231
if (UNIV_LIKELY_NULL(heap)) {
1232
mem_heap_free(heap);
1237
/*************************************************************************
1238
Performs a select step. */
1243
/* out: DB_SUCCESS or error code */
1244
sel_node_t* node, /* in: select node */
1245
que_thr_t* thr) /* in: query thread */
1247
dict_index_t* index;
1254
ibool search_latch_locked;
1255
ibool consistent_read;
1257
/* The following flag becomes TRUE when we are doing a
1258
consistent read from a non-clustered index and we must look
1259
at the clustered index to find out the previous delete mark
1260
state of the non-clustered record: */
1262
ibool cons_read_requires_clust_rec = FALSE;
1263
ulint cost_counter = 0;
1264
ibool cursor_just_opened;
1265
ibool must_go_to_next;
1266
ibool leaf_contains_updates = FALSE;
1267
/* TRUE if select_will_do_update is
1268
TRUE and the current clustered index
1269
leaf page has been updated during
1270
the current mtr: mtr must be committed
1271
at the same time as the leaf x-latch
1273
ibool mtr_has_extra_clust_latch = FALSE;
1274
/* TRUE if the search was made using
1275
a non-clustered index, and we had to
1276
access the clustered record: now &mtr
1277
contains a clustered index latch, and
1278
&mtr must be committed before we move
1279
to the next non-clustered record */
1282
mem_heap_t* heap = NULL;
1283
ulint offsets_[REC_OFFS_NORMAL_SIZE];
1284
ulint* offsets = offsets_;
1285
rec_offs_init(offsets_);
1287
ut_ad(thr->run_node == node);
1289
search_latch_locked = FALSE;
1291
if (node->read_view) {
1292
/* In consistent reads, we try to do with the hash index and
1293
not to use the buffer page get. This is to reduce memory bus
1294
load resulting from semaphore operations. The search latch
1295
will be s-locked when we access an index with a unique search
1296
condition, but not locked when we access an index with a
1297
less selective search condition. */
1299
consistent_read = TRUE;
1301
consistent_read = FALSE;
1307
This is the outer major loop in calculating a join. We come here when
1308
node->fetch_table changes, and after adding a row to aggregate totals
1309
and, of course, when this function is called. */
1311
ut_ad(leaf_contains_updates == FALSE);
1312
ut_ad(mtr_has_extra_clust_latch == FALSE);
1314
plan = sel_node_get_nth_plan(node, node->fetch_table);
1315
index = plan->index;
1317
if (plan->n_rows_prefetched > 0) {
1318
sel_pop_prefetched_row(plan);
1320
goto next_table_no_mtr;
1323
if (plan->cursor_at_end) {
1324
/* The cursor has already reached the result set end: no more
1325
rows to process for this table cursor, as also the prefetch
1328
ut_ad(plan->pcur_is_open);
1330
goto table_exhausted_no_mtr;
1333
/* Open a cursor to index, or restore an open cursor position */
1337
if (consistent_read && plan->unique_search && !plan->pcur_is_open
1338
&& !plan->must_get_clust
1339
&& !plan->table->big_rows) {
1340
if (!search_latch_locked) {
1341
rw_lock_s_lock(&btr_search_latch);
1343
search_latch_locked = TRUE;
1344
} else if (btr_search_latch.writer_is_wait_ex) {
1346
/* There is an x-latch request waiting: release the
1347
s-latch for a moment; as an s-latch here is often
1348
kept for some 10 searches before being released,
1349
a waiting x-latch request would block other threads
1350
from acquiring an s-latch for a long time, lowering
1351
performance significantly in multiprocessors. */
1353
rw_lock_s_unlock(&btr_search_latch);
1354
rw_lock_s_lock(&btr_search_latch);
1357
found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
1359
if (found_flag == SEL_FOUND) {
1363
} else if (found_flag == SEL_EXHAUSTED) {
1365
goto table_exhausted;
1368
ut_ad(found_flag == SEL_RETRY);
1370
plan_reset_cursor(plan);
1376
if (search_latch_locked) {
1377
rw_lock_s_unlock(&btr_search_latch);
1379
search_latch_locked = FALSE;
1382
if (!plan->pcur_is_open) {
1383
/* Evaluate the expressions to build the search tuple and
1386
row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
1388
cursor_just_opened = TRUE;
1390
/* A new search was made: increment the cost counter */
1393
/* Restore pcur position to the index */
1395
must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
1397
cursor_just_opened = FALSE;
1399
if (must_go_to_next) {
1400
/* We have already processed the cursor record: move
1410
In this loop we use pcur and try to fetch a qualifying row, and
1411
also fill the prefetch buffer for this table if n_rows_fetched has
1412
exceeded a threshold. While we are inside this loop, the following
1414
(1) &mtr is started,
1415
(2) pcur is positioned and open.
1417
NOTE that if cursor_just_opened is TRUE here, it means that we came
1418
to this point right after row_sel_open_pcur. */
1420
ut_ad(mtr_has_extra_clust_latch == FALSE);
1422
rec = btr_pcur_get_rec(&(plan->pcur));
1424
/* PHASE 1: Set a lock if specified */
1426
if (!node->asc && cursor_just_opened
1427
&& !page_rec_is_supremum(rec)) {
1429
/* When we open a cursor for a descending search, we must set
1430
a next-key lock on the successor record: otherwise it would
1431
be possible to insert new records next to the cursor position,
1432
and it might be that these new records should appear in the
1433
search result set, resulting in the phantom problem. */
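		/* For example, in a locking SELECT ... ORDER BY key DESC the
		lock is placed on the record that follows the cursor's
		opening position (next_rec below), so no new row can be
		inserted between the largest qualifying key and its
		successor; such a row would be a phantom in a repeated read
		of the result set. With READ COMMITTED or
		innodb_locks_unsafe_for_binlog only the record itself is
		locked, as handled below. */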
1435
if (!consistent_read) {
1437
/* If innodb_locks_unsafe_for_binlog option is used
1438
or this session is using READ COMMITTED isolation
1439
level, we lock only the record, i.e., next-key
1440
locking is not used. */
1442
rec_t* next_rec = page_rec_get_next(rec);
1446
trx = thr_get_trx(thr);
1448
offsets = rec_get_offsets(next_rec, index, offsets,
1449
ULINT_UNDEFINED, &heap);
1451
if (srv_locks_unsafe_for_binlog
1452
|| trx->isolation_level
1453
== TRX_ISO_READ_COMMITTED) {
1455
if (page_rec_is_supremum(next_rec)) {
1460
lock_type = LOCK_REC_NOT_GAP;
1462
lock_type = LOCK_ORDINARY;
1465
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1466
next_rec, index, offsets,
1467
node->row_lock_mode,
1470
if (err != DB_SUCCESS) {
1471
/* Note that in this case we will store in pcur
1472
the PREDECESSOR of the record we are waiting
1475
goto lock_wait_or_error;
1481
if (page_rec_is_infimum(rec)) {
1483
/* The infimum record on a page cannot be in the result set,
1484
and neither can a record lock be placed on it: we skip such
1485
a record. We also increment the cost counter as we may have
1486
processed yet another page of index. */
1493
if (!consistent_read) {
1494
/* Try to place a lock on the index record */
1496
/* If innodb_locks_unsafe_for_binlog option is used
1497
or this session is using READ COMMITTED isolation level,
1498
we lock only the record, i.e., next-key locking is
1504
offsets = rec_get_offsets(rec, index, offsets,
1505
ULINT_UNDEFINED, &heap);
1507
trx = thr_get_trx(thr);
1509
if (srv_locks_unsafe_for_binlog
1510
|| trx->isolation_level == TRX_ISO_READ_COMMITTED) {
1512
if (page_rec_is_supremum(rec)) {
1517
lock_type = LOCK_REC_NOT_GAP;
1519
lock_type = LOCK_ORDINARY;
1522
err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
1523
rec, index, offsets,
1524
node->row_lock_mode, lock_type, thr);
1526
if (err != DB_SUCCESS) {
1528
goto lock_wait_or_error;
1532
if (page_rec_is_supremum(rec)) {
1534
/* A page supremum record cannot be in the result set: skip
1535
it now when we have placed a possible lock on it */
1540
ut_ad(page_rec_is_user_rec(rec));
1542
if (cost_counter > SEL_COST_LIMIT) {
1544
/* Now that we have placed the necessary locks, we can stop
1545
for a while and store the cursor position; NOTE that if we
1546
would store the cursor position BEFORE placing a record lock,
1547
it might happen that the cursor would jump over some records
1548
that another transaction could meanwhile insert adjacent to
1549
the cursor: this would result in the phantom problem. */
1551
goto stop_for_a_while;
1554
/* PHASE 2: Check a mixed index mix id if needed */
1556
if (plan->unique_search && cursor_just_opened) {
1558
ut_ad(plan->mode == PAGE_CUR_GE);
1560
/* As the cursor is now placed on a user record after a search
1561
with the mode PAGE_CUR_GE, the up_match field in the cursor
1562
tells how many fields in the user record matched to the search
1565
if (btr_pcur_get_up_match(&(plan->pcur))
1566
< plan->n_exact_match) {
1567
goto table_exhausted;
1570
/* Ok, no need to test end_conds or mix id */
1574
/* We are ready to look at a possible new index entry in the result
1575
set: the cursor is now placed on a user record */
1577
/* PHASE 3: Get previous version in a consistent read */
1579
cons_read_requires_clust_rec = FALSE;
1580
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1582
if (consistent_read) {
1583
/* This is a non-locking consistent read: if necessary, fetch
1584
a previous version of the record */
1586
if (dict_index_is_clust(index)) {
1588
if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1591
err = row_sel_build_prev_vers(
1592
node->read_view, index, rec,
1593
&offsets, &heap, &plan->old_vers_heap,
1596
if (err != DB_SUCCESS) {
1598
goto lock_wait_or_error;
1601
if (old_vers == NULL) {
1602
offsets = rec_get_offsets(
1603
rec, index, offsets,
1604
ULINT_UNDEFINED, &heap);
1606
/* Fetch the columns needed in
1607
test conditions. The clustered
1608
index record is protected by a
1609
page latch that was acquired
1610
by row_sel_open_pcur() or
1611
row_sel_restore_pcur_pos().
1612
The latch will not be released
1613
until mtr_commit(mtr). */
1615
row_sel_fetch_columns(
1616
index, rec, offsets,
1620
if (!row_sel_test_end_conds(plan)) {
1622
goto table_exhausted;
1630
} else if (!lock_sec_rec_cons_read_sees(rec,
1632
cons_read_requires_clust_rec = TRUE;
1636
/* PHASE 4: Test search end conditions and deleted flag */
1638
/* Fetch the columns needed in test conditions. The record is
1639
protected by a page latch that was acquired by
1640
row_sel_open_pcur() or row_sel_restore_pcur_pos(). The latch
1641
will not be released until mtr_commit(mtr). */
1643
row_sel_fetch_columns(index, rec, offsets,
1644
UT_LIST_GET_FIRST(plan->columns));
1646
/* Test the selection end conditions: these can only contain columns
1647
which already are found in the index, even though the index might be
1650
if (plan->unique_search && cursor_just_opened) {
1652
/* No test necessary: the test was already made above */
1654
} else if (!row_sel_test_end_conds(plan)) {
1656
goto table_exhausted;
1659
if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1660
&& !cons_read_requires_clust_rec) {
1662
/* The record is delete marked: we can skip it if this is
1663
not a consistent read which might see an earlier version
1664
of a non-clustered index record */
1666
if (plan->unique_search) {
1668
goto table_exhausted;
1675
/* PHASE 5: Get the clustered index record, if needed and if we did
1676
not do the search using the clustered index */
1678
if (plan->must_get_clust || cons_read_requires_clust_rec) {
1680
/* It was a non-clustered index and we must fetch also the
1681
clustered index record */
1683
err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1685
mtr_has_extra_clust_latch = TRUE;
1687
if (err != DB_SUCCESS) {
1689
goto lock_wait_or_error;
1692
/* Retrieving the clustered record required a search:
1693
increment the cost counter */
1697
if (clust_rec == NULL) {
1698
/* The record did not exist in the read view */
1699
ut_ad(consistent_read);
1704
if (rec_get_deleted_flag(clust_rec,
1705
dict_table_is_comp(plan->table))) {
1707
/* The record is delete marked: we can skip it */
1712
if (node->can_get_updated) {
1714
btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1718
/* PHASE 6: Test the rest of search conditions */
1720
if (!row_sel_test_other_conds(plan)) {
1722
if (plan->unique_search) {
1724
goto table_exhausted;
1730
/* PHASE 7: We found a new qualifying row for the current table; push
1731
the row if prefetch is on, or move to the next table in the join */
1733
plan->n_rows_fetched++;
1735
ut_ad(plan->pcur.latch_mode == node->latch_mode);
1737
if (node->select_will_do_update) {
1738
/* This is a searched update and we can do the update in-place,
1741
row_upd_in_place_in_select(node, thr, &mtr);
1743
leaf_contains_updates = TRUE;
1745
/* When the database is in the online backup mode, the number
1746
of log records for a single mtr should be small: increment the
1747
cost counter to ensure it */
1749
cost_counter += 1 + (SEL_COST_LIMIT / 8);
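		/* With SEL_COST_LIMIT == 100 this adds 13 per in-place
		update, so the comparison against SEL_COST_LIMIT forces the
		mtr holding the modified leaf page to be committed after
		roughly eight updates. */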
1751
if (plan->unique_search) {
1753
goto table_exhausted;
1759
if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1760
|| plan->unique_search || plan->no_prefetch
1761
|| plan->table->big_rows) {
1763
/* No prefetch in operation: go to the next table */
1768
sel_push_prefetched_row(plan);
1770
if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1772
/* The prefetch buffer is now full */
1774
sel_pop_prefetched_row(plan);
1780
ut_ad(!search_latch_locked);
1782
if (mtr_has_extra_clust_latch) {
1784
/* We must commit &mtr if we are moving to the next
1785
non-clustered index record, because we could break the
1786
latching order if we would access a different clustered
1787
index page right away without releasing the previous. */
1789
goto commit_mtr_for_a_while;
1792
if (leaf_contains_updates
1793
&& btr_pcur_is_after_last_on_page(&plan->pcur)) {
1795
/* We must commit &mtr if we are moving to a different page,
1796
because we have done updates to the x-latched leaf page, and
1797
the latch would be released in btr_pcur_move_to_next, without
1798
&mtr getting committed there */
1802
goto commit_mtr_for_a_while;
1806
moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1808
moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1813
goto table_exhausted;
1816
cursor_just_opened = FALSE;
1818
/* END OF RECORD LOOP
1819
------------------ */
1823
/* We found a record which satisfies the conditions: we can move to
1824
the next table or return a row in the result set */
1826
ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
1828
if (plan->unique_search && !node->can_get_updated) {
1830
plan->cursor_at_end = TRUE;
1832
ut_ad(!search_latch_locked);
1834
plan->stored_cursor_rec_processed = TRUE;
1836
btr_pcur_store_position(&(plan->pcur), &mtr);
1841
leaf_contains_updates = FALSE;
1842
mtr_has_extra_clust_latch = FALSE;
1845
/* If we use 'goto' to this label, it means that the row was popped
1846
from the prefetched rows stack, and &mtr is already committed */
1848
if (node->fetch_table + 1 == node->n_tables) {
1850
sel_eval_select_list(node);
1852
if (node->is_aggregate) {
1857
sel_assign_into_var_values(node->into_list, node);
1859
thr->run_node = que_node_get_parent(node);
1865
node->fetch_table++;
1867
/* When we move to the next table, we first reset the plan cursor:
1868
we do not care about resetting it when we backtrack from a table */
1870
plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1875
/* The table cursor pcur reached the result set end: backtrack to the
1876
previous table in the join if we do not have cached prefetched rows */
1878
plan->cursor_at_end = TRUE;
1882
leaf_contains_updates = FALSE;
1883
mtr_has_extra_clust_latch = FALSE;
1885
if (plan->n_rows_prefetched > 0) {
1886
/* The table became exhausted during a prefetch */
1888
sel_pop_prefetched_row(plan);
1890
goto next_table_no_mtr;
1893
table_exhausted_no_mtr:
1894
if (node->fetch_table == 0) {
1897
if (node->is_aggregate && !node->aggregate_already_fetched) {
1899
node->aggregate_already_fetched = TRUE;
1901
sel_assign_into_var_values(node->into_list, node);
1903
thr->run_node = que_node_get_parent(node);
1905
node->state = SEL_NODE_NO_MORE_ROWS;
1907
thr->run_node = que_node_get_parent(node);
1913
node->fetch_table--;
1918
/* Return control for a while to que_run_threads, so that runaway
1919
queries can be canceled. NOTE that when we come here, we must, in a
1920
locking read, have placed the necessary (possibly waiting request)
1921
record lock on the cursor record or its successor: when we reposition
1922
the cursor, this record lock guarantees that nobody can meanwhile have
1923
inserted new records which should have appeared in the result set,
1924
which would result in the phantom problem. */
1926
ut_ad(!search_latch_locked);
1928
plan->stored_cursor_rec_processed = FALSE;
1929
btr_pcur_store_position(&(plan->pcur), &mtr);
1933
#ifdef UNIV_SYNC_DEBUG
1934
ut_ad(sync_thread_levels_empty_gen(TRUE));
1935
#endif /* UNIV_SYNC_DEBUG */
1939
commit_mtr_for_a_while:
1940
/* Stores the cursor position and commits &mtr; this is used if
1941
&mtr may contain latches which would break the latching order if
1942
&mtr would not be committed and the latches released. */
1944
plan->stored_cursor_rec_processed = TRUE;
1946
ut_ad(!search_latch_locked);
1947
btr_pcur_store_position(&(plan->pcur), &mtr);
1951
leaf_contains_updates = FALSE;
1952
mtr_has_extra_clust_latch = FALSE;
1954
#ifdef UNIV_SYNC_DEBUG
1955
ut_ad(sync_thread_levels_empty_gen(TRUE));
1956
#endif /* UNIV_SYNC_DEBUG */
1961
/* See the note at stop_for_a_while: the same holds for this case */
1963
ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
1964
ut_ad(!search_latch_locked);
1966
plan->stored_cursor_rec_processed = FALSE;
1967
btr_pcur_store_position(&(plan->pcur), &mtr);
1971
#ifdef UNIV_SYNC_DEBUG
1972
ut_ad(sync_thread_levels_empty_gen(TRUE));
1973
#endif /* UNIV_SYNC_DEBUG */
1976
if (search_latch_locked) {
1977
rw_lock_s_unlock(&btr_search_latch);
1979
if (UNIV_LIKELY_NULL(heap)) {
1980
mem_heap_free(heap);
1985
/**************************************************************************
1986
Performs a select step. This is a high-level function used in SQL execution
graphs.
1992
/* out: query thread to run next or NULL */
1993
que_thr_t* thr) /* in: query thread */
1996
sym_node_t* table_node;
2002
node = thr->run_node;
2004
ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
2006
/* If this is a new time this node is executed (or when execution
2007
resumes after wait for a table intention lock), set intention locks
2008
on the tables, or assign a read view */
2010
if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
2012
node->state = SEL_NODE_OPEN;
2015
if (node->state == SEL_NODE_OPEN) {
2017
/* It may be that the current session has not yet started
2018
its transaction, or it has been committed: */
2020
trx_start_if_not_started(thr_get_trx(thr));
2022
plan_reset_cursor(sel_node_get_nth_plan(node, 0));
2024
if (node->consistent_read) {
2025
/* Assign a read view for the query */
2026
node->read_view = trx_assign_read_view(
2029
if (node->set_x_locks) {
2030
i_lock_mode = LOCK_IX;
2032
i_lock_mode = LOCK_IS;
2035
table_node = node->table_list;
2037
while (table_node) {
2038
err = lock_table(0, table_node->table,
2040
if (err != DB_SUCCESS) {
2041
thr_get_trx(thr)->error_state = err;
2046
table_node = que_node_get_next(table_node);
2050
/* If this is an explicit cursor, copy stored procedure
2051
variable values, so that the values cannot change between
2052
fetches (currently, we copy them also for non-explicit
2055
if (node->explicit_cursor
2056
&& UT_LIST_GET_FIRST(node->copy_variables)) {
2058
row_sel_copy_input_variable_vals(node);
2061
node->state = SEL_NODE_FETCH;
2062
node->fetch_table = 0;
2064
if (node->is_aggregate) {
2065
/* Reset the aggregate total values */
2066
sel_reset_aggregate_vals(node);
2070
err = row_sel(node, thr);
2072
/* NOTE! if queries are parallelized, the following assignment may
2073
have problems; the assignment should be made only if thr is the
2074
only top-level thr in the graph: */
2076
thr->graph->last_sel_node = node;
2078
if (err != DB_SUCCESS) {
2079
thr_get_trx(thr)->error_state = err;
2087
/**************************************************************************
2088
Performs a fetch for a cursor. */
2093
/* out: query thread to run next or NULL */
2094
que_thr_t* thr) /* in: query thread */
2096
sel_node_t* sel_node;
2101
node = thr->run_node;
2102
sel_node = node->cursor_def;
2104
ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2106
if (thr->prev_node != que_node_get_parent(node)) {
2108
if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2110
if (node->into_list) {
2111
sel_assign_into_var_values(node->into_list,
2114
void* ret = (*node->func->func)(
2115
sel_node, node->func->arg);
2119
= SEL_NODE_NO_MORE_ROWS;
2124
thr->run_node = que_node_get_parent(node);
2129
/* Make the fetch node the parent of the cursor definition for
2130
the time of the fetch, so that execution knows to return to this
2131
fetch node after a row has been selected or we know that there is
2134
sel_node->common.parent = node;
2136
if (sel_node->state == SEL_NODE_CLOSED) {
2138
"InnoDB: Error: fetch called on a closed cursor\n");
2140
thr_get_trx(thr)->error_state = DB_ERROR;
2145
thr->run_node = sel_node;
2150
/********************************************************************
2151
Sample callback function for fetch that prints each row.*/
2156
/* out: always returns non-NULL */
2157
void* row, /* in: sel_node_t* */
2158
void* user_arg) /* in: not used */
2160
sel_node_t* node = row;
2164
UT_NOT_USED(user_arg);
2166
fprintf(stderr, "row_fetch_print: row %p\n", row);
2168
exp = node->select_list;
2171
dfield_t* dfield = que_node_get_val(exp);
2172
const dtype_t* type = dfield_get_type(dfield);
2174
fprintf(stderr, " column %lu:\n", (ulong)i);
2177
fprintf(stderr, "\n");
2179
if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2180
ut_print_buf(stderr, dfield_get_data(dfield),
2181
dfield_get_len(dfield));
2183
fprintf(stderr, " <NULL>;");
2186
fprintf(stderr, "\n");
2188
exp = que_node_get_next(exp);
2195
/********************************************************************
2196
Callback function for fetch that stores an unsigned 4 byte integer to the
2197
location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length 4. */
2201
row_fetch_store_uint4(
2202
/*==================*/
2203
/* out: always returns NULL */
2204
void* row, /* in: sel_node_t* */
2205
void* user_arg) /* in: data pointer */
2207
sel_node_t* node = row;
2208
ib_uint32_t* val = user_arg;
2211
dfield_t* dfield = que_node_get_val(node->select_list);
2212
const dtype_t* type = dfield_get_type(dfield);
2213
ulint len = dfield_get_len(dfield);
2215
ut_a(dtype_get_mtype(type) == DATA_INT);
2216
ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
2219
tmp = mach_read_from_4(dfield_get_data(dfield));
2220
*val = (ib_uint32_t) tmp;
2225
/***************************************************************
2226
Prints a row in a select result. */
2231
/* out: query thread to run next or NULL */
2232
que_thr_t* thr) /* in: query thread */
2234
row_printf_node_t* node;
2235
sel_node_t* sel_node;
2240
node = thr->run_node;
2242
sel_node = node->sel_node;
2244
ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2246
if (thr->prev_node == que_node_get_parent(node)) {
2248
/* Reset the cursor */
2249
sel_node->state = SEL_NODE_OPEN;
2251
/* Fetch next row to print */
2253
thr->run_node = sel_node;
2258
if (sel_node->state != SEL_NODE_FETCH) {
2260
ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2262
/* No more rows to print */
2264
thr->run_node = que_node_get_parent(node);
2269
arg = sel_node->select_list;
2272
dfield_print_also_hex(que_node_get_val(arg));
2274
fputs(" ::: ", stderr);
2276
arg = que_node_get_next(arg);
2281
/* Fetch next row to print */
2283
thr->run_node = sel_node;
2288
/********************************************************************
2289
Converts a key value stored in MySQL format to an Innobase dtuple. The last
2290
field of the key value may be just a prefix of a fixed length field: hence
2291
the parameter key_len. But currently we do not allow search keys where the
2292
last field is only a prefix of the full key field len and print a warning if
2293
such appears. A counterpart of this function is
2294
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2297
row_sel_convert_mysql_key_to_innobase(
2298
/*==================================*/
2299
dtuple_t* tuple, /* in/out: tuple where to build;
2300
NOTE: we assume that the type info
2301
in the tuple is already according
2303
byte* buf, /* in: buffer to use in field
2305
ulint buf_len, /* in: buffer length */
2306
dict_index_t* index, /* in: index of the key value */
2307
const byte* key_ptr, /* in: MySQL key value */
2308
ulint key_len, /* in: MySQL key value length */
2309
trx_t* trx) /* in: transaction */
2311
byte* original_buf = buf;
2312
const byte* original_key_ptr = key_ptr;
2313
dict_field_t* field;
2317
ulint data_field_len;
2319
const byte* key_end;
2322
/* For documentation of the key value storage format in MySQL, see
2323
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2325
key_end = key_ptr + key_len;
2327
/* Permit us to access any field in the tuple (ULINT_MAX): */
2329
dtuple_set_n_fields(tuple, ULINT_MAX);
2331
dfield = dtuple_get_nth_field(tuple, 0);
2332
field = dict_index_get_nth_field(index, 0);
2334
if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
2335
/* A special case: we are looking for a position in the
2336
generated clustered index which InnoDB automatically added
2337
to a table with no primary key: the first and the only
2338
ordering column is ROW_ID which InnoDB stored to the key_ptr
2341
ut_a(key_len == DATA_ROW_ID_LEN);
2343
dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2345
dtuple_set_n_fields(tuple, 1);
2350
while (key_ptr < key_end) {
2352
ulint type = dfield_get_type(dfield)->mtype;
2353
ut_a(field->col->mtype == type);
2358
if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2359
/* The first byte in the field tells if this is
2360
an SQL NULL value */
2364
if (*key_ptr != 0) {
2365
dfield_set_null(dfield);
2371
/* Calculate data length and data field total length */
2373
if (type == DATA_BLOB) {
2374
/* The key field is a column prefix of a BLOB or
2377
ut_a(field->prefix_len > 0);
2379
/* MySQL stores the actual data length to the first 2
2380
bytes after the optional SQL NULL marker byte. The
2381
storage format is little-endian, that is, the most
2382
significant byte at a higher address. In UTF-8, MySQL
2383
seems to reserve field->prefix_len bytes for
2384
storing this field in the key value buffer, even
2385
though the actual value only takes data_len bytes
2388
data_len = key_ptr[data_offset]
2389
+ 256 * key_ptr[data_offset + 1];
2390
data_field_len = data_offset + 2 + field->prefix_len;
2394
/* Now that we know the length, we store the column
2395
value like it would be a fixed char field */
2397
} else if (field->prefix_len > 0) {
2398
/* Looks like MySQL pads unused end bytes in the
2399
prefix with space. Therefore, also in UTF-8, it is ok
2400
to compare with a prefix containing full prefix_len
2401
bytes, and no need to take at most prefix_len / 3
2402
UTF-8 characters from the start.
2403
If the prefix is used as the upper end of a LIKE
2404
'abc%' query, then MySQL pads the end with chars
2405
0xff. TODO: in that case does it any harm to compare
2406
with the full prefix_len bytes. How do characters
2407
0xff in UTF-8 behave? */
2409
data_len = field->prefix_len;
data_field_len = data_offset + data_len;
data_len = dfield_get_type(dfield)->len;
data_field_len = data_offset + data_len;
(dtype_get_mysql_type(dfield_get_type(dfield))
== DATA_MYSQL_TRUE_VARCHAR)
&& UNIV_LIKELY(type != DATA_INT)) {
/* In a MySQL key value format, a true VARCHAR is
always preceded by 2 bytes of a length field.
dfield_get_type(dfield)->len returns the maximum
'payload' len in bytes. That does not include the
2 bytes that tell the actual data length.
We added the check != DATA_INT to make sure we do
not treat MySQL ENUM or SET as a true VARCHAR! */
data_field_len += 2;
/* Storing may use at most data_len bytes of buf */
if (UNIV_LIKELY(!is_null)) {
row_mysql_store_col_in_innobase_format(
FALSE, /* MySQL key value format col */
key_ptr + data_offset, data_len,
dict_table_is_comp(index->table));
key_ptr += data_field_len;
if (UNIV_UNLIKELY(key_ptr > key_end)) {
/* The last field in key was not a complete key field
Print a warning about this! HA_READ_PREFIX_LAST does
not currently work in InnoDB with partial-field key
value prefixes. Since MySQL currently uses a padding
trick to calculate LIKE 'abc%' type queries there
should never be partial-field prefixes in searches. */
ut_print_timestamp(stderr);
fputs(" InnoDB: Warning: using a partial-field"
" key prefix in search.\n"
"InnoDB: ", stderr);
dict_index_name_print(stderr, trx, index);
fprintf(stderr, ". Last data field length %lu bytes,\n"
"InnoDB: key ptr now exceeds"
" key end by %lu bytes.\n"
"InnoDB: Key value in the MySQL format:\n",
(ulong) data_field_len,
(ulong) (key_ptr - key_end));
ut_print_buf(stderr, original_key_ptr, key_len);
fprintf(stderr, "\n");
ulint	len = dfield_get_len(dfield);
dfield_set_len(dfield, len
- (ulint) (key_ptr - key_end));
ut_a(buf <= original_buf + buf_len);
/* We set the length of tuple to n_fields: we assume that the memory
area allocated for it is big enough (usually bigger than n_fields). */
dtuple_set_n_fields(tuple, n_fields);
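/* Illustrative standalone sketch (not InnoDB code) of the MySQL key
value layout described in the comments above: an optional 1-byte SQL
NULL marker, then a 2-byte little-endian length, then the payload.
The struct and helper names are hypothetical, and the sketch ignores
the fact that MySQL reserves the full column length in the key buffer
even for NULL or short values. */
#include <stddef.h>
#include <stdint.h>

struct key_part_sketch {
	int		is_null;	/* nonzero if the value is SQL NULL */
	const uint8_t*	data;		/* payload start, or NULL */
	size_t		len;		/* payload length in bytes */
};

/* Returns the number of key buffer bytes consumed by one true-VARCHAR
key part under the simplified layout assumed above. */
static size_t
decode_true_varchar_key_part(const uint8_t* key_ptr, int nullable,
			     struct key_part_sketch* out)
{
	size_t	offset = 0;

	if (nullable) {
		out->is_null = (key_ptr[0] != 0);
		offset = 1;
	} else {
		out->is_null = 0;
	}

	/* 2-byte little-endian length: least significant byte first */
	out->len = (size_t) key_ptr[offset]
		+ 256 * (size_t) key_ptr[offset + 1];
	out->data = out->is_null ? NULL : key_ptr + offset + 2;

	return(offset + 2 + out->len);
}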
/******************************************************************
Stores the row id to the prebuilt struct. */
row_sel_store_row_id_to_prebuilt(
/*=============================*/
row_prebuilt_t*		prebuilt,	/* in/out: prebuilt */
const rec_t*		index_rec,	/* in: record */
const dict_index_t*	index,		/* in: index of the record */
const ulint*		offsets)	/* in: rec_get_offsets
(index_rec, index) */
ut_ad(rec_offs_validate(index_rec, index, offsets));
data = rec_get_nth_field(
dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
"InnoDB: Error: Row id field is"
" wrong length %lu in ", (ulong) len);
dict_index_name_print(stderr, prebuilt->trx, index);
fprintf(stderr, "\n"
"InnoDB: Field number %lu, record:\n",
(ulong) dict_index_get_sys_col_pos(index,
rec_print_new(stderr, index_rec, offsets);
ut_memcpy(prebuilt->row_id, data, len);
/******************************************************************
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
row_sel_field_store_in_mysql_format(
/*================================*/
byte*		dest,	/* in/out: buffer where to store; NOTE
that BLOBs are not in themselves
stored here: the caller must allocate
and copy the BLOB into buffer before,
and pass the pointer to the BLOB in
const mysql_row_templ_t* templ,
/* in: MySQL column template.
Its following fields are referenced:
type, is_unsigned, mysql_col_len,
mbminlen, mbmaxlen */
const byte*	data,	/* in: data to store */
ulint		len)	/* in: length of the data */
ut_ad(len != UNIV_SQL_NULL);
switch (templ->type) {
/* Convert integer data from Innobase to a little-endian
format, sign bit restored to normal */
if (!templ->is_unsigned) {
dest[len - 1] = (byte) (dest[len - 1] ^ 128);
ut_ad(templ->mysql_col_len == len);
field_end = dest + templ->mysql_col_len;
if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
/* This is a >= 5.0.3 type true VARCHAR. Store the
length of the data to the first byte or the first
two bytes of dest. */
dest = row_mysql_store_true_var_len(
dest, len, templ->mysql_length_bytes);
/* Copy the actual data */
ut_memcpy(dest, data, len);
/* Pad with trailing spaces. We pad with spaces also the
unused end of a >= 5.0.3 true VARCHAR column, just in case
MySQL expects its contents to be deterministic. */
pad_ptr = dest + len;
ut_ad(templ->mbminlen <= templ->mbmaxlen);
/* We handle UCS2 charset strings differently. */
if (templ->mbminlen == 2) {
/* A space char is two bytes, 0x0020 in UCS2 */
/* A 0x20 has been stripped from the column.
if (pad_ptr < field_end) {
/* Pad the rest of the string with 0x0020 */
while (pad_ptr < field_end) {
ut_ad(templ->mbminlen == 1);
memset(pad_ptr, 0x20, field_end - pad_ptr);
/* Store a pointer to the BLOB buffer to dest: the BLOB was
already copied to the buffer in row_sel_store_mysql_rec */
row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
memcpy(dest, data, len);
ut_ad(templ->mysql_col_len >= len);
ut_ad(templ->mbmaxlen >= templ->mbminlen);
ut_ad(templ->mbmaxlen > templ->mbminlen
|| templ->mysql_col_len == len);
/* The following assertion would fail for old tables
containing UTF-8 ENUM columns due to Bug #9526. */
ut_ad(!templ->mbmaxlen
|| !(templ->mysql_col_len % templ->mbmaxlen));
ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
if (templ->mbminlen != templ->mbmaxlen) {
/* Pad with spaces. This undoes the stripping
done in row0mysql.ic, function
row_mysql_store_col_in_innobase_format(). */
memset(dest + len, 0x20, templ->mysql_col_len - len);
case DATA_SYS_CHILD:
/* These column types should never be shipped to MySQL. */
case DATA_FIXBINARY:
/* Above are the valid column types for MySQL data. */
#endif /* UNIV_DEBUG */
ut_ad(templ->mysql_col_len == len);
memcpy(dest, data, len);
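/* Illustrative standalone sketch (not InnoDB code) of the space padding
discussed above: for a single-byte charset the unused tail of a fixed
length column is filled with 0x20 bytes, while for UCS2 each character
is two bytes, so the pad unit is the pair 0x00 0x20. The helper name is
hypothetical. */
#include <stddef.h>
#include <stdint.h>

static void
pad_column_with_spaces_sketch(uint8_t* pad_ptr, uint8_t* field_end,
			      size_t mbminlen)
{
	if (mbminlen == 2) {
		/* UCS2: write 0x0020 pairs until the field is full */
		while (pad_ptr + 2 <= field_end) {
			*pad_ptr++ = 0x00;
			*pad_ptr++ = 0x20;
		}
	} else {
		/* single-byte charset: plain 0x20 space bytes */
		while (pad_ptr < field_end) {
			*pad_ptr++ = 0x20;
		}
	}
}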
/******************************************************************
Convert a row in the Innobase format to a row in the MySQL format.
Note that the template in prebuilt may advise us to copy only a few
columns to mysql_rec, other columns are left blank. All columns may not
be needed in the query. */
row_sel_store_mysql_rec(
/*====================*/
/* out: TRUE if success, FALSE if
could not allocate memory for a BLOB
(though we may also assert in that
byte*		mysql_rec,	/* out: row in the MySQL format */
row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
const rec_t*	rec,		/* in: Innobase record in the index
which was described in prebuilt's
template; must be protected by
const ulint*	offsets,	/* in: array returned by
rec_get_offsets() */
ulint		start_field_no,
mysql_row_templ_t*	templ;
mem_heap_t*	extern_field_heap	= NULL;
ut_ad(prebuilt->mysql_template);
ut_ad(rec_offs_validate(rec, NULL, offsets));
if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
mem_heap_free(prebuilt->blob_heap);
prebuilt->blob_heap = NULL;
for (i = start_field_no; i < end_field_no /* prebuilt->n_template */; i++) {
templ = prebuilt->mysql_template + i;
if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
templ->rec_field_no))) {
/* Copy an externally stored field to the temporary
ut_a(!prebuilt->trx->has_search_latch);
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
if (prebuilt->blob_heap == NULL) {
prebuilt->blob_heap = mem_heap_create(
heap = prebuilt->blob_heap;
= mem_heap_create(UNIV_PAGE_SIZE);
heap = extern_field_heap;
/* NOTE: if we are retrieving a big BLOB, we may
already run out of memory in the next call, which
data = btr_rec_copy_externally_stored_field(
dict_table_zip_size(prebuilt->table),
templ->rec_field_no, &len, heap);
ut_a(len != UNIV_SQL_NULL);
/* Field is stored in the row. */
data = rec_get_nth_field(rec, offsets,
templ->rec_field_no, &len);
if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
&& len != UNIV_SQL_NULL) {
/* It is a BLOB field locally stored in the
InnoDB record: we MUST copy its contents to
prebuilt->blob_heap here because later code
assumes all BLOB values have been copied to a
if (prebuilt->blob_heap == NULL) {
prebuilt->blob_heap = mem_heap_create(
data = memcpy(mem_heap_alloc(
prebuilt->blob_heap, len),
if (len != UNIV_SQL_NULL) {
row_sel_field_store_in_mysql_format(
mysql_rec + templ->mysql_col_offset,
if (extern_field_heap) {
mem_heap_free(extern_field_heap);
extern_field_heap = NULL;
if (templ->mysql_null_bit_mask) {
/* It is a nullable column with a non-NULL
mysql_rec[templ->mysql_null_byte_offset]
&= ~(byte) templ->mysql_null_bit_mask;
/* MySQL seems to assume the field for an SQL NULL
value is set to zero or space. Not taking this into
account caused seg faults with NULL BLOB fields, and
bug number 154 in the MySQL bug database: GROUP BY
and DISTINCT could treat NULL values as unequal. */
mysql_rec[templ->mysql_null_byte_offset]
|= (byte) templ->mysql_null_bit_mask;
switch (templ->type) {
if (templ->mysql_type
== DATA_MYSQL_TRUE_VARCHAR) {
/* This is a >= 5.0.3 type
true VARCHAR. Zero the field. */
case DATA_FIXBINARY:
/* MySQL pads all string types (except
BLOB, TEXT and true VARCHAR) with space. */
if (UNIV_UNLIKELY(templ->mbminlen == 2)) {
/* Treat UCS2 as a special case. */
+ templ->mysql_col_offset;
len = templ->mysql_col_len;
/* There are two UCS2 bytes per char,
so the length has to be even. */
/* Pad with 0x0020. */
ut_ad(!pad_char || templ->mbminlen == 1);
memset(mysql_rec + templ->mysql_col_offset,
pad_char, templ->mysql_col_len);
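/* Illustrative standalone sketch (not InnoDB code) of the NULL bitmap
handling above: each nullable column owns one bit in the MySQL row
buffer; setting the bit marks the value as SQL NULL, clearing it marks
the value as present. The helper name is hypothetical. */
#include <stddef.h>
#include <stdint.h>

static void
set_mysql_null_bit_sketch(uint8_t* mysql_rec, size_t null_byte_offset,
			  uint8_t null_bit_mask, int is_null)
{
	if (is_null) {
		mysql_rec[null_byte_offset] |= null_bit_mask;
	} else {
		mysql_rec[null_byte_offset] &= (uint8_t) ~null_bit_mask;
	}
}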
/*************************************************************************
Builds a previous version of a clustered index record for a consistent read */
row_sel_build_prev_vers_for_mysql(
/*==============================*/
/* out: DB_SUCCESS or error code */
read_view_t*	read_view,	/* in: read view */
dict_index_t*	clust_index,	/* in: clustered index */
row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
const rec_t*	rec,		/* in: record in a clustered index */
ulint**		offsets,	/* in/out: offsets returned by
rec_get_offsets(rec, clust_index) */
mem_heap_t**	offset_heap,	/* in/out: memory heap from which
the offsets are allocated */
rec_t**		old_vers,	/* out: old version, or NULL if the
record does not exist in the view:
i.e., it was freshly inserted
mtr_t*		mtr)		/* in: mtr */
if (prebuilt->old_vers_heap) {
mem_heap_empty(prebuilt->old_vers_heap);
prebuilt->old_vers_heap = mem_heap_create(200);
err = row_vers_build_for_consistent_read(
rec, mtr, clust_index, offsets, read_view, offset_heap,
prebuilt->old_vers_heap, old_vers);
/*************************************************************************
Retrieves the clustered index record corresponding to a record in a
non-clustered index. Does the necessary locking. Used in the MySQL
row_sel_get_clust_rec_for_mysql(
/*============================*/
/* out: DB_SUCCESS or error code */
row_prebuilt_t*	prebuilt,/* in: prebuilt struct in the handle */
dict_index_t*	sec_index,/* in: secondary index where rec resides */
const rec_t*	rec,	/* in: record in a non-clustered index; if
this is a locking read, then rec is not
allowed to be delete-marked, and that would
not make sense either */
que_thr_t*	thr,	/* in: query thread */
const rec_t**	out_rec,/* out: clustered record or an old version of
it, NULL if the old version did not exist
in the read view, i.e., it was a fresh
ulint**		offsets,/* in: offsets returned by
rec_get_offsets(rec, sec_index);
out: offsets returned by
rec_get_offsets(out_rec, clust_index) */
mem_heap_t**	offset_heap,/* in/out: memory heap from which
the offsets are allocated */
mtr_t*		mtr)	/* in: mtr used to get access to the
non-clustered record; the same mtr is used to
access the clustered index */
dict_index_t*	clust_index;
const rec_t*	clust_rec;
trx = thr_get_trx(thr);
row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
sec_index, *offsets, trx);
clust_index = dict_table_get_first_index(sec_index->table);
btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
PAGE_CUR_LE, BTR_SEARCH_LEAF,
prebuilt->clust_pcur, 0, mtr);
clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
prebuilt->clust_pcur->trx_if_known = trx;
/* Note: only if the search ends up on a non-infimum record is the
low_match value the real match to the search tuple */
if (!page_rec_is_user_rec(clust_rec)
|| btr_pcur_get_low_match(prebuilt->clust_pcur)
< dict_index_get_n_unique(clust_index)) {
/* In a rare case it is possible that no clust rec is found
for a delete-marked secondary index record: if in row0umod.c
in row_undo_mod_remove_clust_low() we have already removed
the clust rec, while purge is still cleaning and removing
secondary index records associated with earlier versions of
the clustered index record. In that case we know that the
clustered index record did not exist in the read view of
if (!rec_get_deleted_flag(rec,
dict_table_is_comp(sec_index->table))
|| prebuilt->select_lock_type != LOCK_NONE) {
ut_print_timestamp(stderr);
fputs(" InnoDB: error clustered record"
" for sec rec not found\n"
"InnoDB: ", stderr);
dict_index_name_print(stderr, trx, sec_index);
"InnoDB: sec index record ", stderr);
rec_print(stderr, rec, sec_index);
"InnoDB: clust index record ", stderr);
rec_print(stderr, clust_rec, clust_index);
trx_print(stderr, trx, 600);
"InnoDB: Submit a detailed bug report"
" to http://bugs.mysql.com\n", stderr);
*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
ULINT_UNDEFINED, offset_heap);
if (prebuilt->select_lock_type != LOCK_NONE) {
/* Try to place a lock on the index record; we are searching
the clust rec with a unique condition, hence
we set a LOCK_REC_NOT_GAP type lock */
err = lock_clust_rec_read_check_and_lock(
0, btr_pcur_get_block(prebuilt->clust_pcur),
clust_rec, clust_index, *offsets,
prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
if (err != DB_SUCCESS) {
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
/* If the isolation level allows reading of uncommitted data,
then we never look for an earlier version */
if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
&& !lock_clust_rec_cons_read_sees(
clust_rec, clust_index, *offsets,
/* The following call returns 'offsets' associated with
err = row_sel_build_prev_vers_for_mysql(
trx->read_view, clust_index, prebuilt,
clust_rec, offsets, offset_heap, &old_vers,
if (err != DB_SUCCESS || old_vers == NULL) {
clust_rec = old_vers;
/* If we had to go to an earlier version of the row, or if the
secondary index record is delete-marked, then it may be that the
secondary index record corresponding to clust_rec (or old_vers)
is not rec; in that case we must ignore such a row because in our
snapshot rec would not have existed. Remember that from rec we
cannot see directly which transaction id corresponds to it: we
have to go to the clustered index record. A query that fetches
all rows where the secondary index value is in some interval
would return a wrong result if we did not drop rows that we come
to visit through secondary index records that would not really
exist in our snapshot. */
|| rec_get_deleted_flag(rec, dict_table_is_comp(
&& !row_sel_sec_rec_is_for_clust_rec(
rec, sec_index, clust_rec, clust_index)) {
#ifdef UNIV_SEARCH_DEBUG
ut_a(clust_rec == NULL
|| row_sel_sec_rec_is_for_clust_rec(
rec, sec_index, clust_rec, clust_index));
*out_rec = clust_rec;
if (prebuilt->select_lock_type == LOCK_X) {
/* We may use the cursor in update: store its position */
btr_pcur_store_position(prebuilt->clust_pcur, mtr);
/************************************************************************
Restores cursor position after it has been stored. We have to take into
account that the record the cursor was positioned on may have been deleted.
Then we may have to move the cursor one step up or down. */
sel_restore_position_for_mysql(
/*===========================*/
/* out: TRUE if we may need to
process the record the cursor is
now positioned on (i.e. we should
not go to the next record yet) */
ibool*		same_user_rec,	/* out: TRUE if we were able to restore
the cursor on a user record with the
same ordering prefix in the
ulint		latch_mode,	/* in: latch mode wished in
btr_pcur_t*	pcur,		/* in: cursor whose position
ibool		moves_up,	/* in: TRUE if the cursor moves up
mtr_t*		mtr)		/* in: mtr; CAUTION: may commit
ulint	relative_position;
relative_position = pcur->rel_pos;
success = btr_pcur_restore_position(latch_mode, pcur, mtr);
*same_user_rec = success;
if (relative_position == BTR_PCUR_ON) {
btr_pcur_move_to_next(pcur, mtr);
if (relative_position == BTR_PCUR_AFTER
|| relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
if (btr_pcur_is_on_user_rec(pcur)) {
btr_pcur_move_to_prev(pcur, mtr);
ut_ad(relative_position == BTR_PCUR_BEFORE
|| relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
btr_pcur_move_to_next(pcur, mtr);
/************************************************************************
Pops a cached row for MySQL from the fetch cache. */
row_sel_pop_cached_row_for_mysql(
/*=============================*/
byte*		buf,		/* in/out: buffer where to copy the
row_prebuilt_t*	prebuilt)	/* in: prebuilt struct */
mysql_row_templ_t*	templ;
ut_ad(prebuilt->n_fetch_cached > 0);
ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
/* Copy cache record field by field, don't touch fields that
are not covered by current key */
cached_rec = prebuilt->fetch_cache[
prebuilt->fetch_cache_first];
for (i = 0; i < prebuilt->n_template; i++) {
templ = prebuilt->mysql_template + i;
ut_memcpy(buf + templ->mysql_col_offset,
cached_rec + templ->mysql_col_offset,
templ->mysql_col_len);
/* Copy NULL bit of the current field from cached_rec
if (templ->mysql_null_bit_mask) {
buf[templ->mysql_null_byte_offset]
^= (buf[templ->mysql_null_byte_offset]
^ cached_rec[templ->mysql_null_byte_offset])
& (byte) templ->mysql_null_bit_mask;
prebuilt->fetch_cache[prebuilt->fetch_cache_first],
prebuilt->mysql_prefix_len);
prebuilt->n_fetch_cached--;
prebuilt->fetch_cache_first++;
if (prebuilt->n_fetch_cached == 0) {
prebuilt->fetch_cache_first = 0;
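/* Illustrative standalone sketch (not InnoDB code) of the XOR trick
used above to copy a single NULL bit from the cached record into the
result buffer: dst ^= (dst ^ src) & mask leaves every bit outside mask
untouched and makes the bits inside mask equal to src. The helper name
is hypothetical. */
#include <stdint.h>

static uint8_t
copy_masked_bits_sketch(uint8_t dst, uint8_t src, uint8_t mask)
{
	dst ^= (uint8_t) ((dst ^ src) & mask);
	return(dst);
}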
/************************************************************************
Pushes a row for MySQL to the fetch cache. */
row_sel_push_cache_row_for_mysql(
/*=============================*/
row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
const rec_t*	rec,		/* in: record to push; must
be protected by a page latch */
const ulint*	offsets,	/* in: rec_get_offsets() */
ulint		start_field_no,	/* psergy: start from this field */
byte*		remainder_buf)	/* if above != 0 -> where to take
ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
ut_ad(rec_offs_validate(rec, NULL, offsets));
ut_a(!prebuilt->templ_contains_blob);
if (prebuilt->fetch_cache[0] == NULL) {
/* Allocate memory for the fetch cache */
for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
/* A user has reported memory corruption in these
buffers in Linux. Put magic numbers there to help
to track a possible bug. */
buf = mem_alloc(prebuilt->mysql_row_len + 8);
prebuilt->fetch_cache[i] = buf + 4;
mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
ROW_PREBUILT_FETCH_MAGIC_N);
ut_ad(prebuilt->fetch_cache_first == 0);
if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
prebuilt->fetch_cache[
prebuilt->n_fetch_cached],
prebuilt, rec, offsets, start_field_no,
prebuilt->n_template))) {
if (start_field_no) {
for (i = 0; i < start_field_no; i++) {
register ulint offs;
mysql_row_templ_t* templ;
templ = prebuilt->mysql_template + i;
if (templ->mysql_null_bit_mask) {
offs = templ->mysql_null_byte_offset;
if (*(remainder_buf + offs) & templ->mysql_null_bit_mask)
*(prebuilt->fetch_cache[prebuilt->n_fetch_cached] + offs) |=
templ->mysql_null_bit_mask;
*(prebuilt->fetch_cache[prebuilt->n_fetch_cached] + offs) &=
~templ->mysql_null_bit_mask;
offs = templ->mysql_col_offset;
memcpy(prebuilt->fetch_cache[prebuilt->n_fetch_cached] + offs,
remainder_buf + offs,
templ->mysql_col_len);
prebuilt->n_fetch_cached++;
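/* Illustrative standalone sketch (not InnoDB code) of the guarded
buffer layout allocated above: each fetch cache slot is surrounded by a
4-byte magic number on both sides so that an overrun of mysql_row_len
bytes can be detected later. The magic value, byte order and helper
name here are hypothetical; InnoDB uses ROW_PREBUILT_FETCH_MAGIC_N and
mach_write_to_4(). */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define FETCH_MAGIC_SKETCH	0x46455443U	/* arbitrary stand-in value */

static uint8_t*
alloc_guarded_row_sketch(size_t row_len)
{
	uint32_t	magic = FETCH_MAGIC_SKETCH;
	uint8_t*	buf = malloc(row_len + 8);

	if (buf == NULL) {
		return(NULL);
	}

	memcpy(buf, &magic, 4);			/* leading guard */
	memcpy(buf + 4 + row_len, &magic, 4);	/* trailing guard */

	return(buf + 4);	/* the caller sees only the row area */
}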
/*************************************************************************
Tries to do a shortcut to fetch a clustered index record with a unique key,
using the hash index if possible (not always). We assume that the search
mode is PAGE_CUR_GE, that this is a consistent read, that there is a read
view in trx, and that the btr search latch has been locked in S-mode. */
row_sel_try_search_shortcut_for_mysql(
/*==================================*/
/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
const rec_t**	out_rec,/* out: record if found */
row_prebuilt_t*	prebuilt,/* in: prebuilt struct */
ulint**		offsets,/* in/out: for rec_get_offsets(*out_rec) */
mem_heap_t**	heap,	/* in/out: heap for rec_get_offsets() */
mtr_t*		mtr)	/* in: started mtr */
dict_index_t*	index		= prebuilt->index;
const dtuple_t*	search_tuple	= prebuilt->search_tuple;
btr_pcur_t*	pcur		= prebuilt->pcur;
trx_t*		trx		= prebuilt->trx;
ut_ad(dict_index_is_clust(index));
ut_ad(!prebuilt->templ_contains_blob);
btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
BTR_SEARCH_LEAF, pcur,
#ifndef UNIV_SEARCH_DEBUG
rec = btr_pcur_get_rec(pcur);
if (!page_rec_is_user_rec(rec)) {
/* As the cursor is now placed on a user record after a search with
the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
fields in the user record matched to the search tuple */
if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
return(SEL_EXHAUSTED);
/* This is a non-locking consistent read: if necessary, fetch
a previous version of the record */
*offsets = rec_get_offsets(rec, index, *offsets,
ULINT_UNDEFINED, heap);
if (!lock_clust_rec_cons_read_sees(rec, index,
*offsets, trx->read_view)) {
if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
return(SEL_EXHAUSTED);
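/* Illustrative standalone sketch (not InnoDB code) of the three-way
contract of the search shortcut above. The enum values and helper are
hypothetical stand-ins; InnoDB defines its own SEL_EXHAUSTED/SEL_RETRY
flags near the top of this file. */
enum shortcut_result_sketch {
	SKETCH_SEL_FOUND = 0,	/* record found, already under a page latch */
	SKETCH_SEL_EXHAUSTED,	/* no match: report "record not found" */
	SKETCH_SEL_RETRY	/* shortcut unusable: run the normal search */
};

static const char*
describe_shortcut_result_sketch(enum shortcut_result_sketch r)
{
	switch (r) {
	case SKETCH_SEL_FOUND:
		return("use the record under the page latch");
	case SKETCH_SEL_EXHAUSTED:
		return("return DB_RECORD_NOT_FOUND to the caller");
	case SKETCH_SEL_RETRY:
	default:
		return("fall back to the normal cursor search");
	}
}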
/************************************************************************
Searches for rows in the database. This is used in the interface to
MySQL. This function opens a cursor, and also implements fetch next
and fetch prev. NOTE that if we do a search with a full key value
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
position, and fetch next or fetch prev must not be attempted on the cursor! */
row_search_for_mysql(
/*=================*/
DB_RECORD_NOT_FOUND,
DB_END_OF_INDEX, DB_DEADLOCK,
DB_LOCK_TABLE_FULL, DB_CORRUPTION,
or DB_TOO_BIG_RECORD */
byte*		buf,		/* in/out: buffer for the fetched
row in the MySQL format */
ulint		mode,		/* in: search mode PAGE_CUR_L, ... */
row_prebuilt_t*	prebuilt,	/* in: prebuilt struct for the
table handle; this contains the info
of search_tuple, index; if search
tuple contains 0 fields then we
position the cursor at the start or
the end of the index, depending on
ulint		match_mode,	/* in: 0 or ROW_SEL_EXACT or
ROW_SEL_EXACT_PREFIX */
ulint		direction)	/* in: 0 or ROW_SEL_NEXT or
ROW_SEL_PREV; NOTE: if this is != 0,
then prebuilt must have a pcur
with stored position! In opening of a
cursor 'direction' should be 0. */
dict_index_t*	index		= prebuilt->index;
ibool		comp		= dict_table_is_comp(index->table);
const dtuple_t*	search_tuple	= prebuilt->search_tuple;
btr_pcur_t*	pcur		= prebuilt->pcur;
trx_t*		trx		= prebuilt->trx;
dict_index_t*	clust_index;
const rec_t*	result_rec;
const rec_t*	clust_rec;
ulint		err				= DB_SUCCESS;
ibool		unique_search			= FALSE;
ibool		unique_search_from_clust_index	= FALSE;
ibool		mtr_has_extra_clust_latch	= FALSE;
ibool		moves_up			= FALSE;
ibool		set_also_gap_locks		= TRUE;
/* if the query is a plain locking SELECT, and the isolation level
is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
ibool		did_semi_consistent_read	= FALSE;
/* if the returned record was locked and we did a semi-consistent
read (fetch the newest committed version), then this is set to
#ifdef UNIV_SEARCH_DEBUG
#endif /* UNIV_SEARCH_DEBUG */
ibool		same_user_rec;
mem_heap_t*	heap				= NULL;
ulint		offsets_[REC_OFFS_NORMAL_SIZE];
ulint*		offsets				= offsets_;
ibool		some_fields_in_buffer;
ibool		get_clust_rec			= 0;
rec_offs_init(offsets_);
ut_ad(index && pcur && search_tuple);
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3420
if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3421
ut_print_timestamp(stderr);
3422
fprintf(stderr, " InnoDB: Error:\n"
3423
"InnoDB: MySQL is trying to use a table handle"
3424
" but the .ibd file for\n"
3425
"InnoDB: table %s does not exist.\n"
3426
"InnoDB: Have you deleted the .ibd file"
3427
" from the database directory under\n"
3428
"InnoDB: the MySQL datadir, or have you used"
3429
" DISCARD TABLESPACE?\n"
3430
"InnoDB: Look from\n"
3431
"InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
3432
"innodb-troubleshooting.html\n"
3433
"InnoDB: how you can resolve the problem.\n",
3434
prebuilt->table->name);
3439
if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3441
"InnoDB: Error: trying to free a corrupt\n"
3442
"InnoDB: table handle. Magic n %lu, table name ",
3443
(ulong) prebuilt->magic_n);
3444
ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3447
mem_analyze_corruption(prebuilt);
3453
/* August 19, 2005 by Heikki: temporarily disable this error
3454
print until the cursor lock count is done correctly.
3455
See bugs #12263 and #12456!*/
3457
if (trx->n_mysql_tables_in_use == 0
3458
&& UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3459
/* Note that if MySQL uses an InnoDB temp table that it
3460
created inside LOCK TABLES, then n_mysql_tables_in_use can
3461
be zero; in that case select_lock_type is set to LOCK_X in
3464
fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3465
"InnoDB: but it has not locked"
3466
" any tables in ::external_lock()!\n",
3468
trx_print(stderr, trx, 600);
3469
fputc('\n', stderr);
3474
fprintf(stderr, "Match mode %lu\n search tuple ",
3475
(ulong) match_mode);
3476
dtuple_print(search_tuple);
3477
fprintf(stderr, "N tables locked %lu\n",
3478
(ulong) trx->mysql_n_tables_locked);
3480
/*-------------------------------------------------------------*/
3481
/* PHASE 0: Release a possible s-latch we are holding on the
3482
adaptive hash index latch if there is someone waiting behind */
3484
if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED)
3485
&& trx->has_search_latch) {
3487
/* There is an x-latch request on the adaptive hash index:
3488
release the s-latch to reduce starvation and wait for
3489
BTR_SEA_TIMEOUT rounds before trying to keep it again over
3492
rw_lock_s_unlock(&btr_search_latch);
3493
trx->has_search_latch = FALSE;
3495
trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3498
/* Reset the new record lock info if srv_locks_unsafe_for_binlog
is set or the session is using a READ COMMITTED isolation level. Then
we are able to remove the record locks set here on an individual
if ((srv_locks_unsafe_for_binlog
3504
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
3505
&& prebuilt->select_lock_type != LOCK_NONE) {
3507
trx_reset_new_rec_lock_info(trx);
3510
/*-------------------------------------------------------------*/
3511
/* PHASE 1: Try to pop the row from the prefetch cache */
3513
if (UNIV_UNLIKELY(direction == 0)) {
3514
trx->op_info = "starting index read";
3516
prebuilt->n_rows_fetched = 0;
3517
prebuilt->n_fetch_cached = 0;
3518
prebuilt->fetch_cache_first = 0;
3520
if (prebuilt->sel_graph == NULL) {
3521
/* Build a dummy select query graph */
3522
row_prebuild_sel_graph(prebuilt);
3525
trx->op_info = "fetching rows";
3527
if (prebuilt->n_rows_fetched == 0) {
3528
prebuilt->fetch_direction = direction;
3531
if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3532
if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3534
/* TODO: scrollable cursor: restore cursor to
3535
the place of the latest returned row,
3536
or better: prevent caching for a scroll
3540
prebuilt->n_rows_fetched = 0;
3541
prebuilt->n_fetch_cached = 0;
3542
prebuilt->fetch_cache_first = 0;
3544
} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3545
row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3547
prebuilt->n_rows_fetched++;
3554
if (prebuilt->fetch_cache_first > 0
3555
&& prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3557
/* The previous returned row was popped from the fetch
3558
cache, but the cache was not full at the time of the
3559
popping: no more rows can exist in the result set */
3561
err = DB_RECORD_NOT_FOUND;
3565
prebuilt->n_rows_fetched++;
3567
if (prebuilt->n_rows_fetched > 1000000000) {
3568
/* Prevent wrap-over */
3569
prebuilt->n_rows_fetched = 500000000;
3572
mode = pcur->search_mode;
3575
/* In a search where at most one record in the index may match, we
3576
can use a LOCK_REC_NOT_GAP type record lock when locking a
3577
non-delete-marked matching record.
3579
Note that in a unique secondary index there may be different
3580
delete-marked versions of a record where only the primary key
3581
values differ: thus in a secondary index we must use next-key
3582
locks when locking delete-marked records. */
3584
if (match_mode == ROW_SEL_EXACT
3585
&& dict_index_is_unique(index)
3586
&& dtuple_get_n_fields(search_tuple)
3587
== dict_index_get_n_unique(index)
3588
&& (dict_index_is_clust(index)
3589
|| !dtuple_contains_null(search_tuple))) {
3591
/* Note above that a UNIQUE secondary index can contain many
3592
rows with the same key value if one of the columns is the SQL
3593
null. A clustered index under MySQL can never contain null
3594
columns because we demand that all the columns in primary key
3597
unique_search = TRUE;
3599
/* Even if the condition is unique, MySQL seems to try to
3600
retrieve also a second row if a primary key contains more than
3601
1 column. Return immediately if this is not a HANDLER
3604
if (UNIV_UNLIKELY(direction != 0
3605
&& !prebuilt->used_in_HANDLER)) {
3607
err = DB_RECORD_NOT_FOUND;
3614
/*-------------------------------------------------------------*/
3615
/* PHASE 2: Try fast adaptive hash index search if possible */
3617
/* Next test if this is the special case where we can use the fast
3618
adaptive hash index to try the search. Since we must release the
3619
search system latch when we retrieve an externally stored field, we
3620
cannot use the adaptive hash index in a search in the case the row
3621
may be long and there may be externally stored fields */
3623
if (UNIV_UNLIKELY(direction == 0)
3625
&& dict_index_is_clust(index)
3626
&& !prebuilt->templ_contains_blob
3627
&& !prebuilt->used_in_HANDLER
3628
&& (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3632
unique_search_from_clust_index = TRUE;
3634
if (trx->mysql_n_tables_locked == 0
3635
&& prebuilt->select_lock_type == LOCK_NONE
3636
&& trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3637
&& trx->read_view) {
3639
/* This is a SELECT query done as a consistent read,
3640
and the read view has already been allocated:
3641
let us try a search shortcut through the hash
3643
NOTE that we must also test that
3644
mysql_n_tables_locked == 0, because this might
3645
also be INSERT INTO ... SELECT ... or
3646
CREATE TABLE ... SELECT ... . Our algorithm is
3647
NOT prepared to inserts interleaved with the SELECT,
3648
and if we try that, we can deadlock on the adaptive
3649
hash index semaphore! */
3651
#ifndef UNIV_SEARCH_DEBUG
3652
if (!trx->has_search_latch) {
3653
rw_lock_s_lock(&btr_search_latch);
3654
trx->has_search_latch = TRUE;
3657
switch (row_sel_try_search_shortcut_for_mysql(
3658
&rec, prebuilt, &offsets, &heap,
3661
#ifdef UNIV_SEARCH_DEBUG
3662
ut_a(0 == cmp_dtuple_rec(search_tuple,
3665
/* At this point, rec is protected by
3666
a page latch that was acquired by
3667
row_sel_try_search_shortcut_for_mysql().
3668
The latch will not be released until
3669
mtr_commit(&mtr). */
3671
if (!row_sel_store_mysql_rec(buf, prebuilt,
3673
prebuilt->n_template)) {
3674
err = DB_TOO_BIG_RECORD;
3676
/* We let the main loop to do the
3678
goto shortcut_fails_too_big_rec;
3683
/* ut_print_name(stderr, index->name);
3684
fputs(" shortcut\n", stderr); */
3689
goto release_search_latch_if_needed;
3694
/* ut_print_name(stderr, index->name);
3695
fputs(" record not found 2\n", stderr); */
3697
err = DB_RECORD_NOT_FOUND;
3698
release_search_latch_if_needed:
3699
if (trx->search_latch_timeout > 0
3700
&& trx->has_search_latch) {
3702
trx->search_latch_timeout--;
3704
rw_lock_s_unlock(&btr_search_latch);
3705
trx->has_search_latch = FALSE;
3708
/* NOTE that we do NOT store the cursor
3718
shortcut_fails_too_big_rec:
3724
/*-------------------------------------------------------------*/
3725
/* PHASE 3: Open or restore index cursor position */
3727
if (trx->has_search_latch) {
3728
rw_lock_s_unlock(&btr_search_latch);
3729
trx->has_search_latch = FALSE;
3732
trx_start_if_not_started(trx);
3734
if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3735
&& prebuilt->select_lock_type != LOCK_NONE
3736
&& trx->mysql_thd != NULL
3737
&& trx->mysql_query_str != NULL
3738
&& *trx->mysql_query_str != NULL) {
3740
/* Scan the MySQL query string; check if SELECT is the first
3743
if (dict_str_starts_with_keyword(
3744
trx->mysql_thd, *trx->mysql_query_str, "SELECT")) {
3745
/* It is a plain locking SELECT and the isolation
3746
level is low: do not lock gaps */
3748
set_also_gap_locks = FALSE;
3752
/* Note that if the search mode was GE or G, then the cursor
3753
naturally moves upward (in fetch next) in alphabetical order,
3754
otherwise downward */
3756
if (UNIV_UNLIKELY(direction == 0)) {
3757
if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3760
} else if (direction == ROW_SEL_NEXT) {
3764
thr = que_fork_get_first_thr(prebuilt->sel_graph);
3766
que_thr_move_to_run_state_for_mysql(thr, trx);
3768
clust_index = dict_table_get_first_index(index->table);
3770
if (UNIV_LIKELY(direction != 0)) {
3771
ibool need_to_process = sel_restore_position_for_mysql(
3772
&same_user_rec, BTR_SEARCH_LEAF,
3773
pcur, moves_up, &mtr);
3775
if (UNIV_UNLIKELY(need_to_process)) {
3776
if (UNIV_UNLIKELY(prebuilt->row_read_type
3777
== ROW_READ_DID_SEMI_CONSISTENT)) {
3778
/* We did a semi-consistent read,
3779
but the record was removed in
3781
prebuilt->row_read_type
3782
= ROW_READ_TRY_SEMI_CONSISTENT;
3784
} else if (UNIV_LIKELY(prebuilt->row_read_type
3785
!= ROW_READ_DID_SEMI_CONSISTENT)) {
3787
/* The cursor was positioned on the record
3788
that we returned previously. If we need
3789
to repeat a semi-consistent read as a
3790
pessimistic locking read, the record
3791
cannot be skipped. */
3796
} else if (dtuple_get_n_fields(search_tuple) > 0) {
3798
btr_pcur_open_with_no_init(index, search_tuple, mode,
3802
pcur->trx_if_known = trx;
3804
rec = btr_pcur_get_rec(pcur);
3807
&& !page_rec_is_supremum(rec)
3808
&& set_also_gap_locks
3809
&& !(srv_locks_unsafe_for_binlog
3810
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
3811
&& prebuilt->select_lock_type != LOCK_NONE) {
3813
/* Try to place a gap lock on the next index record
3814
to prevent phantoms in ORDER BY ... DESC queries */
3815
const rec_t* next = page_rec_get_next_const(rec);
3817
offsets = rec_get_offsets(next, index, offsets,
3818
ULINT_UNDEFINED, &heap);
3819
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3820
next, index, offsets,
3821
prebuilt->select_lock_type,
3824
if (err != DB_SUCCESS) {
3826
goto lock_wait_or_error;
3830
if (mode == PAGE_CUR_G) {
3831
btr_pcur_open_at_index_side(
3832
TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3834
} else if (mode == PAGE_CUR_L) {
3835
btr_pcur_open_at_index_side(
3836
FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3841
if (!prebuilt->sql_stat_start) {
3842
/* No need to set an intention lock or assign a read view */
3844
if (trx->read_view == NULL
3845
&& prebuilt->select_lock_type == LOCK_NONE) {
3847
fputs("InnoDB: Error: MySQL is trying to"
3848
" perform a consistent read\n"
3849
"InnoDB: but the read view is not assigned!\n",
3851
trx_print(stderr, trx, 600);
3852
fputc('\n', stderr);
3855
} else if (prebuilt->select_lock_type == LOCK_NONE) {
3856
/* This is a consistent read */
3857
/* Assign a read view for the query */
3859
trx_assign_read_view(trx);
3860
prebuilt->sql_stat_start = FALSE;
3863
if (prebuilt->select_lock_type == LOCK_S) {
3864
lock_mode = LOCK_IS;
3866
lock_mode = LOCK_IX;
3868
err = lock_table(0, index->table, lock_mode, thr);
3870
if (err != DB_SUCCESS) {
3872
goto lock_wait_or_error;
3874
prebuilt->sql_stat_start = FALSE;
3878
/*-------------------------------------------------------------*/
3879
/* PHASE 4: Look for matching records in a loop */
3881
rec = btr_pcur_get_rec(pcur);
3882
ut_ad(!!page_rec_is_comp(rec) == comp);
3883
#ifdef UNIV_SEARCH_DEBUG
3885
fputs("Using ", stderr);
3886
dict_index_name_print(stderr, index);
3887
fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3888
page_get_page_no(page_align(rec)));
3891
#endif /* UNIV_SEARCH_DEBUG */
3893
if (page_rec_is_infimum(rec)) {
3895
/* The infimum record on a page cannot be in the result set,
3896
and neither can a record lock be placed on it: we skip such
3902
if (page_rec_is_supremum(rec)) {
3904
if (set_also_gap_locks
3905
&& !(srv_locks_unsafe_for_binlog
3906
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
3907
&& prebuilt->select_lock_type != LOCK_NONE) {
3909
/* Try to place a lock on the index record */
3911
/* If innodb_locks_unsafe_for_binlog option is used
3912
or this session is using a READ COMMITTED isolation
3913
level we do not lock gaps. Supremum record is really
3914
a gap and therefore we do not set locks there. */
3916
offsets = rec_get_offsets(rec, index, offsets,
3917
ULINT_UNDEFINED, &heap);
3918
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
3919
rec, index, offsets,
3920
prebuilt->select_lock_type,
3921
LOCK_ORDINARY, thr);
3923
if (err != DB_SUCCESS) {
3925
goto lock_wait_or_error;
3928
/* A page supremum record cannot be in the result set: skip
3929
it now that we have placed a possible lock on it */
3934
/*-------------------------------------------------------------*/
3935
/* Do sanity checks in case our cursor has bumped into page
3939
next_offs = rec_get_next_offs(rec, TRUE);
3940
if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3945
next_offs = rec_get_next_offs(rec, FALSE);
3946
if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
3952
if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
3955
if (srv_force_recovery == 0 || moves_up == FALSE) {
3956
ut_print_timestamp(stderr);
3957
buf_page_print(page_align(rec), 0);
3959
"\nInnoDB: rec address %p,"
3960
" buf block fix count %lu\n",
3961
(void*) rec, (ulong)
3962
btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
3963
->page.buf_fix_count);
3965
"InnoDB: Index corruption: rec offs %lu"
3966
" next offs %lu, page no %lu,\n"
3968
(ulong) page_offset(rec),
3970
(ulong) page_get_page_no(page_align(rec)));
3971
dict_index_name_print(stderr, trx, index);
3972
fputs(". Run CHECK TABLE. You may need to\n"
3973
"InnoDB: restore from a backup, or"
3974
" dump + drop + reimport the table.\n",
3977
err = DB_CORRUPTION;
3979
goto lock_wait_or_error;
3981
/* The user may be dumping a corrupt table. Jump
3982
over the corruption to recover as much as possible. */
3985
"InnoDB: Index corruption: rec offs %lu"
3986
" next offs %lu, page no %lu,\n"
3988
(ulong) page_offset(rec),
3990
(ulong) page_get_page_no(page_align(rec)));
3991
dict_index_name_print(stderr, trx, index);
3992
fputs(". We try to skip the rest of the page.\n",
3995
btr_pcur_move_to_last_on_page(pcur, &mtr);
4000
/*-------------------------------------------------------------*/
4002
/* Calculate the 'offsets' associated with 'rec' */
4004
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4006
if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
4007
if (!rec_validate(rec, offsets)
4008
|| !btr_index_rec_validate(rec, index, FALSE)) {
4010
"InnoDB: Index corruption: rec offs %lu"
4011
" next offs %lu, page no %lu,\n"
4013
(ulong) page_offset(rec),
4015
(ulong) page_get_page_no(page_align(rec)));
4016
dict_index_name_print(stderr, trx, index);
4017
fputs(". We try to skip the record.\n",
4024
/* Note that we cannot trust the up_match value in the cursor at this
4025
place because we can arrive here after moving the cursor! Thus
4026
we have to recompare rec and search_tuple to determine if they
4029
if (match_mode == ROW_SEL_EXACT) {
4030
/* Test if the index record matches completely to search_tuple
4031
in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
4033
/* fputs("Comparing rec and search tuple\n", stderr); */
4035
if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
4037
if (set_also_gap_locks
4038
&& !(srv_locks_unsafe_for_binlog
4039
|| trx->isolation_level
4040
== TRX_ISO_READ_COMMITTED)
4041
&& prebuilt->select_lock_type != LOCK_NONE) {
4043
/* Try to place a gap lock on the index
4044
record only if innodb_locks_unsafe_for_binlog
4045
option is not set or this session is not
4046
using a READ COMMITTED isolation level. */
4048
err = sel_set_rec_lock(
4049
btr_pcur_get_block(pcur),
4050
rec, index, offsets,
4051
prebuilt->select_lock_type, LOCK_GAP,
4054
if (err != DB_SUCCESS) {
4056
goto lock_wait_or_error;
4060
btr_pcur_store_position(pcur, &mtr);
4062
err = DB_RECORD_NOT_FOUND;
4063
/* ut_print_name(stderr, index->name);
4064
fputs(" record not found 3\n", stderr); */
4069
} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
4071
if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
4073
if (set_also_gap_locks
4074
&& !(srv_locks_unsafe_for_binlog
4075
|| trx->isolation_level
4076
== TRX_ISO_READ_COMMITTED)
4077
&& prebuilt->select_lock_type != LOCK_NONE) {
4079
/* Try to place a gap lock on the index
4080
record only if innodb_locks_unsafe_for_binlog
4081
option is not set or this session is not
4082
using a READ COMMITTED isolation level. */
4084
err = sel_set_rec_lock(
4085
btr_pcur_get_block(pcur),
4086
rec, index, offsets,
4087
prebuilt->select_lock_type, LOCK_GAP,
4090
if (err != DB_SUCCESS) {
4092
goto lock_wait_or_error;
4096
btr_pcur_store_position(pcur, &mtr);
4098
err = DB_RECORD_NOT_FOUND;
4099
/* ut_print_name(stderr, index->name);
4100
fputs(" record not found 4\n", stderr); */
4106
/* We are ready to look at a possible new index entry in the result
4107
set: the cursor is now placed on a user record */
4109
if (prebuilt->select_lock_type != LOCK_NONE) {
4110
/* Try to place a lock on the index record; note that delete
4111
marked records are a special case in a unique search. If there
4112
is a non-delete marked record, then it is enough to lock its
4113
existence with LOCK_REC_NOT_GAP. */
4115
/* If innodb_locks_unsafe_for_binlog option is used
or this session is using a READ COMMITTED isolation
level, we lock only the record, i.e., next-key locking is
if (!set_also_gap_locks
4123
|| srv_locks_unsafe_for_binlog
4124
|| trx->isolation_level == TRX_ISO_READ_COMMITTED
4126
&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
4130
lock_type = LOCK_ORDINARY;
4133
/* If we are doing a 'greater or equal than a primary key
4134
value' search from a clustered index, and we find a record
4135
that has that exact primary key value, then there is no need
4136
to lock the gap before the record, because no insert in the
4137
gap can be in our search range. That is, no phantom row can
4140
An example: if col1 is the primary key, the search is WHERE
4141
col1 >= 100, and we find a record where col1 = 100, then no
4142
need to lock the gap before that record. */
4144
if (index == clust_index
4145
&& mode == PAGE_CUR_GE
4147
&& dtuple_get_n_fields_cmp(search_tuple)
4148
== dict_index_get_n_unique(index)
4149
&& 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4151
lock_type = LOCK_REC_NOT_GAP;
4154
err = sel_set_rec_lock(btr_pcur_get_block(pcur),
4155
rec, index, offsets,
4156
prebuilt->select_lock_type,
4160
const rec_t* old_vers;
4164
if (UNIV_LIKELY(prebuilt->row_read_type
4165
!= ROW_READ_TRY_SEMI_CONSISTENT)
4166
|| index != clust_index) {
4168
goto lock_wait_or_error;
4171
/* The following call returns 'offsets'
4172
associated with 'old_vers' */
4173
err = row_sel_build_committed_vers_for_mysql(
4174
clust_index, prebuilt, rec,
4175
&offsets, &heap, &old_vers, &mtr);
4177
if (err != DB_SUCCESS) {
4179
goto lock_wait_or_error;
4182
mutex_enter(&kernel_mutex);
4183
if (trx->was_chosen_as_deadlock_victim) {
4184
mutex_exit(&kernel_mutex);
4187
goto lock_wait_or_error;
4189
if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4190
lock_cancel_waiting_and_release(
4192
trx_reset_new_rec_lock_info(trx);
4194
mutex_exit(&kernel_mutex);
4196
/* The lock was granted while we were
4197
searching for the last committed version.
4198
Do a normal locking read. */
4200
offsets = rec_get_offsets(rec, index, offsets,
4206
mutex_exit(&kernel_mutex);
4208
if (old_vers == NULL) {
4209
/* The row was not yet committed */
4214
did_semi_consistent_read = TRUE;
4219
goto lock_wait_or_error;
4222
/* This is a non-locking consistent read: if necessary, fetch
4223
a previous version of the record */
4225
if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4227
/* Do nothing: we let a non-locking SELECT read the
4228
latest version of the record */
4230
} else if (index == clust_index) {
4232
/* Fetch a previous version of the row if the current
4233
one is not visible in the snapshot; if we have a very
4234
high force recovery level set, we try to avoid crashes
4235
by skipping this lookup */
4237
if (UNIV_LIKELY(srv_force_recovery < 5)
4238
&& !lock_clust_rec_cons_read_sees(
4239
rec, index, offsets, trx->read_view)) {
4242
/* The following call returns 'offsets'
4243
associated with 'old_vers' */
4244
err = row_sel_build_prev_vers_for_mysql(
4245
trx->read_view, clust_index,
4246
prebuilt, rec, &offsets, &heap,
4249
if (err != DB_SUCCESS) {
4251
goto lock_wait_or_error;
4254
if (old_vers == NULL) {
4255
/* The row did not exist yet in
4263
} else if (!lock_sec_rec_cons_read_sees(rec, trx->read_view)) {
4264
/* We are looking into a non-clustered index,
4265
and to get the right version of the record we
4266
have to look also into the clustered index: this
4267
is necessary, because we can only get the undo
4268
information via the clustered index record. */
4270
ut_ad(index != clust_index);
4271
get_clust_rec= TRUE;
4272
goto idx_cond_check;
4276
/* NOTE that at this point rec can be an old version of a clustered
4277
index record built for a consistent read. We cannot assume after this
4278
point that rec is on a buffer pool page. Functions like
4279
page_rec_is_comp() cannot be used! */
4281
if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4283
/* The record is delete-marked: we can skip it */
4285
if ((srv_locks_unsafe_for_binlog
4286
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4287
&& prebuilt->select_lock_type != LOCK_NONE
4288
&& !did_semi_consistent_read) {
4290
/* No need to keep a lock on a delete-marked record
4291
if we do not want to use next-key locking. */
4293
row_unlock_for_mysql(prebuilt, TRUE);
4296
/* This is an optimization to skip setting the next key lock
4297
on the record that follows this delete-marked record. This
4298
optimization works because of the unique search criteria
4299
which precludes the presence of a range lock between this
4300
delete marked record and the record following it.
4302
For now this is applicable only to clustered indexes while
4303
doing a unique search. There is scope for further optimization
4304
applicable to unique secondary indexes. Current behaviour is
4305
to widen the scope of a lock on an already delete marked record
4306
if the same record is deleted twice by the same transaction */
4307
if (index == clust_index && unique_search) {
4308
err = DB_RECORD_NOT_FOUND;
4317
if (prebuilt->idx_cond_func)
4320
ut_ad(prebuilt->template_type != ROW_DRIZZLE_DUMMY_TEMPLATE);
4321
offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4322
row_sel_store_mysql_rec(buf, prebuilt, rec,
4323
offsets, 0, prebuilt->n_index_fields);
4324
res= prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
4329
err = DB_RECORD_NOT_FOUND;
4330
goto idx_cond_failed;
4334
/* Get the clustered index record if needed, if we did not do the
4335
search using the clustered index. */
4337
if (get_clust_rec || (index != clust_index &&
4338
prebuilt->need_to_access_clustered)) {
4340
/* We use a 'goto' to the preceding label if a consistent
4341
read of a secondary index record requires us to look up old
4342
versions of the associated clustered index record. */
4344
ut_ad(rec_offs_validate(rec, index, offsets));
4346
/* It was a non-clustered index and we must fetch also the
4347
clustered index record */
4349
mtr_has_extra_clust_latch = TRUE;
4351
/* The following call returns 'offsets' associated with
4352
'clust_rec'. Note that 'clust_rec' can be an old version
4353
built for a consistent read. */
4355
err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4357
&offsets, &heap, &mtr);
4358
if (err != DB_SUCCESS) {
4360
goto lock_wait_or_error;
4363
if (clust_rec == NULL) {
4364
/* The record did not exist in the read view */
4365
ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4370
if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4372
/* The record is delete marked: we can skip it */
4374
if ((srv_locks_unsafe_for_binlog
4375
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4376
&& prebuilt->select_lock_type != LOCK_NONE) {
4378
/* No need to keep a lock on a delete-marked
4379
record if we do not want to use next-key
4382
row_unlock_for_mysql(prebuilt, TRUE);
4388
if (prebuilt->need_to_access_clustered) {
4390
result_rec = clust_rec;
4392
ut_ad(rec_offs_validate(result_rec, clust_index,
4395
/* We used 'offsets' for the clust rec, recalculate
4397
offsets = rec_get_offsets(rec, index, offsets,
4398
ULINT_UNDEFINED, &heap);
4405
/* We found a qualifying record 'result_rec'. At this point,
4406
'offsets' are associated with 'result_rec'. */
4408
ut_ad(rec_offs_validate(result_rec,
4409
result_rec != rec ? clust_index : index,
4412
/* At this point, the clustered index record is protected
4413
by a page latch that was acquired when pcur was positioned.
4414
The latch will not be released until mtr_commit(&mtr). */
4416
if ((match_mode == ROW_SEL_EXACT
4417
|| prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4418
&& prebuilt->select_lock_type == LOCK_NONE
4419
&& !prebuilt->templ_contains_blob
4420
&& !prebuilt->clust_index_was_generated
4421
&& !prebuilt->used_in_HANDLER
4422
&& prebuilt->template_type
4423
!= ROW_MYSQL_DUMMY_TEMPLATE) {
4425
/* Inside an update, for example, we do not cache rows,
4426
since we may use the cursor position to do the actual
4427
update, that is why we require ...lock_type == LOCK_NONE.
4428
Since we keep space in prebuilt only for the BLOBs of
4429
a single row, we cannot cache rows in the case there
4430
are BLOBs in the fields to be fetched. In HANDLER we do
4431
not cache rows because there the cursor is a scrollable
4433
some_fields_in_buffer= (index != clust_index &&
4434
prebuilt->idx_cond_func);
4436
row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4438
some_fields_in_buffer?
4439
prebuilt->n_index_fields: 0,
4441
if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
4448
if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
4449
memcpy(buf + 4, result_rec
4450
- rec_offs_extra_size(offsets),
4451
rec_offs_size(offsets));
4452
mach_write_to_4(buf,
4453
rec_offs_extra_size(offsets) + 4);
4455
if (!row_sel_store_mysql_rec(buf, prebuilt,
4456
result_rec, offsets,
4457
prebuilt->idx_cond_func?
4458
prebuilt->n_index_fields: 0,
4459
prebuilt->n_template)) {
4460
err = DB_TOO_BIG_RECORD;
4462
goto lock_wait_or_error;
4466
if (prebuilt->clust_index_was_generated) {
4467
if (result_rec != rec) {
4468
offsets = rec_get_offsets(
4469
rec, index, offsets, ULINT_UNDEFINED,
4472
row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4477
/* From this point on, 'offsets' are invalid. */
4480
/* We have an optimization to save CPU time: if this is a consistent
4481
read on a unique condition on the clustered index, then we do not
4482
store the pcur position, because any fetch next or prev will anyway
4483
return 'end of file'. Exceptions are locking reads and the MySQL
4484
HANDLER command where the user can move the cursor with PREV or NEXT
4485
even after a unique search. */
4488
if (!unique_search_from_clust_index
4489
|| prebuilt->select_lock_type != LOCK_NONE
4490
|| prebuilt->used_in_HANDLER) {
4492
/* Inside an update always store the cursor position */
4494
btr_pcur_store_position(pcur, &mtr);
4502
/* Reset the old and new "did semi-consistent read" flags. */
4503
if (UNIV_UNLIKELY(prebuilt->row_read_type
4504
== ROW_READ_DID_SEMI_CONSISTENT)) {
4505
prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4507
did_semi_consistent_read = FALSE;
4509
if (UNIV_UNLIKELY(srv_locks_unsafe_for_binlog
4510
|| trx->isolation_level == TRX_ISO_READ_COMMITTED)
4511
&& prebuilt->select_lock_type != LOCK_NONE) {
4513
trx_reset_new_rec_lock_info(trx);
4516
/*-------------------------------------------------------------*/
4517
/* PHASE 5: Move the cursor to the next index record */
4519
if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4520
/* We must commit mtr if we are moving to the next
4521
non-clustered index record, because we could break the
4522
latching order if we would access a different clustered
4523
index page right away without releasing the previous. */
4525
btr_pcur_store_position(pcur, &mtr);
4528
mtr_has_extra_clust_latch = FALSE;
4531
if (sel_restore_position_for_mysql(&same_user_rec,
4533
pcur, moves_up, &mtr)) {
4534
#ifdef UNIV_SEARCH_DEBUG
4536
#endif /* UNIV_SEARCH_DEBUG */
4543
if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4545
btr_pcur_store_position(pcur, &mtr);
4547
if (match_mode != 0) {
4548
err = DB_RECORD_NOT_FOUND;
4550
err = DB_END_OF_INDEX;
4556
if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4561
#ifdef UNIV_SEARCH_DEBUG
4563
#endif /* UNIV_SEARCH_DEBUG */
lock_wait_or_error:
	/* Reset the old and new "did semi-consistent read" flags. */
	if (UNIV_UNLIKELY(prebuilt->row_read_type
			  == ROW_READ_DID_SEMI_CONSISTENT)) {
		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
	}
	did_semi_consistent_read = FALSE;

	/*-------------------------------------------------------------*/

	btr_pcur_store_position(pcur, &mtr);

	mtr_commit(&mtr);
	mtr_has_extra_clust_latch = FALSE;

	trx->error_state = err;

	/* The following is a patch for MySQL */

	que_thr_stop_for_mysql(thr);

	thr->lock_state = QUE_THR_LOCK_ROW;

	if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
		/* It was a lock wait, and it ended */

		thr->lock_state = QUE_THR_LOCK_NOLOCK;
		mtr_start(&mtr);

		sel_restore_position_for_mysql(&same_user_rec,
					       BTR_SEARCH_LEAF, pcur,
					       moves_up, &mtr);

		if ((srv_locks_unsafe_for_binlog
		     || trx->isolation_level == TRX_ISO_READ_COMMITTED)
		    && !same_user_rec) {

			/* Since we were not able to restore the cursor
			on the same user record, we cannot use
			row_unlock_for_mysql() to unlock any records, and
			we must thus reset the new rec lock info. Since
			in lock0lock.c we have blocked the inheriting of gap
			X-locks, we actually do not have any new record locks
			set in this case.

			Note that if we were able to restore on the 'same'
			user record, it is still possible that we were actually
			waiting on a delete-marked record, and meanwhile
			it was removed by purge and inserted again by some
			other user. But that is no problem, because in
			rec_loop we will again try to set a lock, and
			new_rec_lock_info in trx will be right at the end. */

			trx_reset_new_rec_lock_info(trx);
		}

		mode = pcur->search_mode;

		goto rec_loop;
	}

	thr->lock_state = QUE_THR_LOCK_NOLOCK;

#ifdef UNIV_SEARCH_DEBUG
	/* fputs("Using ", stderr);
	dict_index_name_print(stderr, index);
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
#endif /* UNIV_SEARCH_DEBUG */
	goto func_exit;
normal_return:
	/*-------------------------------------------------------------*/
	que_thr_stop_for_mysql_no_error(thr, trx);

	mtr_commit(&mtr);

	if (prebuilt->n_fetch_cached > 0) {
		row_sel_pop_cached_row_for_mysql(buf, prebuilt);

		err = DB_SUCCESS;
	}

#ifdef UNIV_SEARCH_DEBUG
	/* fputs("Using ", stderr);
	dict_index_name_print(stderr, index);
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
#endif /* UNIV_SEARCH_DEBUG */
	if (err == DB_SUCCESS) {
		srv_n_rows_read++;
	}

func_exit:
	trx->op_info = "";
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	/* Set or reset the "did semi-consistent read" flag on return.
	The flag did_semi_consistent_read is set if and only if
	the record being returned was fetched with a semi-consistent read. */
	ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
	      || !did_semi_consistent_read);

	if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
		if (UNIV_UNLIKELY(did_semi_consistent_read)) {
			prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
		} else {
			prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
		}
	}

	return(err);
}
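
/************************************************************************
Illustrative sketch only: one way a caller can drive row_search_for_mysql()
in a fetch loop. This is not code from the MySQL handler layer; the helper
name, the choice of PAGE_CUR_GE / ROW_SEL_EXACT_PREFIX, and the assumption
that the search tuple in prebuilt and the MySQL-format row buffer have
already been set up are assumptions made for the example. */
static
ulint
row_search_fetch_loop_sketch(
/*=========================*/
					/* out: DB_SUCCESS or error code */
	byte*		buf,		/* in/out: buffer for one row in
					MySQL format, sized by the caller */
	row_prebuilt_t*	prebuilt)	/* in/out: prebuilt struct with the
					search tuple already built */
{
	ulint	err;

	/* Position on the first record matching the search tuple. */
	err = row_search_for_mysql(buf, PAGE_CUR_GE, prebuilt,
				   ROW_SEL_EXACT_PREFIX, 0);

	while (err == DB_SUCCESS) {
		/* ... hand the row now stored in buf to the caller ... */

		/* Fetch the next record in ascending order. */
		err = row_search_for_mysql(buf, 0, prebuilt,
					   0, ROW_SEL_NEXT);
	}

	/* Both of these mean that the scan is exhausted rather than
	that a real error occurred. */
	if (err == DB_RECORD_NOT_FOUND || err == DB_END_OF_INDEX) {

		err = DB_SUCCESS;
	}

	return(err);
}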
/***********************************************************************
Checks if MySQL at the moment is allowed for this table to retrieve a
consistent read result, or store it to the query cache. */

ibool
row_search_check_if_query_cache_permitted(
/*======================================*/
					/* out: TRUE if storing or retrieving
					from the query cache is permitted */
	trx_t*		trx,		/* in: transaction object */
	const char*	norm_name)	/* in: concatenation of database name,
					'/' char, table name */
{
	dict_table_t*	table;
	ibool		ret	= FALSE;

	table = dict_table_get(norm_name, FALSE);

	if (table == NULL) {

		return(FALSE);
	}

	mutex_enter(&kernel_mutex);

	/* Start the transaction if it is not started yet */

	trx_start_if_not_started_low(trx);

	/* If there are locks on the table or some trx has invalidated the
	cache up to our trx id, then ret = FALSE.
	We do not check what type locks there are on the table, though only
	IX type locks actually would require ret = FALSE. */

	if (UT_LIST_GET_LEN(table->locks) == 0
	    && ut_dulint_cmp(trx->id,
			     table->query_cache_inv_trx_id) >= 0) {

		ret = TRUE;

		/* If the isolation level is high, assign a read view for the
		transaction if it does not yet have one */

		if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
		    && !trx->read_view) {

			trx->read_view = read_view_open_now(
				trx->id, trx->global_read_view_heap);
			trx->global_read_view = trx->read_view;
		}
	}

	mutex_exit(&kernel_mutex);

	return(ret);
}
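
/***********************************************************************
Illustrative sketch only: building the norm_name argument expected by the
check above ("dbname" + '/' + "tablename") and consulting the check. The
helper name and the caller-provided scratch buffer are assumptions made
for this example; the real caller lives in the MySQL handler layer. */
static
ibool
row_search_query_cache_probe_sketch(
/*================================*/
					/* out: TRUE if the query cache may
					serve or store a result */
	trx_t*		trx,		/* in: transaction of the query */
	const char*	db,		/* in: database name */
	const char*	name,		/* in: table name */
	char*		buf,		/* in/out: scratch buffer */
	ulint		buf_size)	/* in: size of buf in bytes */
{
	ulint	db_len = strlen(db);
	ulint	name_len = strlen(name);

	if (db_len + name_len + 2 > buf_size) {

		return(FALSE);
	}

	/* The table is identified as "dbname" + '/' + "tablename". */
	memcpy(buf, db, db_len);
	buf[db_len] = '/';
	memcpy(buf + db_len + 1, name, name_len);
	buf[db_len + 1 + name_len] = '\0';

	return(row_search_check_if_query_cache_permitted(trx, buf));
}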
/***********************************************************************
Read the AUTOINC column from the current row. If the value is less than
0 and the type is not unsigned, then we reset the value to 0. */
static
ib_uint64_t
row_search_autoinc_read_column(
/*===========================*/
					/* out: value read from the column */
	dict_index_t*	index,		/* in: index to read from */
	const rec_t*	rec,		/* in: current rec */
	ulint		col_no,		/* in: column number */
	ibool		unsigned_type)	/* in: signed or unsigned flag */
{
	ulint		len;
	const byte*	data;
	ib_uint64_t	value;
	mem_heap_t*	heap = NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;

	rec_offs_init(offsets_);

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	data = rec_get_nth_field(rec, offsets, col_no, &len);

	ut_a(len != UNIV_SQL_NULL);
	ut_a(len <= sizeof value);

	/* We assume the AUTOINC value cannot be negative. */
	value = mach_read_int_type(data, len, unsigned_type);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	if (!unsigned_type && (ib_int64_t) value < 0) {
		value = 0;
	}

	return(value);
}
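
/***********************************************************************
Minimal sketch of decoding an integer column as stored by InnoDB, to make
the call to mach_read_int_type() above easier to follow. Assumptions for
this illustration (it is not the body of mach_read_int_type()): the column
bytes are big-endian, and for signed columns the most significant (sign)
bit is stored inverted so that the byte string sorts in numeric order. */
static
ib_uint64_t
row_search_decode_int_sketch(
/*=========================*/
					/* out: decoded integer value */
	const byte*	data,		/* in: column data */
	ulint		len,		/* in: length in bytes, 1..8 */
	ibool		unsigned_type)	/* in: TRUE if the column type
					is unsigned */
{
	ib_uint64_t	value = 0;
	ulint		i;

	for (i = 0; i < len; i++) {
		byte	b = data[i];

		if (i == 0 && !unsigned_type) {
			/* Undo the sign bit flip in the first byte. */
			b ^= 0x80;
		}

		value = (value << 8) | b;
	}

	if (!unsigned_type && len < 8 && !(data[0] & 0x80)) {
		/* The stored sign bit was 0, i.e. the original value was
		negative: sign-extend to 64 bits. */
		value |= ~((((ib_uint64_t) 1) << (len * 8)) - 1);
	}

	return(value);
}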
/***********************************************************************
Get the last row. */
static
const rec_t*
row_search_autoinc_get_rec(
/*=======================*/
					/* out: current rec or NULL */
	btr_pcur_t*	pcur,		/* in: the current cursor */
	mtr_t*		mtr)		/* in: mini transaction */
{
	do {
		const rec_t* rec = btr_pcur_get_rec(pcur);

		/* Skip the infimum/supremum pseudo-records and stop at
		the first user record found. */
		if (page_rec_is_user_rec(rec)) {
			return(rec);
		}
	} while (btr_pcur_move_to_prev(pcur, mtr));

	return(NULL);
}
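
/***********************************************************************
Illustrative sketch only: fetching the last user record of an index with
the helper above. A cursor opened at the right-hand end of the index may
be positioned on the page supremum, which is why row_search_autoinc_get_rec()
walks backwards until it reaches a user record. The helper name below is an
assumption for this example; the returned rec is only valid while the caller
keeps mtr active. */
static
const rec_t*
row_search_last_user_rec_sketch(
/*============================*/
					/* out: last user record, or NULL
					if the index is empty */
	dict_index_t*	index,		/* in: index to read from */
	mtr_t*		mtr)		/* in: mini-transaction owned by
					the caller */
{
	btr_pcur_t	pcur;
	const rec_t*	rec = NULL;

	/* Open at the high/right end (FALSE), and INIT cursor (TRUE) */
	btr_pcur_open_at_index_side(
		FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, mtr);

	if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {

		rec = row_search_autoinc_get_rec(&pcur, mtr);
	}

	btr_pcur_close(&pcur);

	return(rec);
}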
/***********************************************************************
Read the max AUTOINC value from an index. */
ulint
row_search_max_autoinc(
/*===================*/
					/* out: DB_SUCCESS if all OK else
					error code, DB_RECORD_NOT_FOUND if
					column name can't be found in index */
	dict_index_t*	index,		/* in: index to search */
	const char*	col_name,	/* in: name of autoinc column */
	ib_uint64_t*	value)		/* out: AUTOINC value read */
{
	ulint		i;
	ulint		n_cols;
	dict_field_t*	dfield = NULL;
	ulint		error = DB_SUCCESS;

	n_cols = dict_index_get_n_ordering_defined_by_user(index);

	/* Search the index for the AUTOINC column name */
	for (i = 0; i < n_cols; ++i) {
		dfield = dict_index_get_nth_field(index, i);

		if (strcmp(col_name, dfield->name) == 0) {
			break;
		}
	}

	*value = 0;

	/* Must find the AUTOINC column name */
	if (i < n_cols && dfield) {
		mtr_t		mtr;
		btr_pcur_t	pcur;

		mtr_start(&mtr);

		/* Open at the high/right end (FALSE), and INIT
		cursor (TRUE) */
		btr_pcur_open_at_index_side(
			FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);

		if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
			const rec_t*	rec;

			rec = row_search_autoinc_get_rec(&pcur, &mtr);

			if (rec != NULL) {
				ibool	unsigned_type = (
					dfield->col->prtype & DATA_UNSIGNED);

				*value = row_search_autoinc_read_column(
					index, rec, i, unsigned_type);
			}
		}

		btr_pcur_close(&pcur);

		mtr_commit(&mtr);
	} else {
		error = DB_RECORD_NOT_FOUND;