~drizzle-trunk/drizzle/development

1 by brian
clean slate
1
/*******************************************************
Select

(c) 1997 Innobase Oy

Created 12/19/1997 Heikki Tuuri
*******************************************************/
8
9
#include "row0sel.h"
10
11
#ifdef UNIV_NONINL
12
#include "row0sel.ic"
13
#endif
14
15
#include "dict0dict.h"
16
#include "dict0boot.h"
17
#include "trx0undo.h"
18
#include "trx0trx.h"
19
#include "btr0btr.h"
20
#include "btr0cur.h"
21
#include "btr0sea.h"
22
#include "mach0data.h"
23
#include "que0que.h"
24
#include "row0upd.h"
25
#include "row0row.h"
26
#include "row0vers.h"
27
#include "rem0cmp.h"
28
#include "lock0lock.h"
29
#include "eval0eval.h"
30
#include "pars0sym.h"
31
#include "pars0pars.h"
32
#include "row0mysql.h"
33
#include "read0read.h"
34
#include "buf0lru.h"
35
36
/* Upper limit for the number of rows that are prefetched; the MySQL
interface uses another parameter of its own */
#define SEL_MAX_N_PREFETCH	16

/* The number of fetched rows after which prefetching is started; the MySQL
interface uses another parameter of its own */
#define SEL_PREFETCH_LIMIT	1

/* After a select has accessed approximately this many pages it gives
control back to que_run_threads, so that a runaway query can be canceled */
#define SEL_COST_LIMIT		100

/* Result codes of the search shortcut */
#define SEL_FOUND		0
#define SEL_EXHAUSTED		1
#define SEL_RETRY		2
52
53
/************************************************************************
54
Returns TRUE if the user-defined column values in a secondary index record
55
are alphabetically the same as the corresponding columns in the clustered
56
index record.
57
NOTE: the comparison is NOT done as a binary comparison, but character
58
fields are compared with collation! */
59
static
60
ibool
61
row_sel_sec_rec_is_for_clust_rec(
62
/*=============================*/
63
					/* out: TRUE if the secondary
64
					record is equal to the corresponding
65
					fields in the clustered record,
66
					when compared with collation */
67
	rec_t*		sec_rec,	/* in: secondary index record */
68
	dict_index_t*	sec_index,	/* in: secondary index */
69
	rec_t*		clust_rec,	/* in: clustered index record */
70
	dict_index_t*	clust_index)	/* in: clustered index */
71
{
72
	byte*		sec_field;
73
	ulint		sec_len;
74
	byte*		clust_field;
75
	ulint		clust_len;
76
	ulint		n;
77
	ulint		i;
78
	mem_heap_t*	heap		= NULL;
79
	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
80
	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
81
	ulint*		clust_offs	= clust_offsets_;
82
	ulint*		sec_offs	= sec_offsets_;
83
	ibool		is_equal	= TRUE;
84
85
	*clust_offsets_ = (sizeof clust_offsets_) / sizeof *clust_offsets_;
86
	*sec_offsets_ = (sizeof sec_offsets_) / sizeof *sec_offsets_;
87
88
	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
89
				     ULINT_UNDEFINED, &heap);
90
	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
91
				   ULINT_UNDEFINED, &heap);
92
93
	n = dict_index_get_n_ordering_defined_by_user(sec_index);
94
95
	for (i = 0; i < n; i++) {
96
		const dict_field_t*	ifield;
97
		const dict_col_t*	col;
98
99
		ifield = dict_index_get_nth_field(sec_index, i);
100
		col = dict_field_get_col(ifield);
101
102
		clust_field = rec_get_nth_field(
103
			clust_rec, clust_offs,
104
			dict_col_get_clust_pos(col, clust_index), &clust_len);
105
		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
106
107
		if (ifield->prefix_len > 0 && clust_len != UNIV_SQL_NULL) {
108
109
			clust_len = dtype_get_at_most_n_mbchars(
110
				col->prtype, col->mbminlen, col->mbmaxlen,
111
				ifield->prefix_len,
112
				clust_len, (char*) clust_field);
113
		}
114
115
		if (0 != cmp_data_data(col->mtype, col->prtype,
116
				       clust_field, clust_len,
117
				       sec_field, sec_len)) {
118
			is_equal = FALSE;
119
			goto func_exit;
120
		}
121
	}
122
123
func_exit:
124
	if (UNIV_LIKELY_NULL(heap)) {
125
		mem_heap_free(heap);
126
	}
127
	return(is_equal);
128
}
129
130
/*************************************************************************
131
Creates a select node struct. */
132
133
sel_node_t*
134
sel_node_create(
135
/*============*/
136
				/* out, own: select node struct */
137
	mem_heap_t*	heap)	/* in: memory heap where created */
138
{
139
	sel_node_t*	node;
140
141
	node = mem_heap_alloc(heap, sizeof(sel_node_t));
142
	node->common.type = QUE_NODE_SELECT;
143
	node->state = SEL_NODE_OPEN;
144
145
	node->select_will_do_update = FALSE;
146
	node->latch_mode = BTR_SEARCH_LEAF;
147
148
	node->plans = NULL;
149
150
	return(node);
151
}
152
153
/*************************************************************************
154
Frees the memory private to a select node when a query graph is freed,
155
does not free the heap where the node was originally created. */
156
157
void
158
sel_node_free_private(
159
/*==================*/
160
	sel_node_t*	node)	/* in: select node struct */
161
{
162
	ulint	i;
163
	plan_t*	plan;
164
165
	if (node->plans != NULL) {
166
		for (i = 0; i < node->n_tables; i++) {
167
			plan = sel_node_get_nth_plan(node, i);
168
169
			btr_pcur_close(&(plan->pcur));
170
			btr_pcur_close(&(plan->clust_pcur));
171
172
			if (plan->old_vers_heap) {
173
				mem_heap_free(plan->old_vers_heap);
174
			}
175
		}
176
	}
177
}
178
179
/*************************************************************************
180
Evaluates the values in a select list. If there are aggregate functions,
181
their argument value is added to the aggregate total. */
182
UNIV_INLINE
183
void
184
sel_eval_select_list(
185
/*=================*/
186
	sel_node_t*	node)	/* in: select node */
187
{
188
	que_node_t*	exp;
189
190
	exp = node->select_list;
191
192
	while (exp) {
193
		eval_exp(exp);
194
195
		exp = que_node_get_next(exp);
196
	}
197
}
198
199
/*************************************************************************
200
Assigns the values in the select list to the possible into-variables in
201
SELECT ... INTO ... */
202
UNIV_INLINE
203
void
204
sel_assign_into_var_values(
205
/*=======================*/
206
	sym_node_t*	var,	/* in: first variable in a list of variables */
207
	sel_node_t*	node)	/* in: select node */
208
{
209
	que_node_t*	exp;
210
211
	if (var == NULL) {
212
213
		return;
214
	}
215
216
	exp = node->select_list;
217
218
	while (var) {
219
		ut_ad(exp);
220
221
		eval_node_copy_val(var->alias, exp);
222
223
		exp = que_node_get_next(exp);
224
		var = que_node_get_next(var);
225
	}
226
}
227
228
/*************************************************************************
229
Resets the aggregate value totals in the select list of an aggregate type
230
query. */
231
UNIV_INLINE
232
void
233
sel_reset_aggregate_vals(
234
/*=====================*/
235
	sel_node_t*	node)	/* in: select node */
236
{
237
	func_node_t*	func_node;
238
239
	ut_ad(node->is_aggregate);
240
241
	func_node = node->select_list;
242
243
	while (func_node) {
244
		eval_node_set_int_val(func_node, 0);
245
246
		func_node = que_node_get_next(func_node);
247
	}
248
249
	node->aggregate_already_fetched = FALSE;
250
}
251
252
/*************************************************************************
253
Copies the input variable values when an explicit cursor is opened. */
254
UNIV_INLINE
255
void
256
row_sel_copy_input_variable_vals(
257
/*=============================*/
258
	sel_node_t*	node)	/* in: select node */
259
{
260
	sym_node_t*	var;
261
262
	var = UT_LIST_GET_FIRST(node->copy_variables);
263
264
	while (var) {
265
		eval_node_copy_val(var, var->alias);
266
267
		var->indirection = NULL;
268
269
		var = UT_LIST_GET_NEXT(col_var_list, var);
270
	}
271
}
272
273
/*************************************************************************
274
Fetches the column values from a record. */
275
static
276
void
277
row_sel_fetch_columns(
278
/*==================*/
279
	dict_index_t*	index,	/* in: record index */
280
	rec_t*		rec,	/* in: record in a clustered or non-clustered
281
				index */
282
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
283
	sym_node_t*	column)	/* in: first column in a column list, or
284
				NULL */
285
{
286
	dfield_t*	val;
287
	ulint		index_type;
288
	ulint		field_no;
289
	byte*		data;
290
	ulint		len;
291
292
	ut_ad(rec_offs_validate(rec, index, offsets));
293
294
	if (index->type & DICT_CLUSTERED) {
295
		index_type = SYM_CLUST_FIELD_NO;
296
	} else {
297
		index_type = SYM_SEC_FIELD_NO;
298
	}
299
300
	while (column) {
301
		mem_heap_t*	heap = NULL;
302
		ibool		needs_copy;
303
304
		field_no = column->field_nos[index_type];
305
306
		if (field_no != ULINT_UNDEFINED) {
307
308
			if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
309
							      field_no))) {
310
311
				/* Copy an externally stored field to the
312
				temporary heap */
313
314
				heap = mem_heap_create(1);
315
316
				data = btr_rec_copy_externally_stored_field(
317
					rec, offsets, field_no, &len, heap);
318
319
				ut_a(len != UNIV_SQL_NULL);
320
321
				needs_copy = TRUE;
322
			} else {
323
				data = rec_get_nth_field(rec, offsets,
324
							 field_no, &len);
325
326
				needs_copy = column->copy_val;
327
			}
328
329
			if (needs_copy) {
330
				eval_node_copy_and_alloc_val(column, data,
331
							     len);
332
			} else {
333
				val = que_node_get_val(column);
334
				dfield_set_data(val, data, len);
335
			}
336
337
			if (UNIV_LIKELY_NULL(heap)) {
338
				mem_heap_free(heap);
339
			}
340
		}
341
342
		column = UT_LIST_GET_NEXT(col_var_list, column);
343
	}
344
}
345
346
/*************************************************************************
347
Allocates a prefetch buffer for a column when prefetch is first time done. */
348
static
349
void
350
sel_col_prefetch_buf_alloc(
351
/*=======================*/
352
	sym_node_t*	column)	/* in: symbol table node for a column */
353
{
354
	sel_buf_t*	sel_buf;
355
	ulint		i;
356
357
	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
358
359
	column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
360
					 * sizeof(sel_buf_t));
361
	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
362
		sel_buf = column->prefetch_buf + i;
363
364
		sel_buf->data = NULL;
365
366
		sel_buf->val_buf_size = 0;
367
	}
368
}
369
370
/*************************************************************************
371
Frees a prefetch buffer for a column, including the dynamically allocated
372
memory for data stored there. */
373
374
void
375
sel_col_prefetch_buf_free(
376
/*======================*/
377
	sel_buf_t*	prefetch_buf)	/* in, own: prefetch buffer */
378
{
379
	sel_buf_t*	sel_buf;
380
	ulint		i;
381
382
	for (i = 0; i < SEL_MAX_N_PREFETCH; i++) {
383
		sel_buf = prefetch_buf + i;
384
385
		if (sel_buf->val_buf_size > 0) {
386
387
			mem_free(sel_buf->data);
388
		}
389
	}
390
}
391
392
/*************************************************************************
393
Pops the column values for a prefetched, cached row from the column prefetch
394
buffers and places them to the val fields in the column nodes. */
395
static
396
void
397
sel_pop_prefetched_row(
398
/*===================*/
399
	plan_t*	plan)	/* in: plan node for a table */
400
{
401
	sym_node_t*	column;
402
	sel_buf_t*	sel_buf;
403
	dfield_t*	val;
404
	byte*		data;
405
	ulint		len;
406
	ulint		val_buf_size;
407
408
	ut_ad(plan->n_rows_prefetched > 0);
409
410
	column = UT_LIST_GET_FIRST(plan->columns);
411
412
	while (column) {
413
		val = que_node_get_val(column);
414
415
		if (!column->copy_val) {
416
			/* We did not really push any value for the
417
			column */
418
419
			ut_ad(!column->prefetch_buf);
420
			ut_ad(que_node_get_val_buf_size(column) == 0);
421
#ifdef UNIV_DEBUG
422
			dfield_set_data(val, NULL, 0);
423
#endif
424
			goto next_col;
425
		}
426
427
		ut_ad(column->prefetch_buf);
428
429
		sel_buf = column->prefetch_buf + plan->first_prefetched;
430
431
		data = sel_buf->data;
432
		len = sel_buf->len;
433
		val_buf_size = sel_buf->val_buf_size;
434
435
		/* We must keep track of the allocated memory for
436
		column values to be able to free it later: therefore
437
		we swap the values for sel_buf and val */
438
439
		sel_buf->data = dfield_get_data(val);
440
		sel_buf->len = dfield_get_len(val);
441
		sel_buf->val_buf_size = que_node_get_val_buf_size(column);
442
443
		dfield_set_data(val, data, len);
444
		que_node_set_val_buf_size(column, val_buf_size);
445
next_col:
446
		column = UT_LIST_GET_NEXT(col_var_list, column);
447
	}
448
449
	plan->n_rows_prefetched--;
450
451
	plan->first_prefetched++;
452
}
453
454
/*************************************************************************
455
Pushes the column values for a prefetched, cached row to the column prefetch
456
buffers from the val fields in the column nodes. */
457
UNIV_INLINE
458
void
459
sel_push_prefetched_row(
460
/*====================*/
461
	plan_t*	plan)	/* in: plan node for a table */
462
{
463
	sym_node_t*	column;
464
	sel_buf_t*	sel_buf;
465
	dfield_t*	val;
466
	byte*		data;
467
	ulint		len;
468
	ulint		pos;
469
	ulint		val_buf_size;
470
471
	if (plan->n_rows_prefetched == 0) {
472
		pos = 0;
473
		plan->first_prefetched = 0;
474
	} else {
475
		pos = plan->n_rows_prefetched;
476
477
		/* We have the convention that pushing new rows starts only
478
		after the prefetch stack has been emptied: */
479
480
		ut_ad(plan->first_prefetched == 0);
481
	}
482
483
	plan->n_rows_prefetched++;
484
485
	ut_ad(pos < SEL_MAX_N_PREFETCH);
486
487
	column = UT_LIST_GET_FIRST(plan->columns);
488
489
	while (column) {
490
		if (!column->copy_val) {
491
			/* There is no sense to push pointers to database
492
			page fields when we do not keep latch on the page! */
493
494
			goto next_col;
495
		}
496
497
		if (!column->prefetch_buf) {
498
			/* Allocate a new prefetch buffer */
499
500
			sel_col_prefetch_buf_alloc(column);
501
		}
502
503
		sel_buf = column->prefetch_buf + pos;
504
505
		val = que_node_get_val(column);
506
507
		data = dfield_get_data(val);
508
		len = dfield_get_len(val);
509
		val_buf_size = que_node_get_val_buf_size(column);
510
511
		/* We must keep track of the allocated memory for
512
		column values to be able to free it later: therefore
513
		we swap the values for sel_buf and val */
514
515
		dfield_set_data(val, sel_buf->data, sel_buf->len);
516
		que_node_set_val_buf_size(column, sel_buf->val_buf_size);
517
518
		sel_buf->data = data;
519
		sel_buf->len = len;
520
		sel_buf->val_buf_size = val_buf_size;
521
next_col:
522
		column = UT_LIST_GET_NEXT(col_var_list, column);
523
	}
524
}
525
526
/*************************************************************************
527
Builds a previous version of a clustered index record for a consistent read */
528
static
529
ulint
530
row_sel_build_prev_vers(
531
/*====================*/
532
					/* out: DB_SUCCESS or error code */
533
	read_view_t*	read_view,	/* in: read view */
534
	dict_index_t*	index,		/* in: plan node for table */
535
	rec_t*		rec,		/* in: record in a clustered index */
536
	ulint**		offsets,	/* in/out: offsets returned by
537
					rec_get_offsets(rec, plan->index) */
538
	mem_heap_t**	offset_heap,	/* in/out: memory heap from which
539
					the offsets are allocated */
540
	mem_heap_t**    old_vers_heap,  /* out: old version heap to use */
541
	rec_t**		old_vers,	/* out: old version, or NULL if the
542
					record does not exist in the view:
543
					i.e., it was freshly inserted
544
					afterwards */
545
	mtr_t*		mtr)		/* in: mtr */
546
{
547
	ulint	err;
548
549
	if (*old_vers_heap) {
550
		mem_heap_empty(*old_vers_heap);
551
	} else {
552
		*old_vers_heap = mem_heap_create(512);
553
	}
554
555
	err = row_vers_build_for_consistent_read(
556
		rec, mtr, index, offsets, read_view, offset_heap,
557
		*old_vers_heap, old_vers);
558
	return(err);
559
}
560
561
/*************************************************************************
562
Builds the last committed version of a clustered index record for a
563
semi-consistent read. */
564
static
565
ulint
566
row_sel_build_committed_vers_for_mysql(
567
/*===================================*/
568
					/* out: DB_SUCCESS or error code */
569
	dict_index_t*	clust_index,	/* in: clustered index */
570
	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
571
	rec_t*		rec,		/* in: record in a clustered index */
572
	ulint**		offsets,	/* in/out: offsets returned by
573
					rec_get_offsets(rec, clust_index) */
574
	mem_heap_t**	offset_heap,	/* in/out: memory heap from which
575
					the offsets are allocated */
576
	rec_t**		old_vers,	/* out: old version, or NULL if the
577
					record does not exist in the view:
578
					i.e., it was freshly inserted
579
					afterwards */
580
	mtr_t*		mtr)		/* in: mtr */
581
{
582
	ulint	err;
583
584
	if (prebuilt->old_vers_heap) {
585
		mem_heap_empty(prebuilt->old_vers_heap);
586
	} else {
587
		prebuilt->old_vers_heap = mem_heap_create(200);
588
	}
589
590
	err = row_vers_build_for_semi_consistent_read(
591
		rec, mtr, clust_index, offsets, offset_heap,
592
		prebuilt->old_vers_heap, old_vers);
593
	return(err);
594
}
595
596
/*************************************************************************
597
Tests the conditions which determine when the index segment we are searching
598
through has been exhausted. */
599
UNIV_INLINE
600
ibool
601
row_sel_test_end_conds(
602
/*===================*/
603
			/* out: TRUE if row passed the tests */
604
	plan_t*	plan)	/* in: plan for the table; the column values must
605
			already have been retrieved and the right sides of
606
			comparisons evaluated */
607
{
608
	func_node_t*	cond;
609
610
	/* All conditions in end_conds are comparisons of a column to an
611
	expression */
612
613
	cond = UT_LIST_GET_FIRST(plan->end_conds);
614
615
	while (cond) {
616
		/* Evaluate the left side of the comparison, i.e., get the
617
		column value if there is an indirection */
618
619
		eval_sym(cond->args);
620
621
		/* Do the comparison */
622
623
		if (!eval_cmp(cond)) {
624
625
			return(FALSE);
626
		}
627
628
		cond = UT_LIST_GET_NEXT(cond_list, cond);
629
	}
630
631
	return(TRUE);
632
}
633
634
/*************************************************************************
635
Tests the other conditions. */
636
UNIV_INLINE
637
ibool
638
row_sel_test_other_conds(
639
/*=====================*/
640
			/* out: TRUE if row passed the tests */
641
	plan_t*	plan)	/* in: plan for the table; the column values must
642
			already have been retrieved */
643
{
644
	func_node_t*	cond;
645
646
	cond = UT_LIST_GET_FIRST(plan->other_conds);
647
648
	while (cond) {
649
		eval_exp(cond);
650
651
		if (!eval_node_get_ibool_val(cond)) {
652
653
			return(FALSE);
654
		}
655
656
		cond = UT_LIST_GET_NEXT(cond_list, cond);
657
	}
658
659
	return(TRUE);
660
}
661
662
/*************************************************************************
663
Retrieves the clustered index record corresponding to a record in a
664
non-clustered index. Does the necessary locking. */
665
static
666
ulint
667
row_sel_get_clust_rec(
668
/*==================*/
669
				/* out: DB_SUCCESS or error code */
670
	sel_node_t*	node,	/* in: select_node */
671
	plan_t*		plan,	/* in: plan node for table */
672
	rec_t*		rec,	/* in: record in a non-clustered index */
673
	que_thr_t*	thr,	/* in: query thread */
674
	rec_t**		out_rec,/* out: clustered record or an old version of
675
				it, NULL if the old version did not exist
676
				in the read view, i.e., it was a fresh
677
				inserted version */
678
	mtr_t*		mtr)	/* in: mtr used to get access to the
679
				non-clustered record; the same mtr is used to
680
				access the clustered index */
681
{
682
	dict_index_t*	index;
683
	rec_t*		clust_rec;
684
	rec_t*		old_vers;
685
	ulint		err;
686
	mem_heap_t*	heap		= NULL;
687
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
688
	ulint*		offsets		= offsets_;
689
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
690
691
	*out_rec = NULL;
692
693
	offsets = rec_get_offsets(rec,
694
				  btr_pcur_get_btr_cur(&plan->pcur)->index,
695
				  offsets, ULINT_UNDEFINED, &heap);
696
697
	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
698
699
	index = dict_table_get_first_index(plan->table);
700
701
	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
702
				   node->latch_mode, &(plan->clust_pcur),
703
				   0, mtr);
704
705
	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
706
707
	/* Note: only if the search ends up on a non-infimum record is the
708
	low_match value the real match to the search tuple */
709
710
	if (!page_rec_is_user_rec(clust_rec)
711
	    || btr_pcur_get_low_match(&(plan->clust_pcur))
712
	    < dict_index_get_n_unique(index)) {
713
714
		ut_a(rec_get_deleted_flag(rec,
715
					  dict_table_is_comp(plan->table)));
716
		ut_a(node->read_view);
717
718
		/* In a rare case it is possible that no clust rec is found
719
		for a delete-marked secondary index record: if in row0umod.c
720
		in row_undo_mod_remove_clust_low() we have already removed
721
		the clust rec, while purge is still cleaning and removing
722
		secondary index records associated with earlier versions of
723
		the clustered index record. In that case we know that the
724
		clustered index record did not exist in the read view of
725
		trx. */
726
727
		goto func_exit;
728
	}
729
730
	offsets = rec_get_offsets(clust_rec, index, offsets,
731
				  ULINT_UNDEFINED, &heap);
732
733
	if (!node->read_view) {
734
		/* Try to place a lock on the index record */
735
736
		/* If innodb_locks_unsafe_for_binlog option is used
737
		or this session is using READ COMMITTED isolation level
738
		we lock only the record, i.e., next-key locking is
739
		not used. */
740
		ulint	lock_type;
741
		trx_t*	trx;
742
743
		trx = thr_get_trx(thr);
744
745
		if (srv_locks_unsafe_for_binlog
746
		    || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
747
			lock_type = LOCK_REC_NOT_GAP;
748
		} else {
749
			lock_type = LOCK_ORDINARY;
750
		}
751
752
		err = lock_clust_rec_read_check_and_lock(
753
			0, clust_rec, index, offsets,
754
			node->row_lock_mode, lock_type, thr);
755
756
		if (err != DB_SUCCESS) {
757
758
			goto err_exit;
759
		}
760
	} else {
761
		/* This is a non-locking consistent read: if necessary, fetch
762
		a previous version of the record */
763
764
		old_vers = NULL;
765
766
		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
767
						   node->read_view)) {
768
769
			err = row_sel_build_prev_vers(
770
				node->read_view, index, clust_rec,
771
				&offsets, &heap, &plan->old_vers_heap,
772
				&old_vers, mtr);
773
774
			if (err != DB_SUCCESS) {
775
776
				goto err_exit;
777
			}
778
779
			clust_rec = old_vers;
780
781
			if (clust_rec == NULL) {
782
				goto func_exit;
783
			}
784
		}
785
786
		/* If we had to go to an earlier version of row or the
787
		secondary index record is delete marked, then it may be that
788
		the secondary index record corresponding to clust_rec
789
		(or old_vers) is not rec; in that case we must ignore
790
		such row because in our snapshot rec would not have existed.
791
		Remember that from rec we cannot see directly which transaction
792
		id corresponds to it: we have to go to the clustered index
793
		record. A query where we want to fetch all rows where
794
		the secondary index value is in some interval would return
795
		a wrong result if we would not drop rows which we come to
796
		visit through secondary index records that would not really
797
		exist in our snapshot. */
798
799
		if ((old_vers
800
		     || rec_get_deleted_flag(rec, dict_table_is_comp(
801
						     plan->table)))
802
		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
803
							 clust_rec, index)) {
804
			goto func_exit;
805
		}
806
	}
807
808
	/* Fetch the columns needed in test conditions */
809
810
	row_sel_fetch_columns(index, clust_rec, offsets,
811
			      UT_LIST_GET_FIRST(plan->columns));
812
	*out_rec = clust_rec;
813
func_exit:
814
	err = DB_SUCCESS;
815
err_exit:
816
	if (UNIV_LIKELY_NULL(heap)) {
817
		mem_heap_free(heap);
818
	}
819
	return(err);
820
}
821
822
/*************************************************************************
823
Sets a lock on a record. */
824
UNIV_INLINE
825
ulint
826
sel_set_rec_lock(
827
/*=============*/
828
				/* out: DB_SUCCESS or error code */
829
	rec_t*		rec,	/* in: record */
830
	dict_index_t*	index,	/* in: index */
831
	const ulint*	offsets,/* in: rec_get_offsets(rec, index) */
832
	ulint		mode,	/* in: lock mode */
833
	ulint		type,	/* in: LOCK_ORDINARY, LOCK_GAP, or
834
				LOC_REC_NOT_GAP */
835
	que_thr_t*	thr)	/* in: query thread */
836
{
837
	trx_t*	trx;
838
	ulint	err;
839
840
	trx = thr_get_trx(thr);
841
842
	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000) {
843
		if (buf_LRU_buf_pool_running_out()) {
844
845
			return(DB_LOCK_TABLE_FULL);
846
		}
847
	}
848
849
	if (index->type & DICT_CLUSTERED) {
850
		err = lock_clust_rec_read_check_and_lock(
851
			0, rec, index, offsets, mode, type, thr);
852
	} else {
853
		err = lock_sec_rec_read_check_and_lock(
854
			0, rec, index, offsets, mode, type, thr);
855
	}
856
857
	return(err);
858
}
859
860
/*************************************************************************
861
Opens a pcur to a table index. */
862
static
863
void
864
row_sel_open_pcur(
865
/*==============*/
866
	sel_node_t*	node,		/* in: select node */
867
	plan_t*		plan,		/* in: table plan */
868
	ibool		search_latch_locked,
869
					/* in: TRUE if the thread currently
870
					has the search latch locked in
871
					s-mode */
872
	mtr_t*		mtr)		/* in: mtr */
873
{
874
	dict_index_t*	index;
875
	func_node_t*	cond;
876
	que_node_t*	exp;
877
	ulint		n_fields;
878
	ulint		has_search_latch = 0;	/* RW_S_LATCH or 0 */
879
	ulint		i;
880
881
	if (search_latch_locked) {
882
		has_search_latch = RW_S_LATCH;
883
	}
884
885
	index = plan->index;
886
887
	/* Calculate the value of the search tuple: the exact match columns
888
	get their expressions evaluated when we evaluate the right sides of
889
	end_conds */
890
891
	cond = UT_LIST_GET_FIRST(plan->end_conds);
892
893
	while (cond) {
894
		eval_exp(que_node_get_next(cond->args));
895
896
		cond = UT_LIST_GET_NEXT(cond_list, cond);
897
	}
898
899
	if (plan->tuple) {
900
		n_fields = dtuple_get_n_fields(plan->tuple);
901
902
		if (plan->n_exact_match < n_fields) {
903
			/* There is a non-exact match field which must be
904
			evaluated separately */
905
906
			eval_exp(plan->tuple_exps[n_fields - 1]);
907
		}
908
909
		for (i = 0; i < n_fields; i++) {
910
			exp = plan->tuple_exps[i];
911
912
			dfield_copy_data(dtuple_get_nth_field(plan->tuple, i),
913
					 que_node_get_val(exp));
914
		}
915
916
		/* Open pcur to the index */
917
918
		btr_pcur_open_with_no_init(index, plan->tuple, plan->mode,
919
					   node->latch_mode, &(plan->pcur),
920
					   has_search_latch, mtr);
921
	} else {
922
		/* Open the cursor to the start or the end of the index
923
		(FALSE: no init) */
924
925
		btr_pcur_open_at_index_side(plan->asc, index, node->latch_mode,
926
					    &(plan->pcur), FALSE, mtr);
927
	}
928
929
	ut_ad(plan->n_rows_prefetched == 0);
930
	ut_ad(plan->n_rows_fetched == 0);
931
	ut_ad(plan->cursor_at_end == FALSE);
932
933
	plan->pcur_is_open = TRUE;
934
}
935
936
/*************************************************************************
937
Restores a stored pcur position to a table index. */
938
static
939
ibool
940
row_sel_restore_pcur_pos(
941
/*=====================*/
942
				/* out: TRUE if the cursor should be moved to
943
				the next record after we return from this
944
				function (moved to the previous, in the case
945
				of a descending cursor) without processing
946
				again the current cursor record */
947
	sel_node_t*	node,	/* in: select node */
948
	plan_t*		plan,	/* in: table plan */
949
	mtr_t*		mtr)	/* in: mtr */
950
{
951
	ibool	equal_position;
952
	ulint	relative_position;
953
954
	ut_ad(!plan->cursor_at_end);
955
956
	relative_position = btr_pcur_get_rel_pos(&(plan->pcur));
957
958
	equal_position = btr_pcur_restore_position(node->latch_mode,
959
						   &(plan->pcur), mtr);
960
961
	/* If the cursor is traveling upwards, and relative_position is
962
963
	(1) BTR_PCUR_BEFORE: this is not allowed, as we did not have a lock
964
	yet on the successor of the page infimum;
965
	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
966
	first record GREATER than the predecessor of a page supremum; we have
967
	not yet processed the cursor record: no need to move the cursor to the
968
	next record;
969
	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
970
	last record LESS or EQUAL to the old stored user record; (a) if
971
	equal_position is FALSE, this means that the cursor is now on a record
972
	less than the old user record, and we must move to the next record;
973
	(b) if equal_position is TRUE, then if
974
	plan->stored_cursor_rec_processed is TRUE, we must move to the next
975
	record, else there is no need to move the cursor. */
976
977
	if (plan->asc) {
978
		if (relative_position == BTR_PCUR_ON) {
979
980
			if (equal_position) {
981
982
				return(plan->stored_cursor_rec_processed);
983
			}
984
985
			return(TRUE);
986
		}
987
988
		ut_ad(relative_position == BTR_PCUR_AFTER
989
		      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
990
991
		return(FALSE);
992
	}
993
994
	/* If the cursor is traveling downwards, and relative_position is
995
996
	(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
997
	the last record LESS than the successor of a page infimum; we have not
998
	processed the cursor record: no need to move the cursor;
999
	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on the
1000
	first record GREATER than the predecessor of a page supremum; we have
1001
	processed the cursor record: we should move the cursor to the previous
1002
	record;
1003
	(3) BTR_PCUR_ON: btr_pcur_restore_position placed the cursor on the
1004
	last record LESS or EQUAL to the old stored user record; (a) if
1005
	equal_position is FALSE, this means that the cursor is now on a record
1006
	less than the old user record, and we need not move to the previous
1007
	record; (b) if equal_position is TRUE, then if
1008
	plan->stored_cursor_rec_processed is TRUE, we must move to the previous
1009
	record, else there is no need to move the cursor. */
1010
1011
	if (relative_position == BTR_PCUR_BEFORE
1012
	    || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
1013
1014
		return(FALSE);
1015
	}
1016
1017
	if (relative_position == BTR_PCUR_ON) {
1018
1019
		if (equal_position) {
1020
1021
			return(plan->stored_cursor_rec_processed);
1022
		}
1023
1024
		return(FALSE);
1025
	}
1026
1027
	ut_ad(relative_position == BTR_PCUR_AFTER
1028
	      || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE);
1029
1030
	return(TRUE);
1031
}
1032
1033
/*************************************************************************
1034
Resets a plan cursor to a closed state. */
1035
UNIV_INLINE
1036
void
1037
plan_reset_cursor(
1038
/*==============*/
1039
	plan_t*	plan)	/* in: plan */
1040
{
1041
	plan->pcur_is_open = FALSE;
1042
	plan->cursor_at_end = FALSE;
1043
	plan->n_rows_fetched = 0;
1044
	plan->n_rows_prefetched = 0;
1045
}
1046
1047
/*************************************************************************
1048
Tries to do a shortcut to fetch a clustered index record with a unique key,
1049
using the hash index if possible (not always). */
1050
static
1051
ulint
1052
row_sel_try_search_shortcut(
1053
/*========================*/
1054
				/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
1055
	sel_node_t*	node,	/* in: select node for a consistent read */
1056
	plan_t*		plan,	/* in: plan for a unique search in clustered
1057
				index */
1058
	mtr_t*		mtr)	/* in: mtr */
1059
{
1060
	dict_index_t*	index;
1061
	rec_t*		rec;
1062
	mem_heap_t*	heap		= NULL;
1063
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
1064
	ulint*		offsets		= offsets_;
1065
	ulint		ret;
1066
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
1067
1068
	index = plan->index;
1069
1070
	ut_ad(node->read_view);
1071
	ut_ad(plan->unique_search);
1072
	ut_ad(!plan->must_get_clust);
1073
#ifdef UNIV_SYNC_DEBUG
1074
	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
1075
#endif /* UNIV_SYNC_DEBUG */
1076
1077
	row_sel_open_pcur(node, plan, TRUE, mtr);
1078
1079
	rec = btr_pcur_get_rec(&(plan->pcur));
1080
1081
	if (!page_rec_is_user_rec(rec)) {
1082
1083
		return(SEL_RETRY);
1084
	}
1085
1086
	ut_ad(plan->mode == PAGE_CUR_GE);
1087
1088
	/* As the cursor is now placed on a user record after a search with
1089
	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
1090
	fields in the user record matched to the search tuple */
1091
1092
	if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
1093
1094
		return(SEL_EXHAUSTED);
1095
	}
1096
1097
	/* This is a non-locking consistent read: if necessary, fetch
1098
	a previous version of the record */
1099
1100
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1101
1102
	if (index->type & DICT_CLUSTERED) {
1103
		if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1104
						   node->read_view)) {
1105
			ret = SEL_RETRY;
1106
			goto func_exit;
1107
		}
1108
	} else if (!lock_sec_rec_cons_read_sees(rec, index, node->read_view)) {
1109
1110
		ret = SEL_RETRY;
1111
		goto func_exit;
1112
	}
1113
1114
	/* Test deleted flag. Fetch the columns needed in test conditions. */
1115
1116
	row_sel_fetch_columns(index, rec, offsets,
1117
			      UT_LIST_GET_FIRST(plan->columns));
1118
1119
	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))) {
1120
1121
		ret = SEL_EXHAUSTED;
1122
		goto func_exit;
1123
	}
1124
1125
	/* Test the rest of search conditions */
1126
1127
	if (!row_sel_test_other_conds(plan)) {
1128
1129
		ret = SEL_EXHAUSTED;
1130
		goto func_exit;
1131
	}
1132
1133
	ut_ad(plan->pcur.latch_mode == node->latch_mode);
1134
1135
	plan->n_rows_fetched++;
1136
	ret = SEL_FOUND;
1137
func_exit:
1138
	if (UNIV_LIKELY_NULL(heap)) {
1139
		mem_heap_free(heap);
1140
	}
1141
	return(ret);
1142
}
1143
1144
/*************************************************************************
1145
Performs a select step. */
1146
static
1147
ulint
1148
row_sel(
1149
/*====*/
1150
				/* out: DB_SUCCESS or error code */
1151
	sel_node_t*	node,	/* in: select node */
1152
	que_thr_t*	thr)	/* in: query thread */
1153
{
1154
	dict_index_t*	index;
1155
	plan_t*		plan;
1156
	mtr_t		mtr;
1157
	ibool		moved;
1158
	rec_t*		rec;
1159
	rec_t*		old_vers;
1160
	rec_t*		clust_rec;
1161
	ibool		search_latch_locked;
1162
	ibool		consistent_read;
1163
1164
	/* The following flag becomes TRUE when we are doing a
1165
	consistent read from a non-clustered index and we must look
1166
	at the clustered index to find out the previous delete mark
1167
	state of the non-clustered record: */
1168
1169
	ibool		cons_read_requires_clust_rec	= FALSE;
1170
	ulint		cost_counter			= 0;
1171
	ibool		cursor_just_opened;
1172
	ibool		must_go_to_next;
1173
	ibool		leaf_contains_updates		= FALSE;
1174
	/* TRUE if select_will_do_update is
1175
	TRUE and the current clustered index
1176
	leaf page has been updated during
1177
	the current mtr: mtr must be committed
1178
	at the same time as the leaf x-latch
1179
	is released */
1180
	ibool		mtr_has_extra_clust_latch	= FALSE;
1181
	/* TRUE if the search was made using
1182
	a non-clustered index, and we had to
1183
	access the clustered record: now &mtr
1184
	contains a clustered index latch, and
1185
	&mtr must be committed before we move
1186
	to the next non-clustered record */
1187
	ulint		found_flag;
1188
	ulint		err;
1189
	mem_heap_t*	heap				= NULL;
1190
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
1191
	ulint*		offsets				= offsets_;
1192
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
1193
1194
	ut_ad(thr->run_node == node);
1195
1196
	search_latch_locked = FALSE;
1197
1198
	if (node->read_view) {
1199
		/* In consistent reads, we try to do with the hash index and
1200
		not to use the buffer page get. This is to reduce memory bus
1201
		load resulting from semaphore operations. The search latch
1202
		will be s-locked when we access an index with a unique search
1203
		condition, but not locked when we access an index with a
1204
		less selective search condition. */
1205
1206
		consistent_read = TRUE;
1207
	} else {
1208
		consistent_read = FALSE;
1209
	}
1210
1211
table_loop:
1212
	/* TABLE LOOP
1213
	----------
1214
	This is the outer major loop in calculating a join. We come here when
1215
	node->fetch_table changes, and after adding a row to aggregate totals
1216
	and, of course, when this function is called. */
1217
1218
	ut_ad(leaf_contains_updates == FALSE);
1219
	ut_ad(mtr_has_extra_clust_latch == FALSE);
1220
1221
	plan = sel_node_get_nth_plan(node, node->fetch_table);
1222
	index = plan->index;
1223
1224
	if (plan->n_rows_prefetched > 0) {
1225
		sel_pop_prefetched_row(plan);
1226
1227
		goto next_table_no_mtr;
1228
	}
1229
1230
	if (plan->cursor_at_end) {
1231
		/* The cursor has already reached the result set end: no more
1232
		rows to process for this table cursor, as also the prefetch
1233
		stack was empty */
1234
1235
		ut_ad(plan->pcur_is_open);
1236
1237
		goto table_exhausted_no_mtr;
1238
	}
1239
1240
	/* Open a cursor to index, or restore an open cursor position */
1241
1242
	mtr_start(&mtr);
1243
1244
	if (consistent_read && plan->unique_search && !plan->pcur_is_open
1245
	    && !plan->must_get_clust
1246
	    && !plan->table->big_rows) {
1247
		if (!search_latch_locked) {
1248
			rw_lock_s_lock(&btr_search_latch);
1249
1250
			search_latch_locked = TRUE;
1251
		} else if (btr_search_latch.writer_is_wait_ex) {
1252
1253
			/* There is an x-latch request waiting: release the
1254
			s-latch for a moment; as an s-latch here is often
1255
			kept for some 10 searches before being released,
1256
			a waiting x-latch request would block other threads
1257
			from acquiring an s-latch for a long time, lowering
1258
			performance significantly in multiprocessors. */
1259
1260
			rw_lock_s_unlock(&btr_search_latch);
1261
			rw_lock_s_lock(&btr_search_latch);
1262
		}
1263
1264
		found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
1265
1266
		if (found_flag == SEL_FOUND) {
1267
1268
			goto next_table;
1269
1270
		} else if (found_flag == SEL_EXHAUSTED) {
1271
1272
			goto table_exhausted;
1273
		}
1274
1275
		ut_ad(found_flag == SEL_RETRY);
1276
1277
		plan_reset_cursor(plan);
1278
1279
		mtr_commit(&mtr);
1280
		mtr_start(&mtr);
1281
	}
1282
1283
	if (search_latch_locked) {
1284
		rw_lock_s_unlock(&btr_search_latch);
1285
1286
		search_latch_locked = FALSE;
1287
	}
1288
1289
	if (!plan->pcur_is_open) {
1290
		/* Evaluate the expressions to build the search tuple and
1291
		open the cursor */
1292
1293
		row_sel_open_pcur(node, plan, search_latch_locked, &mtr);
1294
1295
		cursor_just_opened = TRUE;
1296
1297
		/* A new search was made: increment the cost counter */
1298
		cost_counter++;
1299
	} else {
1300
		/* Restore pcur position to the index */
1301
1302
		must_go_to_next = row_sel_restore_pcur_pos(node, plan, &mtr);
1303
1304
		cursor_just_opened = FALSE;
1305
1306
		if (must_go_to_next) {
1307
			/* We have already processed the cursor record: move
1308
			to the next */
1309
1310
			goto next_rec;
1311
		}
1312
	}
1313
1314
rec_loop:
1315
	/* RECORD LOOP
1316
	-----------
1317
	In this loop we use pcur and try to fetch a qualifying row, and
1318
	also fill the prefetch buffer for this table if n_rows_fetched has
1319
	exceeded a threshold. While we are inside this loop, the following
1320
	holds:
1321
	(1) &mtr is started,
1322
	(2) pcur is positioned and open.
1323
1324
	NOTE that if cursor_just_opened is TRUE here, it means that we came
1325
	to this point right after row_sel_open_pcur. */
1326
1327
	ut_ad(mtr_has_extra_clust_latch == FALSE);
1328
1329
	rec = btr_pcur_get_rec(&(plan->pcur));
1330
1331
	/* PHASE 1: Set a lock if specified */
1332
1333
	if (!node->asc && cursor_just_opened
1334
	    && !page_rec_is_supremum(rec)) {
1335
1336
		/* When we open a cursor for a descending search, we must set
1337
		a next-key lock on the successor record: otherwise it would
1338
		be possible to insert new records next to the cursor position,
1339
		and it might be that these new records should appear in the
1340
		search result set, resulting in the phantom problem. */
1341
1342
		if (!consistent_read) {
1343
1344
			/* If innodb_locks_unsafe_for_binlog option is used
1345
			or this session is using READ COMMITTED isolation
1346
			level, we lock only the record, i.e., next-key
1347
			locking is not used. */
1348
1349
			rec_t*	next_rec = page_rec_get_next(rec);
1350
			ulint	lock_type;
1351
			trx_t*	trx;
1352
1353
			trx = thr_get_trx(thr);
1354
1355
			offsets = rec_get_offsets(next_rec, index, offsets,
1356
						  ULINT_UNDEFINED, &heap);
1357
1358
			if (srv_locks_unsafe_for_binlog
1359
			    || trx->isolation_level
1360
			    == TRX_ISO_READ_COMMITTED) {
1361
1362
				if (page_rec_is_supremum(next_rec)) {
1363
1364
					goto skip_lock;
1365
				}
1366
1367
				lock_type = LOCK_REC_NOT_GAP;
1368
			} else {
1369
				lock_type = LOCK_ORDINARY;
1370
			}
1371
1372
			err = sel_set_rec_lock(next_rec, index, offsets,
1373
					       node->row_lock_mode,
1374
					       lock_type, thr);
1375
1376
			if (err != DB_SUCCESS) {
1377
				/* Note that in this case we will store in pcur
1378
				the PREDECESSOR of the record we are waiting
1379
				the lock for */
1380
1381
				goto lock_wait_or_error;
1382
			}
1383
		}
1384
	}
1385
1386
skip_lock:
1387
	if (page_rec_is_infimum(rec)) {
1388
1389
		/* The infimum record on a page cannot be in the result set,
1390
		and neither can a record lock be placed on it: we skip such
1391
		a record. We also increment the cost counter as we may have
1392
		processed yet another page of index. */
1393
1394
		cost_counter++;
1395
1396
		goto next_rec;
1397
	}
1398
1399
	if (!consistent_read) {
1400
		/* Try to place a lock on the index record */
1401
1402
		/* If innodb_locks_unsafe_for_binlog option is used
1403
		or this session is using READ COMMITTED isolation level,
1404
		we lock only the record, i.e., next-key locking is
1405
		not used. */
1406
1407
		ulint	lock_type;
1408
		trx_t*	trx;
1409
1410
		offsets = rec_get_offsets(rec, index, offsets,
1411
					  ULINT_UNDEFINED, &heap);
1412
1413
		trx = thr_get_trx(thr);
1414
1415
		if (srv_locks_unsafe_for_binlog
1416
		    || trx->isolation_level == TRX_ISO_READ_COMMITTED) {
1417
1418
			if (page_rec_is_supremum(rec)) {
1419
1420
				goto next_rec;
1421
			}
1422
1423
			lock_type = LOCK_REC_NOT_GAP;
1424
		} else {
1425
			lock_type = LOCK_ORDINARY;
1426
		}
1427
1428
		err = sel_set_rec_lock(rec, index, offsets,
1429
				       node->row_lock_mode, lock_type, thr);
1430
1431
		if (err != DB_SUCCESS) {
1432
1433
			goto lock_wait_or_error;
1434
		}
1435
	}
1436
1437
	if (page_rec_is_supremum(rec)) {
1438
1439
		/* A page supremum record cannot be in the result set: skip
1440
		it now when we have placed a possible lock on it */
1441
1442
		goto next_rec;
1443
	}
1444
1445
	ut_ad(page_rec_is_user_rec(rec));
1446
1447
	if (cost_counter > SEL_COST_LIMIT) {
1448
1449
		/* Now that we have placed the necessary locks, we can stop
1450
		for a while and store the cursor position; NOTE that if we
1451
		would store the cursor position BEFORE placing a record lock,
1452
		it might happen that the cursor would jump over some records
1453
		that another transaction could meanwhile insert adjacent to
1454
		the cursor: this would result in the phantom problem. */
1455
1456
		goto stop_for_a_while;
1457
	}
1458
1459
	/* PHASE 2: Check a mixed index mix id if needed */
1460
1461
	if (plan->unique_search && cursor_just_opened) {
1462
1463
		ut_ad(plan->mode == PAGE_CUR_GE);
1464
1465
		/* As the cursor is now placed on a user record after a search
1466
		with the mode PAGE_CUR_GE, the up_match field in the cursor
1467
		tells how many fields in the user record matched to the search
1468
		tuple */
1469
1470
		if (btr_pcur_get_up_match(&(plan->pcur))
1471
		    < plan->n_exact_match) {
1472
			goto table_exhausted;
1473
		}
1474
1475
		/* Ok, no need to test end_conds or mix id */
1476
1477
	}
1478
1479
	/* We are ready to look at a possible new index entry in the result
1480
	set: the cursor is now placed on a user record */
1481
1482
	/* PHASE 3: Get previous version in a consistent read */
1483
1484
	cons_read_requires_clust_rec = FALSE;
1485
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
1486
1487
	if (consistent_read) {
1488
		/* This is a non-locking consistent read: if necessary, fetch
1489
		a previous version of the record */
1490
1491
		if (index->type & DICT_CLUSTERED) {
1492
1493
			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
1494
							   node->read_view)) {
1495
1496
				err = row_sel_build_prev_vers(
1497
					node->read_view, index, rec,
1498
					&offsets, &heap, &plan->old_vers_heap,
1499
					&old_vers, &mtr);
1500
1501
				if (err != DB_SUCCESS) {
1502
1503
					goto lock_wait_or_error;
1504
				}
1505
1506
				if (old_vers == NULL) {
1507
					offsets = rec_get_offsets(
1508
						rec, index, offsets,
1509
						ULINT_UNDEFINED, &heap);
1510
					row_sel_fetch_columns(
1511
						index, rec, offsets,
1512
						UT_LIST_GET_FIRST(
1513
							plan->columns));
1514
1515
					if (!row_sel_test_end_conds(plan)) {
1516
1517
						goto table_exhausted;
1518
					}
1519
1520
					goto next_rec;
1521
				}
1522
1523
				rec = old_vers;
1524
			}
1525
		} else if (!lock_sec_rec_cons_read_sees(rec, index,
1526
							node->read_view)) {
1527
			cons_read_requires_clust_rec = TRUE;
1528
		}
1529
	}
1530
1531
	/* PHASE 4: Test search end conditions and deleted flag */
1532
1533
	/* Fetch the columns needed in test conditions */
1534
1535
	row_sel_fetch_columns(index, rec, offsets,
1536
			      UT_LIST_GET_FIRST(plan->columns));
1537
1538
	/* Test the selection end conditions: these can only contain columns
1539
	which already are found in the index, even though the index might be
1540
	non-clustered */
1541
1542
	if (plan->unique_search && cursor_just_opened) {
1543
1544
		/* No test necessary: the test was already made above */
1545
1546
	} else if (!row_sel_test_end_conds(plan)) {
1547
1548
		goto table_exhausted;
1549
	}
1550
1551
	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
1552
	    && !cons_read_requires_clust_rec) {
1553
1554
		/* The record is delete marked: we can skip it if this is
1555
		not a consistent read which might see an earlier version
1556
		of a non-clustered index record */
1557
1558
		if (plan->unique_search) {
1559
1560
			goto table_exhausted;
1561
		}
1562
1563
		goto next_rec;
1564
	}
1565
1566
	/* PHASE 5: Get the clustered index record, if needed and if we did
1567
	not do the search using the clustered index */
1568
1569
	if (plan->must_get_clust || cons_read_requires_clust_rec) {
1570
1571
		/* It was a non-clustered index and we must fetch also the
1572
		clustered index record */
1573
1574
		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
1575
					    &mtr);
1576
		mtr_has_extra_clust_latch = TRUE;
1577
1578
		if (err != DB_SUCCESS) {
1579
1580
			goto lock_wait_or_error;
1581
		}
1582
1583
		/* Retrieving the clustered record required a search:
1584
		increment the cost counter */
1585
1586
		cost_counter++;
1587
1588
		if (clust_rec == NULL) {
1589
			/* The record did not exist in the read view */
1590
			ut_ad(consistent_read);
1591
1592
			goto next_rec;
1593
		}
1594
1595
		if (rec_get_deleted_flag(clust_rec,
1596
					 dict_table_is_comp(plan->table))) {
1597
1598
			/* The record is delete marked: we can skip it */
1599
1600
			goto next_rec;
1601
		}
1602
1603
		if (node->can_get_updated) {
1604
1605
			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
1606
		}
1607
	}
1608
1609
	/* PHASE 6: Test the rest of search conditions */
1610
1611
	if (!row_sel_test_other_conds(plan)) {
1612
1613
		if (plan->unique_search) {
1614
1615
			goto table_exhausted;
1616
		}
1617
1618
		goto next_rec;
1619
	}
1620
1621
	/* PHASE 7: We found a new qualifying row for the current table; push
1622
	the row if prefetch is on, or move to the next table in the join */
1623
1624
	plan->n_rows_fetched++;
1625
1626
	ut_ad(plan->pcur.latch_mode == node->latch_mode);
1627
1628
	if (node->select_will_do_update) {
1629
		/* This is a searched update and we can do the update in-place,
1630
		saving CPU time */
1631
1632
		row_upd_in_place_in_select(node, thr, &mtr);
1633
1634
		leaf_contains_updates = TRUE;
1635
1636
		/* When the database is in the online backup mode, the number
1637
		of log records for a single mtr should be small: increment the
1638
		cost counter to ensure it */
1639
1640
		cost_counter += 1 + (SEL_COST_LIMIT / 8);
1641
1642
		if (plan->unique_search) {
1643
1644
			goto table_exhausted;
1645
		}
1646
1647
		goto next_rec;
1648
	}
1649
1650
	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
1651
	    || plan->unique_search || plan->no_prefetch
1652
	    || plan->table->big_rows) {
1653
1654
		/* No prefetch in operation: go to the next table */
1655
1656
		goto next_table;
1657
	}
1658
1659
	sel_push_prefetched_row(plan);
1660
1661
	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
1662
1663
		/* The prefetch buffer is now full */
1664
1665
		sel_pop_prefetched_row(plan);
1666
1667
		goto next_table;
1668
	}
1669
1670
next_rec:
1671
	ut_ad(!search_latch_locked);
1672
1673
	if (mtr_has_extra_clust_latch) {
1674
1675
		/* We must commit &mtr if we are moving to the next
1676
		non-clustered index record, because we could break the
1677
		latching order if we would access a different clustered
1678
		index page right away without releasing the previous. */
1679
1680
		goto commit_mtr_for_a_while;
1681
	}
1682
1683
	if (leaf_contains_updates
1684
	    && btr_pcur_is_after_last_on_page(&(plan->pcur), &mtr)) {
1685
1686
		/* We must commit &mtr if we are moving to a different page,
1687
		because we have done updates to the x-latched leaf page, and
1688
		the latch would be released in btr_pcur_move_to_next, without
1689
		&mtr getting committed there */
1690
1691
		ut_ad(node->asc);
1692
1693
		goto commit_mtr_for_a_while;
1694
	}
1695
1696
	if (node->asc) {
1697
		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
1698
	} else {
1699
		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
1700
	}
1701
1702
	if (!moved) {
1703
1704
		goto table_exhausted;
1705
	}
1706
1707
	cursor_just_opened = FALSE;
1708
1709
	/* END OF RECORD LOOP
1710
	------------------ */
1711
	goto rec_loop;
1712
1713
next_table:
1714
	/* We found a record which satisfies the conditions: we can move to
1715
	the next table or return a row in the result set */
1716
1717
	ut_ad(btr_pcur_is_on_user_rec(&(plan->pcur), &mtr));
1718
1719
	if (plan->unique_search && !node->can_get_updated) {
1720
1721
		plan->cursor_at_end = TRUE;
1722
	} else {
1723
		ut_ad(!search_latch_locked);
1724
1725
		plan->stored_cursor_rec_processed = TRUE;
1726
1727
		btr_pcur_store_position(&(plan->pcur), &mtr);
1728
	}
1729
1730
	mtr_commit(&mtr);
1731
1732
	leaf_contains_updates = FALSE;
1733
	mtr_has_extra_clust_latch = FALSE;
1734
1735
next_table_no_mtr:
1736
	/* If we use 'goto' to this label, it means that the row was popped
1737
	from the prefetched rows stack, and &mtr is already committed */
1738
1739
	if (node->fetch_table + 1 == node->n_tables) {
1740
1741
		sel_eval_select_list(node);
1742
1743
		if (node->is_aggregate) {
1744
1745
			goto table_loop;
1746
		}
1747
1748
		sel_assign_into_var_values(node->into_list, node);
1749
1750
		thr->run_node = que_node_get_parent(node);
1751
1752
		if (search_latch_locked) {
1753
			rw_lock_s_unlock(&btr_search_latch);
1754
		}
1755
1756
		err = DB_SUCCESS;
1757
		goto func_exit;
1758
	}
1759
1760
	node->fetch_table++;
1761
1762
	/* When we move to the next table, we first reset the plan cursor:
1763
	we do not care about resetting it when we backtrack from a table */
1764
1765
	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
1766
1767
	goto table_loop;
1768
1769
table_exhausted:
1770
	/* The table cursor pcur reached the result set end: backtrack to the
1771
	previous table in the join if we do not have cached prefetched rows */
1772
1773
	plan->cursor_at_end = TRUE;
1774
1775
	mtr_commit(&mtr);
1776
1777
	leaf_contains_updates = FALSE;
1778
	mtr_has_extra_clust_latch = FALSE;
1779
1780
	if (plan->n_rows_prefetched > 0) {
1781
		/* The table became exhausted during a prefetch */
1782
1783
		sel_pop_prefetched_row(plan);
1784
1785
		goto next_table_no_mtr;
1786
	}
1787
1788
table_exhausted_no_mtr:
1789
	if (node->fetch_table == 0) {
1790
		err = DB_SUCCESS;
1791
1792
		if (node->is_aggregate && !node->aggregate_already_fetched) {
1793
1794
			node->aggregate_already_fetched = TRUE;
1795
1796
			sel_assign_into_var_values(node->into_list, node);
1797
1798
			thr->run_node = que_node_get_parent(node);
1799
1800
			if (search_latch_locked) {
1801
				rw_lock_s_unlock(&btr_search_latch);
1802
			}
1803
1804
			goto func_exit;
1805
		}
1806
1807
		node->state = SEL_NODE_NO_MORE_ROWS;
1808
1809
		thr->run_node = que_node_get_parent(node);
1810
1811
		if (search_latch_locked) {
1812
			rw_lock_s_unlock(&btr_search_latch);
1813
		}
1814
1815
		goto func_exit;
1816
	}
1817
1818
	node->fetch_table--;
1819
1820
	goto table_loop;
1821
1822
stop_for_a_while:
1823
	/* Return control for a while to que_run_threads, so that runaway
1824
	queries can be canceled. NOTE that when we come here, we must, in a
1825
	locking read, have placed the necessary (possibly waiting request)
1826
	record lock on the cursor record or its successor: when we reposition
1827
	the cursor, this record lock guarantees that nobody can meanwhile have
1828
	inserted new records which should have appeared in the result set,
1829
	which would result in the phantom problem. */
1830
1831
	ut_ad(!search_latch_locked);
1832
1833
	plan->stored_cursor_rec_processed = FALSE;
1834
	btr_pcur_store_position(&(plan->pcur), &mtr);
1835
1836
	mtr_commit(&mtr);
1837
1838
#ifdef UNIV_SYNC_DEBUG
1839
	ut_ad(sync_thread_levels_empty_gen(TRUE));
1840
#endif /* UNIV_SYNC_DEBUG */
1841
	err = DB_SUCCESS;
1842
	goto func_exit;
1843
1844
commit_mtr_for_a_while:
1845
	/* Stores the cursor position and commits &mtr; this is used if
1846
	&mtr may contain latches which would break the latching order if
1847
	&mtr would not be committed and the latches released. */
1848
1849
	plan->stored_cursor_rec_processed = TRUE;
1850
1851
	ut_ad(!search_latch_locked);
1852
	btr_pcur_store_position(&(plan->pcur), &mtr);
1853
1854
	mtr_commit(&mtr);
1855
1856
	leaf_contains_updates = FALSE;
1857
	mtr_has_extra_clust_latch = FALSE;
1858
1859
#ifdef UNIV_SYNC_DEBUG
1860
	ut_ad(sync_thread_levels_empty_gen(TRUE));
1861
#endif /* UNIV_SYNC_DEBUG */
1862
1863
	goto table_loop;
1864
1865
lock_wait_or_error:
1866
	/* See the note at stop_for_a_while: the same holds for this case */
1867
1868
	ut_ad(!btr_pcur_is_before_first_on_page(&(plan->pcur), &mtr)
1869
	      || !node->asc);
1870
	ut_ad(!search_latch_locked);
1871
1872
	plan->stored_cursor_rec_processed = FALSE;
1873
	btr_pcur_store_position(&(plan->pcur), &mtr);
1874
1875
	mtr_commit(&mtr);
1876
1877
#ifdef UNIV_SYNC_DEBUG
1878
	ut_ad(sync_thread_levels_empty_gen(TRUE));
1879
#endif /* UNIV_SYNC_DEBUG */
1880
1881
func_exit:
1882
	if (UNIV_LIKELY_NULL(heap)) {
1883
		mem_heap_free(heap);
1884
	}
1885
	return(err);
1886
}
1887
1888
/**************************************************************************
1889
Performs a select step. This is a high-level function used in SQL execution
1890
graphs. */
1891
1892
que_thr_t*
1893
row_sel_step(
1894
/*=========*/
1895
				/* out: query thread to run next or NULL */
1896
	que_thr_t*	thr)	/* in: query thread */
1897
{
1898
	ulint		i_lock_mode;
1899
	sym_node_t*	table_node;
1900
	sel_node_t*	node;
1901
	ulint		err;
1902
1903
	ut_ad(thr);
1904
1905
	node = thr->run_node;
1906
1907
	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
1908
1909
	/* If this is a new time this node is executed (or when execution
1910
	resumes after wait for a table intention lock), set intention locks
1911
	on the tables, or assign a read view */
1912
1913
	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
1914
1915
		node->state = SEL_NODE_OPEN;
1916
	}
1917
1918
	if (node->state == SEL_NODE_OPEN) {
1919
1920
		/* It may be that the current session has not yet started
1921
		its transaction, or it has been committed: */
1922
1923
		trx_start_if_not_started(thr_get_trx(thr));
1924
1925
		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
1926
1927
		if (node->consistent_read) {
1928
			/* Assign a read view for the query */
1929
			node->read_view = trx_assign_read_view(
1930
				thr_get_trx(thr));
1931
		} else {
1932
			if (node->set_x_locks) {
1933
				i_lock_mode = LOCK_IX;
1934
			} else {
1935
				i_lock_mode = LOCK_IS;
1936
			}
1937
1938
			table_node = node->table_list;
1939
1940
			while (table_node) {
1941
				err = lock_table(0, table_node->table,
1942
						 i_lock_mode, thr);
1943
				if (err != DB_SUCCESS) {
1944
					thr_get_trx(thr)->error_state = err;
1945
1946
					return(NULL);
1947
				}
1948
1949
				table_node = que_node_get_next(table_node);
1950
			}
1951
		}
1952
1953
		/* If this is an explicit cursor, copy stored procedure
1954
		variable values, so that the values cannot change between
1955
		fetches (currently, we copy them also for non-explicit
1956
		cursors) */
1957
1958
		if (node->explicit_cursor
1959
		    && UT_LIST_GET_FIRST(node->copy_variables)) {
1960
1961
			row_sel_copy_input_variable_vals(node);
1962
		}
1963
1964
		node->state = SEL_NODE_FETCH;
1965
		node->fetch_table = 0;
1966
1967
		if (node->is_aggregate) {
1968
			/* Reset the aggregate total values */
1969
			sel_reset_aggregate_vals(node);
1970
		}
1971
	}
1972
1973
	err = row_sel(node, thr);
1974
1975
	/* NOTE! if queries are parallelized, the following assignment may
1976
	have problems; the assignment should be made only if thr is the
1977
	only top-level thr in the graph: */
1978
1979
	thr->graph->last_sel_node = node;
1980
1981
	if (err != DB_SUCCESS) {
1982
		thr_get_trx(thr)->error_state = err;
1983
1984
		return(NULL);
1985
	}
1986
1987
	return(thr);
1988
}
1989
1990
/**************************************************************************
1991
Performs a fetch for a cursor. */
1992
1993
que_thr_t*
1994
fetch_step(
1995
/*=======*/
1996
				/* out: query thread to run next or NULL */
1997
	que_thr_t*	thr)	/* in: query thread */
1998
{
1999
	sel_node_t*	sel_node;
2000
	fetch_node_t*	node;
2001
2002
	ut_ad(thr);
2003
2004
	node = thr->run_node;
2005
	sel_node = node->cursor_def;
2006
2007
	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
2008
2009
	if (thr->prev_node != que_node_get_parent(node)) {
2010
2011
		if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
2012
2013
			if (node->into_list) {
2014
				sel_assign_into_var_values(node->into_list,
2015
							   sel_node);
2016
			} else {
2017
				void* ret = (*node->func->func)(
2018
					sel_node, node->func->arg);
2019
2020
				if (!ret) {
2021
					sel_node->state
2022
						= SEL_NODE_NO_MORE_ROWS;
2023
				}
2024
			}
2025
		}
2026
2027
		thr->run_node = que_node_get_parent(node);
2028
2029
		return(thr);
2030
	}
2031
2032
	/* Make the fetch node the parent of the cursor definition for
2033
	the time of the fetch, so that execution knows to return to this
2034
	fetch node after a row has been selected or we know that there is
2035
	no row left */
2036
2037
	sel_node->common.parent = node;
2038
2039
	if (sel_node->state == SEL_NODE_CLOSED) {
2040
		fprintf(stderr,
2041
			"InnoDB: Error: fetch called on a closed cursor\n");
2042
2043
		thr_get_trx(thr)->error_state = DB_ERROR;
2044
2045
		return(NULL);
2046
	}
2047
2048
	thr->run_node = sel_node;
2049
2050
	return(thr);
2051
}
2052
2053
/********************************************************************
2054
Sample callback function for fetch that prints each row.*/
2055
2056
void*
2057
row_fetch_print(
2058
/*============*/
2059
				/* out: always returns non-NULL */
2060
	void*	row,		/* in:  sel_node_t* */
2061
	void*	user_arg)	/* in:  not used */
2062
{
2063
	sel_node_t*	node = row;
2064
	que_node_t*	exp;
2065
	ulint		i = 0;
2066
2067
	UT_NOT_USED(user_arg);
2068
2069
	fprintf(stderr, "row_fetch_print: row %p\n", row);
2070
2071
	exp = node->select_list;
2072
2073
	while (exp) {
2074
		dfield_t*	dfield = que_node_get_val(exp);
2075
		dtype_t*	type = dfield_get_type(dfield);
2076
2077
		fprintf(stderr, " column %lu:\n", (ulong)i);
2078
2079
		dtype_print(type);
2080
		fprintf(stderr, "\n");
2081
2082
		if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
2083
			ut_print_buf(stderr, dfield_get_data(dfield),
2084
				     dfield_get_len(dfield));
2085
		} else {
2086
			fprintf(stderr, " <NULL>;");
2087
		}
2088
2089
		fprintf(stderr, "\n");
2090
2091
		exp = que_node_get_next(exp);
2092
		i++;
2093
	}
2094
2095
	return((void*)42);
2096
}
2097
2098
/********************************************************************
2099
Callback function for fetch that stores an unsigned 4 byte integer to the
2100
location pointed. The column's type must be DATA_INT, DATA_UNSIGNED, length
2101
= 4. */
2102
2103
void*
2104
row_fetch_store_uint4(
2105
/*==================*/
2106
				/* out: always returns NULL */
2107
	void*	row,		/* in:  sel_node_t* */
2108
	void*	user_arg)	/* in:  data pointer */
2109
{
2110
	sel_node_t*	node = row;
2111
	ib_uint32_t*	val = user_arg;
2112
	ulint		tmp;
2113
2114
	dfield_t*	dfield = que_node_get_val(node->select_list);
2115
	dtype_t*	type = dfield_get_type(dfield);
2116
	ulint		len = dfield_get_len(dfield);
2117
2118
	ut_a(dtype_get_mtype(type) == DATA_INT);
2119
	ut_a(dtype_get_prtype(type) & DATA_UNSIGNED);
2120
	ut_a(len == 4);
2121
2122
	tmp = mach_read_from_4(dfield_get_data(dfield));
2123
	*val = (ib_uint32_t) tmp;
2124
2125
	return(NULL);
2126
}
2127
2128
/***************************************************************
2129
Prints a row in a select result. */
2130
2131
que_thr_t*
2132
row_printf_step(
2133
/*============*/
2134
				/* out: query thread to run next or NULL */
2135
	que_thr_t*	thr)	/* in: query thread */
2136
{
2137
	row_printf_node_t*	node;
2138
	sel_node_t*		sel_node;
2139
	que_node_t*		arg;
2140
2141
	ut_ad(thr);
2142
2143
	node = thr->run_node;
2144
2145
	sel_node = node->sel_node;
2146
2147
	ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
2148
2149
	if (thr->prev_node == que_node_get_parent(node)) {
2150
2151
		/* Reset the cursor */
2152
		sel_node->state = SEL_NODE_OPEN;
2153
2154
		/* Fetch next row to print */
2155
2156
		thr->run_node = sel_node;
2157
2158
		return(thr);
2159
	}
2160
2161
	if (sel_node->state != SEL_NODE_FETCH) {
2162
2163
		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
2164
2165
		/* No more rows to print */
2166
2167
		thr->run_node = que_node_get_parent(node);
2168
2169
		return(thr);
2170
	}
2171
2172
	arg = sel_node->select_list;
2173
2174
	while (arg) {
2175
		dfield_print_also_hex(que_node_get_val(arg));
2176
2177
		fputs(" ::: ", stderr);
2178
2179
		arg = que_node_get_next(arg);
2180
	}
2181
2182
	putc('\n', stderr);
2183
2184
	/* Fetch next row to print */
2185
2186
	thr->run_node = sel_node;
2187
2188
	return(thr);
2189
}
2190
2191
/********************************************************************
2192
Converts a key value stored in MySQL format to an Innobase dtuple. The last
2193
field of the key value may be just a prefix of a fixed length field: hence
2194
the parameter key_len. But currently we do not allow search keys where the
2195
last field is only a prefix of the full key field len and print a warning if
2196
such appears. A counterpart of this function is
2197
ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2198
2199
void
2200
row_sel_convert_mysql_key_to_innobase(
2201
/*==================================*/
2202
	dtuple_t*	tuple,		/* in: tuple where to build;
2203
					NOTE: we assume that the type info
2204
					in the tuple is already according
2205
					to index! */
2206
	byte*		buf,		/* in: buffer to use in field
2207
					conversions */
2208
	ulint		buf_len,	/* in: buffer length */
2209
	dict_index_t*	index,		/* in: index of the key value */
2210
	byte*		key_ptr,	/* in: MySQL key value */
2211
	ulint		key_len,	/* in: MySQL key value length */
2212
	trx_t*		trx)		/* in: transaction */
2213
{
2214
	byte*		original_buf	= buf;
2215
	byte*		original_key_ptr = key_ptr;
2216
	dict_field_t*	field;
2217
	dfield_t*	dfield;
2218
	ulint		data_offset;
2219
	ulint		data_len;
2220
	ulint		data_field_len;
2221
	ibool		is_null;
2222
	byte*		key_end;
2223
	ulint		n_fields = 0;
2224
	ulint		type;
2225
2226
	/* For documentation of the key value storage format in MySQL, see
2227
	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
2228
2229
	key_end = key_ptr + key_len;
2230
2231
	/* Permit us to access any field in the tuple (ULINT_MAX): */
2232
2233
	dtuple_set_n_fields(tuple, ULINT_MAX);
2234
2235
	dfield = dtuple_get_nth_field(tuple, 0);
2236
	field = dict_index_get_nth_field(index, 0);
2237
2238
	if (dfield_get_type(dfield)->mtype == DATA_SYS) {
2239
		/* A special case: we are looking for a position in the
2240
		generated clustered index which InnoDB automatically added
2241
		to a table with no primary key: the first and the only
2242
		ordering column is ROW_ID which InnoDB stored to the key_ptr
2243
		buffer. */
2244
2245
		ut_a(key_len == DATA_ROW_ID_LEN);
2246
2247
		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
2248
2249
		dtuple_set_n_fields(tuple, 1);
2250
2251
		return;
2252
	}
2253
2254
	while (key_ptr < key_end) {
2255
2256
		ut_a(field->col->mtype == dfield_get_type(dfield)->mtype);
2257
2258
		data_offset = 0;
2259
		is_null = FALSE;
2260
2261
		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
2262
			/* The first byte in the field tells if this is
2263
			an SQL NULL value */
2264
2265
			data_offset = 1;
2266
2267
			if (*key_ptr != 0) {
2268
				dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
2269
2270
				is_null = TRUE;
2271
			}
2272
		}
2273
2274
		type = dfield_get_type(dfield)->mtype;
2275
2276
		/* Calculate data length and data field total length */
2277
2278
		if (type == DATA_BLOB) {
2279
			/* The key field is a column prefix of a BLOB or
2280
			TEXT */
2281
2282
			ut_a(field->prefix_len > 0);
2283
2284
			/* MySQL stores the actual data length to the first 2
2285
			bytes after the optional SQL NULL marker byte. The
2286
			storage format is little-endian, that is, the most
2287
			significant byte at a higher address. In UTF-8, MySQL
2288
			seems to reserve field->prefix_len bytes for
2289
			storing this field in the key value buffer, even
2290
			though the actual value only takes data_len bytes
2291
			from the start. */
2292
2293
			data_len = key_ptr[data_offset]
2294
				+ 256 * key_ptr[data_offset + 1];
2295
			data_field_len = data_offset + 2 + field->prefix_len;
2296
2297
			data_offset += 2;
2298
2299
			/* Now that we know the length, we store the column
2300
			value like it would be a fixed char field */
2301
2302
		} else if (field->prefix_len > 0) {
2303
			/* Looks like MySQL pads unused end bytes in the
2304
			prefix with space. Therefore, also in UTF-8, it is ok
2305
			to compare with a prefix containing full prefix_len
2306
			bytes, and no need to take at most prefix_len / 3
2307
			UTF-8 characters from the start.
2308
			If the prefix is used as the upper end of a LIKE
2309
			'abc%' query, then MySQL pads the end with chars
2310
			0xff. TODO: in that case does it any harm to compare
2311
			with the full prefix_len bytes. How do characters
2312
			0xff in UTF-8 behave? */
2313
2314
			data_len = field->prefix_len;
2315
			data_field_len = data_offset + data_len;
2316
		} else {
2317
			data_len = dfield_get_type(dfield)->len;
2318
			data_field_len = data_offset + data_len;
2319
		}
2320
2321
		if (dtype_get_mysql_type(dfield_get_type(dfield))
2322
		    == DATA_MYSQL_TRUE_VARCHAR
2323
		    && dfield_get_type(dfield)->mtype != DATA_INT) {
2324
			/* In a MySQL key value format, a true VARCHAR is
2325
			always preceded by 2 bytes of a length field.
2326
			dfield_get_type(dfield)->len returns the maximum
2327
			'payload' len in bytes. That does not include the
2328
			2 bytes that tell the actual data length.
2329
2330
			We added the check != DATA_INT to make sure we do
2331
			not treat MySQL ENUM or SET as a true VARCHAR! */
2332
2333
			data_len += 2;
2334
			data_field_len += 2;
2335
		}
2336
2337
		/* Storing may use at most data_len bytes of buf */
2338
2339
		if (!is_null) {
2340
			row_mysql_store_col_in_innobase_format(
2341
				dfield, buf,
2342
				FALSE, /* MySQL key value format col */
2343
				key_ptr + data_offset, data_len,
2344
				dict_table_is_comp(index->table));
2345
			buf += data_len;
2346
		}
2347
2348
		key_ptr += data_field_len;
2349
2350
		if (key_ptr > key_end) {
2351
			/* The last field in key was not a complete key field
2352
			but a prefix of it.
2353
2354
			Print a warning about this! HA_READ_PREFIX_LAST does
2355
			not currently work in InnoDB with partial-field key
2356
			value prefixes. Since MySQL currently uses a padding
2357
			trick to calculate LIKE 'abc%' type queries there
2358
			should never be partial-field prefixes in searches. */
2359
2360
			ut_print_timestamp(stderr);
2361
2362
			fputs("  InnoDB: Warning: using a partial-field"
2363
			      " key prefix in search.\n"
2364
			      "InnoDB: ", stderr);
2365
			dict_index_name_print(stderr, trx, index);
2366
			fprintf(stderr, ". Last data field length %lu bytes,\n"
2367
				"InnoDB: key ptr now exceeds"
2368
				" key end by %lu bytes.\n"
2369
				"InnoDB: Key value in the MySQL format:\n",
2370
				(ulong) data_field_len,
2371
				(ulong) (key_ptr - key_end));
2372
			fflush(stderr);
2373
			ut_print_buf(stderr, original_key_ptr, key_len);
2374
			fprintf(stderr, "\n");
2375
2376
			if (!is_null) {
2377
				dfield->len -= (ulint)(key_ptr - key_end);
2378
			}
2379
		}
2380
2381
		n_fields++;
2382
		field++;
2383
		dfield++;
2384
	}
2385
2386
	ut_a(buf <= original_buf + buf_len);
2387
2388
	/* We set the length of tuple to n_fields: we assume that the memory
2389
	area allocated for it is big enough (usually bigger than n_fields). */
2390
2391
	dtuple_set_n_fields(tuple, n_fields);
2392
}
2393
2394
/******************************************************************
2395
Stores the row id to the prebuilt struct. */
2396
static
2397
void
2398
row_sel_store_row_id_to_prebuilt(
2399
/*=============================*/
2400
	row_prebuilt_t*	prebuilt,	/* in: prebuilt */
2401
	rec_t*		index_rec,	/* in: record */
2402
	dict_index_t*	index,		/* in: index of the record */
2403
	const ulint*	offsets)	/* in: rec_get_offsets
2404
					(index_rec, index) */
2405
{
2406
	byte*	data;
2407
	ulint	len;
2408
2409
	ut_ad(rec_offs_validate(index_rec, index, offsets));
2410
2411
	data = rec_get_nth_field(
2412
		index_rec, offsets,
2413
		dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
2414
2415
	if (len != DATA_ROW_ID_LEN) {
2416
		fprintf(stderr,
2417
			"InnoDB: Error: Row id field is"
2418
			" wrong length %lu in ", (ulong) len);
2419
		dict_index_name_print(stderr, prebuilt->trx, index);
2420
		fprintf(stderr, "\n"
2421
			"InnoDB: Field number %lu, record:\n",
2422
			(ulong) dict_index_get_sys_col_pos(index,
2423
							   DATA_ROW_ID));
2424
		rec_print_new(stderr, index_rec, offsets);
2425
		putc('\n', stderr);
2426
		ut_error;
2427
	}
2428
2429
	ut_memcpy(prebuilt->row_id, data, len);
2430
}
2431
2432
/******************************************************************
2433
Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
2434
function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
2435
static
2436
void
2437
row_sel_field_store_in_mysql_format(
2438
/*================================*/
2439
	byte*	dest,	/* in/out: buffer where to store; NOTE that BLOBs
2440
			are not in themselves stored here: the caller must
2441
			allocate and copy the BLOB into buffer before, and pass
2442
			the pointer to the BLOB in 'data' */
2443
	const mysql_row_templ_t* templ,	/* in: MySQL column template.
2444
			Its following fields are referenced:
2445
			type, is_unsigned, mysql_col_len, mbminlen, mbmaxlen */
2446
	byte*	data,	/* in: data to store */
2447
	ulint	len)	/* in: length of the data */
2448
{
2449
	byte*	ptr;
2450
	byte*	field_end;
2451
	byte*	pad_ptr;
2452
2453
	ut_ad(len != UNIV_SQL_NULL);
2454
2455
	if (templ->type == DATA_INT) {
2456
		/* Convert integer data from Innobase to a little-endian
2457
		format, sign bit restored to normal */
2458
2459
		ptr = dest + len;
2460
2461
		for (;;) {
2462
			ptr--;
2463
			*ptr = *data;
2464
			if (ptr == dest) {
2465
				break;
2466
			}
2467
			data++;
2468
		}
2469
2470
		if (!templ->is_unsigned) {
2471
			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
2472
		}
2473
2474
		ut_ad(templ->mysql_col_len == len);
2475
	} else if (templ->type == DATA_VARCHAR
2476
		   || templ->type == DATA_VARMYSQL
2477
		   || templ->type == DATA_BINARY) {
2478
2479
		field_end = dest + templ->mysql_col_len;
2480
2481
		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
2482
			/* This is a >= 5.0.3 type true VARCHAR. Store the
2483
			length of the data to the first byte or the first
2484
			two bytes of dest. */
2485
2486
			dest = row_mysql_store_true_var_len(
2487
				dest, len, templ->mysql_length_bytes);
2488
		}
2489
2490
		/* Copy the actual data */
2491
		ut_memcpy(dest, data, len);
2492
2493
		/* Pad with trailing spaces. We pad with spaces also the
2494
		unused end of a >= 5.0.3 true VARCHAR column, just in case
2495
		MySQL expects its contents to be deterministic. */
2496
2497
		pad_ptr = dest + len;
2498
2499
		ut_ad(templ->mbminlen <= templ->mbmaxlen);
2500
2501
		/* We handle UCS2 charset strings differently. */
2502
		if (templ->mbminlen == 2) {
2503
			/* A space char is two bytes, 0x0020 in UCS2 */
2504
2505
			if (len & 1) {
2506
				/* A 0x20 has been stripped from the column.
2507
				Pad it back. */
2508
2509
				if (pad_ptr < field_end) {
2510
					*pad_ptr = 0x20;
2511
					pad_ptr++;
2512
				}
2513
			}
2514
2515
			/* Pad the rest of the string with 0x0020 */
2516
2517
			while (pad_ptr < field_end) {
2518
				*pad_ptr = 0x00;
2519
				pad_ptr++;
2520
				*pad_ptr = 0x20;
2521
				pad_ptr++;
2522
			}
2523
		} else {
2524
			ut_ad(templ->mbminlen == 1);
2525
			/* space=0x20 */
2526
2527
			memset(pad_ptr, 0x20, field_end - pad_ptr);
2528
		}
2529
	} else if (templ->type == DATA_BLOB) {
2530
		/* Store a pointer to the BLOB buffer to dest: the BLOB was
2531
		already copied to the buffer in row_sel_store_mysql_rec */
2532
2533
		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
2534
					 len);
2535
	} else if (templ->type == DATA_MYSQL) {
2536
		memcpy(dest, data, len);
2537
2538
		ut_ad(templ->mysql_col_len >= len);
2539
		ut_ad(templ->mbmaxlen >= templ->mbminlen);
2540
2541
		ut_ad(templ->mbmaxlen > templ->mbminlen
2542
		      || templ->mysql_col_len == len);
2543
		/* The following assertion would fail for old tables
2544
		containing UTF-8 ENUM columns due to Bug #9526. */
2545
		ut_ad(!templ->mbmaxlen
2546
		      || !(templ->mysql_col_len % templ->mbmaxlen));
2547
		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
2548
2549
		if (templ->mbminlen != templ->mbmaxlen) {
2550
			/* Pad with spaces. This undoes the stripping
2551
			done in row0mysql.ic, function
2552
			row_mysql_store_col_in_innobase_format(). */
2553
2554
			memset(dest + len, 0x20, templ->mysql_col_len - len);
2555
		}
2556
	} else {
2557
		ut_ad(templ->type == DATA_CHAR
2558
		      || templ->type == DATA_FIXBINARY
2559
		      /*|| templ->type == DATA_SYS_CHILD
2560
		      || templ->type == DATA_SYS*/
2561
		      || templ->type == DATA_FLOAT
2562
		      || templ->type == DATA_DOUBLE
2563
		      || templ->type == DATA_DECIMAL);
2564
		ut_ad(templ->mysql_col_len == len);
2565
2566
		memcpy(dest, data, len);
2567
	}
2568
}
2569
2570
/******************************************************************
2571
Convert a row in the Innobase format to a row in the MySQL format.
2572
Note that the template in prebuilt may advise us to copy only a few
2573
columns to mysql_rec, other columns are left blank. All columns may not
2574
be needed in the query. */
2575
static
2576
ibool
2577
row_sel_store_mysql_rec(
2578
/*====================*/
2579
					/* out: TRUE if success, FALSE if
2580
					could not allocate memory for a BLOB
2581
					(though we may also assert in that
2582
					case) */
2583
	byte*		mysql_rec,	/* out: row in the MySQL format */
2584
	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
2585
	rec_t*		rec,		/* in: Innobase record in the index
2586
					which was described in prebuilt's
2587
					template */
2588
	const ulint*	offsets, 	/* in: array returned by
2589
					rec_get_offsets() */
2590
        ulint start_field_no,
2591
        ulint end_field_no)
2592
{
2593
	mysql_row_templ_t*	templ;
2594
	mem_heap_t*		extern_field_heap	= NULL;
2595
	mem_heap_t*		heap;
2596
	byte*			data;
2597
	ulint			len;
2598
	ulint			i;
2599
2600
	ut_ad(prebuilt->mysql_template);
2601
	ut_ad(rec_offs_validate(rec, NULL, offsets));
2602
2603
	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
2604
		mem_heap_free(prebuilt->blob_heap);
2605
		prebuilt->blob_heap = NULL;
2606
	}
2607
2608
	for (i = start_field_no; i < end_field_no /* prebuilt->n_template */ ; i++) {
2609
2610
		templ = prebuilt->mysql_template + i;
2611
2612
		if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
2613
						      templ->rec_field_no))) {
2614
2615
			/* Copy an externally stored field to the temporary
2616
			heap */
2617
2618
			ut_a(!prebuilt->trx->has_search_latch);
2619
2620
			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
2621
				if (prebuilt->blob_heap == NULL) {
2622
					prebuilt->blob_heap = mem_heap_create(
2623
						UNIV_PAGE_SIZE);
2624
				}
2625
2626
				heap = prebuilt->blob_heap;
2627
			} else {
2628
				extern_field_heap
2629
					= mem_heap_create(UNIV_PAGE_SIZE);
2630
2631
				heap = extern_field_heap;
2632
			}
2633
2634
			/* NOTE: if we are retrieving a big BLOB, we may
2635
			already run out of memory in the next call, which
2636
			causes an assert */
2637
2638
			data = btr_rec_copy_externally_stored_field(
2639
				rec, offsets, templ->rec_field_no,
2640
				&len, heap);
2641
2642
			ut_a(len != UNIV_SQL_NULL);
2643
		} else {
2644
			/* Field is stored in the row. */
2645
2646
			data = rec_get_nth_field(rec, offsets,
2647
						 templ->rec_field_no, &len);
2648
2649
			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
2650
			    && len != UNIV_SQL_NULL) {
2651
2652
				/* It is a BLOB field locally stored in the
2653
				InnoDB record: we MUST copy its contents to
2654
				prebuilt->blob_heap here because later code
2655
				assumes all BLOB values have been copied to a
2656
				safe place. */
2657
2658
				if (prebuilt->blob_heap == NULL) {
2659
					prebuilt->blob_heap = mem_heap_create(
2660
						UNIV_PAGE_SIZE);
2661
				}
2662
2663
				data = memcpy(mem_heap_alloc(
2664
						prebuilt->blob_heap, len),
2665
						data, len);
2666
			}
2667
		}
2668
2669
		if (len != UNIV_SQL_NULL) {
2670
			row_sel_field_store_in_mysql_format(
2671
				mysql_rec + templ->mysql_col_offset,
2672
				templ, data, len);
2673
2674
			/* Cleanup */
2675
			if (extern_field_heap) {
2676
				mem_heap_free(extern_field_heap);
2677
				extern_field_heap = NULL;
2678
			}
2679
2680
			if (templ->mysql_null_bit_mask) {
2681
				/* It is a nullable column with a non-NULL
2682
				value */
2683
				mysql_rec[templ->mysql_null_byte_offset]
2684
					&= ~(byte) templ->mysql_null_bit_mask;
2685
			}
2686
		} else {
2687
			/* MySQL seems to assume the field for an SQL NULL
2688
			value is set to zero or space. Not taking this into
2689
			account caused seg faults with NULL BLOB fields, and
2690
			bug number 154 in the MySQL bug database: GROUP BY
2691
			and DISTINCT could treat NULL values inequal. */
2692
			int	pad_char;
2693
2694
			mysql_rec[templ->mysql_null_byte_offset]
2695
				|= (byte) templ->mysql_null_bit_mask;
2696
			switch (templ->type) {
2697
			case DATA_VARCHAR:
2698
			case DATA_BINARY:
2699
			case DATA_VARMYSQL:
2700
				if (templ->mysql_type
2701
				    == DATA_MYSQL_TRUE_VARCHAR) {
2702
					/* This is a >= 5.0.3 type
2703
					true VARCHAR.  Zero the field. */
2704
					pad_char = 0x00;
2705
					break;
2706
				}
2707
				/* Fall through */
2708
			case DATA_CHAR:
2709
			case DATA_FIXBINARY:
2710
			case DATA_MYSQL:
2711
				/* MySQL pads all string types (except
2712
				BLOB, TEXT and true VARCHAR) with space. */
2713
				if (UNIV_UNLIKELY(templ->mbminlen == 2)) {
2714
					/* Treat UCS2 as a special case. */
2715
					data = mysql_rec
2716
						+ templ->mysql_col_offset;
2717
					len = templ->mysql_col_len;
2718
					/* There are two UCS2 bytes per char,
2719
					so the length has to be even. */
2720
					ut_a(!(len & 1));
2721
					/* Pad with 0x0020. */
2722
					while (len) {
2723
						*data++ = 0x00;
2724
						*data++ = 0x20;
2725
						len -= 2;
2726
					}
2727
					continue;
2728
				}
2729
				pad_char = 0x20;
2730
				break;
2731
			default:
2732
				pad_char = 0x00;
2733
				break;
2734
			}
2735
2736
			ut_ad(!pad_char || templ->mbminlen == 1);
2737
			memset(mysql_rec + templ->mysql_col_offset,
2738
			       pad_char, templ->mysql_col_len);
2739
		}
2740
	}
2741
2742
	return(TRUE);
2743
}
2744
2745
/*************************************************************************
2746
Builds a previous version of a clustered index record for a consistent read */
2747
static
2748
ulint
2749
row_sel_build_prev_vers_for_mysql(
2750
/*==============================*/
2751
					/* out: DB_SUCCESS or error code */
2752
	read_view_t*	read_view,	/* in: read view */
2753
	dict_index_t*	clust_index,	/* in: clustered index */
2754
	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
2755
	rec_t*		rec,		/* in: record in a clustered index */
2756
	ulint**		offsets,	/* in/out: offsets returned by
2757
					rec_get_offsets(rec, clust_index) */
2758
	mem_heap_t**	offset_heap,	/* in/out: memory heap from which
2759
					the offsets are allocated */
2760
	rec_t**		old_vers,	/* out: old version, or NULL if the
2761
					record does not exist in the view:
2762
					i.e., it was freshly inserted
2763
					afterwards */
2764
	mtr_t*		mtr)		/* in: mtr */
2765
{
2766
	ulint	err;
2767
2768
	if (prebuilt->old_vers_heap) {
2769
		mem_heap_empty(prebuilt->old_vers_heap);
2770
	} else {
2771
		prebuilt->old_vers_heap = mem_heap_create(200);
2772
	}
2773
2774
	err = row_vers_build_for_consistent_read(
2775
		rec, mtr, clust_index, offsets, read_view, offset_heap,
2776
		prebuilt->old_vers_heap, old_vers);
2777
	return(err);
2778
}
2779
2780
/*************************************************************************
2781
Retrieves the clustered index record corresponding to a record in a
2782
non-clustered index. Does the necessary locking. Used in the MySQL
2783
interface. */
2784
static
2785
ulint
2786
row_sel_get_clust_rec_for_mysql(
2787
/*============================*/
2788
				/* out: DB_SUCCESS or error code */
2789
	row_prebuilt_t*	prebuilt,/* in: prebuilt struct in the handle */
2790
	dict_index_t*	sec_index,/* in: secondary index where rec resides */
2791
	rec_t*		rec,	/* in: record in a non-clustered index; if
2792
				this is a locking read, then rec is not
2793
				allowed to be delete-marked, and that would
2794
				not make sense either */
2795
	que_thr_t*	thr,	/* in: query thread */
2796
	rec_t**		out_rec,/* out: clustered record or an old version of
2797
				it, NULL if the old version did not exist
2798
				in the read view, i.e., it was a fresh
2799
				inserted version */
2800
	ulint**		offsets,/* out: offsets returned by
2801
				rec_get_offsets(out_rec, clust_index) */
2802
	mem_heap_t**	offset_heap,/* in/out: memory heap from which
2803
				the offsets are allocated */
2804
	mtr_t*		mtr)	/* in: mtr used to get access to the
2805
				non-clustered record; the same mtr is used to
2806
				access the clustered index */
2807
{
2808
	dict_index_t*	clust_index;
2809
	rec_t*		clust_rec;
2810
	rec_t*		old_vers;
2811
	ulint		err;
2812
	trx_t*		trx;
2813
2814
	*out_rec = NULL;
2815
	trx = thr_get_trx(thr);
2816
2817
	row_build_row_ref_in_tuple(prebuilt->clust_ref, sec_index, rec, trx);
2818
2819
	clust_index = dict_table_get_first_index(sec_index->table);
2820
2821
	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
2822
				   PAGE_CUR_LE, BTR_SEARCH_LEAF,
2823
				   prebuilt->clust_pcur, 0, mtr);
2824
2825
	clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
2826
2827
	prebuilt->clust_pcur->trx_if_known = trx;
2828
2829
	/* Note: only if the search ends up on a non-infimum record is the
2830
	low_match value the real match to the search tuple */
2831
2832
	if (!page_rec_is_user_rec(clust_rec)
2833
	    || btr_pcur_get_low_match(prebuilt->clust_pcur)
2834
	    < dict_index_get_n_unique(clust_index)) {
2835
2836
		/* In a rare case it is possible that no clust rec is found
2837
		for a delete-marked secondary index record: if in row0umod.c
2838
		in row_undo_mod_remove_clust_low() we have already removed
2839
		the clust rec, while purge is still cleaning and removing
2840
		secondary index records associated with earlier versions of
2841
		the clustered index record. In that case we know that the
2842
		clustered index record did not exist in the read view of
2843
		trx. */
2844
2845
		if (!rec_get_deleted_flag(rec,
2846
					  dict_table_is_comp(sec_index->table))
2847
		    || prebuilt->select_lock_type != LOCK_NONE) {
2848
			ut_print_timestamp(stderr);
2849
			fputs("  InnoDB: error clustered record"
2850
			      " for sec rec not found\n"
2851
			      "InnoDB: ", stderr);
2852
			dict_index_name_print(stderr, trx, sec_index);
2853
			fputs("\n"
2854
			      "InnoDB: sec index record ", stderr);
2855
			rec_print(stderr, rec, sec_index);
2856
			fputs("\n"
2857
			      "InnoDB: clust index record ", stderr);
2858
			rec_print(stderr, clust_rec, clust_index);
2859
			putc('\n', stderr);
2860
			trx_print(stderr, trx, 600);
2861
2862
			fputs("\n"
2863
			      "InnoDB: Submit a detailed bug report"
2864
			      " to http://bugs.mysql.com\n", stderr);
2865
		}
2866
2867
		clust_rec = NULL;
2868
2869
		goto func_exit;
2870
	}
2871
2872
	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
2873
				   ULINT_UNDEFINED, offset_heap);
2874
2875
	if (prebuilt->select_lock_type != LOCK_NONE) {
2876
		/* Try to place a lock on the index record; we are searching
2877
		the clust rec with a unique condition, hence
2878
		we set a LOCK_REC_NOT_GAP type lock */
2879
2880
		err = lock_clust_rec_read_check_and_lock(
2881
			0, clust_rec, clust_index, *offsets,
2882
			prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
2883
		if (err != DB_SUCCESS) {
2884
2885
			goto err_exit;
2886
		}
2887
	} else {
2888
		/* This is a non-locking consistent read: if necessary, fetch
2889
		a previous version of the record */
2890
2891
		old_vers = NULL;
2892
2893
		/* If the isolation level allows reading of uncommitted data,
2894
		then we never look for an earlier version */
2895
2896
		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
2897
		    && !lock_clust_rec_cons_read_sees(
2898
			    clust_rec, clust_index, *offsets,
2899
			    trx->read_view)) {
2900
2901
			/* The following call returns 'offsets' associated with
2902
			'old_vers' */
2903
			err = row_sel_build_prev_vers_for_mysql(
2904
				trx->read_view, clust_index, prebuilt,
2905
				clust_rec, offsets, offset_heap, &old_vers,
2906
				mtr);
2907
2908
			if (err != DB_SUCCESS) {
2909
2910
				goto err_exit;
2911
			}
2912
2913
			clust_rec = old_vers;
2914
		}
2915
2916
		/* If we had to go to an earlier version of row or the
2917
		secondary index record is delete marked, then it may be that
2918
		the secondary index record corresponding to clust_rec
2919
		(or old_vers) is not rec; in that case we must ignore
2920
		such row because in our snapshot rec would not have existed.
2921
		Remember that from rec we cannot see directly which transaction
2922
		id corresponds to it: we have to go to the clustered index
2923
		record. A query where we want to fetch all rows where
2924
		the secondary index value is in some interval would return
2925
		a wrong result if we would not drop rows which we come to
2926
		visit through secondary index records that would not really
2927
		exist in our snapshot. */
2928
2929
		if (clust_rec && (old_vers || rec_get_deleted_flag(
2930
					  rec,
2931
					  dict_table_is_comp(
2932
						  sec_index->table)))
2933
		    && !row_sel_sec_rec_is_for_clust_rec(
2934
			    rec, sec_index, clust_rec, clust_index)) {
2935
			clust_rec = NULL;
2936
		} else {
2937
#ifdef UNIV_SEARCH_DEBUG
2938
			ut_a(clust_rec == NULL
2939
			     || row_sel_sec_rec_is_for_clust_rec(
2940
				     rec, sec_index, clust_rec, clust_index));
2941
#endif
2942
		}
2943
	}
2944
2945
func_exit:
2946
	*out_rec = clust_rec;
2947
2948
	if (prebuilt->select_lock_type == LOCK_X) {
2949
		/* We may use the cursor in update: store its position */
2950
2951
		btr_pcur_store_position(prebuilt->clust_pcur, mtr);
2952
	}
2953
2954
	err = DB_SUCCESS;
2955
err_exit:
2956
	return(err);
2957
}
2958
2959
/************************************************************************
2960
Restores cursor position after it has been stored. We have to take into
2961
account that the record cursor was positioned on may have been deleted.
2962
Then we may have to move the cursor one step up or down. */
2963
static
2964
ibool
2965
sel_restore_position_for_mysql(
2966
/*===========================*/
2967
					/* out: TRUE if we may need to
2968
					process the record the cursor is
2969
					now positioned on (i.e. we should
2970
					not go to the next record yet) */
2971
	ibool*		same_user_rec,	/* out: TRUE if we were able to restore
2972
					the cursor on a user record with the
2973
					same ordering prefix in in the
2974
					B-tree index */
2975
	ulint		latch_mode,	/* in: latch mode wished in
2976
					restoration */
2977
	btr_pcur_t*	pcur,		/* in: cursor whose position
2978
					has been stored */
2979
	ibool		moves_up,	/* in: TRUE if the cursor moves up
2980
					in the index */
2981
	mtr_t*		mtr)		/* in: mtr; CAUTION: may commit
2982
					mtr temporarily! */
2983
{
2984
	ibool	success;
2985
	ulint	relative_position;
2986
2987
	relative_position = pcur->rel_pos;
2988
2989
	success = btr_pcur_restore_position(latch_mode, pcur, mtr);
2990
2991
	*same_user_rec = success;
2992
2993
	if (relative_position == BTR_PCUR_ON) {
2994
		if (success) {
2995
			return(FALSE);
2996
		}
2997
2998
		if (moves_up) {
2999
			btr_pcur_move_to_next(pcur, mtr);
3000
		}
3001
3002
		return(TRUE);
3003
	}
3004
3005
	if (relative_position == BTR_PCUR_AFTER
3006
	    || relative_position == BTR_PCUR_AFTER_LAST_IN_TREE) {
3007
3008
		if (moves_up) {
3009
			return(TRUE);
3010
		}
3011
3012
		if (btr_pcur_is_on_user_rec(pcur, mtr)) {
3013
			btr_pcur_move_to_prev(pcur, mtr);
3014
		}
3015
3016
		return(TRUE);
3017
	}
3018
3019
	ut_ad(relative_position == BTR_PCUR_BEFORE
3020
	      || relative_position == BTR_PCUR_BEFORE_FIRST_IN_TREE);
3021
3022
	if (moves_up && btr_pcur_is_on_user_rec(pcur, mtr)) {
3023
		btr_pcur_move_to_next(pcur, mtr);
3024
	}
3025
3026
	return(TRUE);
3027
}
3028
3029
/************************************************************************
3030
Pops a cached row for MySQL from the fetch cache. */
3031
UNIV_INLINE
3032
void
3033
row_sel_pop_cached_row_for_mysql(
3034
/*=============================*/
3035
	byte*		buf,		/* in/out: buffer where to copy the
3036
					row */
3037
	row_prebuilt_t*	prebuilt)	/* in: prebuilt struct */
3038
{
3039
	ulint			i;
3040
	mysql_row_templ_t*	templ;
3041
	byte*			cached_rec;
3042
	ut_ad(prebuilt->n_fetch_cached > 0);
3043
	ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
3044
3045
	if (UNIV_UNLIKELY(prebuilt->keep_other_fields_on_keyread)) {
3046
		/* Copy cache record field by field, don't touch fields that
3047
		are not covered by current key */
3048
		cached_rec = prebuilt->fetch_cache[
3049
			prebuilt->fetch_cache_first];
3050
3051
		for (i = 0; i < prebuilt->n_template; i++) {
3052
			templ = prebuilt->mysql_template + i;
3053
			ut_memcpy(buf + templ->mysql_col_offset,
3054
				  cached_rec + templ->mysql_col_offset,
3055
				  templ->mysql_col_len);
3056
			/* Copy NULL bit of the current field from cached_rec
3057
			to buf */
3058
			if (templ->mysql_null_bit_mask) {
3059
				buf[templ->mysql_null_byte_offset]
3060
					^= (buf[templ->mysql_null_byte_offset]
3061
					    ^ cached_rec[templ->mysql_null_byte_offset])
3062
					& (byte)templ->mysql_null_bit_mask;
3063
			}
3064
		}
3065
	}
3066
	else {
3067
		ut_memcpy(buf,
3068
			  prebuilt->fetch_cache[prebuilt->fetch_cache_first],
3069
			  prebuilt->mysql_prefix_len);
3070
	}
3071
	prebuilt->n_fetch_cached--;
3072
	prebuilt->fetch_cache_first++;
3073
3074
	if (prebuilt->n_fetch_cached == 0) {
3075
		prebuilt->fetch_cache_first = 0;
3076
	}
3077
}
3078
3079
/************************************************************************
3080
Pushes a row for MySQL to the fetch cache. */
3081
UNIV_INLINE
3082
void
3083
row_sel_push_cache_row_for_mysql(
3084
/*=============================*/
3085
	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct */
3086
	rec_t*		rec,		/* in: record to push */
3087
	const ulint*	offsets,	/* in: rec_get_offsets() */
3088
        ulint           start_field_no, /* psergey: start from this field */
3089
        byte*           remainder_buf)  /* if above !=0 -> where to take prev fields */
3090
{
3091
	byte*	buf;
3092
	ulint	i;
3093
3094
	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
3095
	ut_ad(rec_offs_validate(rec, NULL, offsets));
3096
	ut_a(!prebuilt->templ_contains_blob);
3097
3098
	if (prebuilt->fetch_cache[0] == NULL) {
3099
		/* Allocate memory for the fetch cache */
3100
3101
		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
3102
3103
			/* A user has reported memory corruption in these
3104
			buffers in Linux. Put magic numbers there to help
3105
			to track a possible bug. */
3106
3107
			buf = mem_alloc(prebuilt->mysql_row_len + 8);
3108
3109
			prebuilt->fetch_cache[i] = buf + 4;
3110
3111
			mach_write_to_4(buf, ROW_PREBUILT_FETCH_MAGIC_N);
3112
			mach_write_to_4(buf + 4 + prebuilt->mysql_row_len,
3113
					ROW_PREBUILT_FETCH_MAGIC_N);
3114
		}
3115
	}
3116
3117
	ut_ad(prebuilt->fetch_cache_first == 0);
3118
3119
	if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(
3120
				  prebuilt->fetch_cache[
3121
					  prebuilt->n_fetch_cached],
3122
				  prebuilt, rec, offsets, start_field_no,
3123
                                  prebuilt->n_template))) {
3124
		ut_error;
3125
	}
3126
        if (start_field_no) {
3127
          for (i=0; i < start_field_no; i++) {
3128
            register ulint offs;
3129
	    mysql_row_templ_t* templ;
3130
            templ = prebuilt->mysql_template + i;
3131
3132
            if (templ->mysql_null_bit_mask) {
3133
              offs= templ->mysql_null_byte_offset;
3134
              *(prebuilt->fetch_cache[prebuilt->n_fetch_cached] + offs) ^= 
3135
                (*(remainder_buf + offs) & templ->mysql_null_bit_mask);
3136
            }
3137
            offs= templ->mysql_col_offset;
3138
            memcpy(prebuilt->fetch_cache[prebuilt->n_fetch_cached] + offs,
3139
                   remainder_buf + offs,
3140
                   templ->mysql_col_len);
3141
          }
3142
        }
3143
3144
	prebuilt->n_fetch_cached++;
3145
}
3146
3147
/*************************************************************************
3148
Tries to do a shortcut to fetch a clustered index record with a unique key,
3149
using the hash index if possible (not always). We assume that the search
3150
mode is PAGE_CUR_GE, it is a consistent read, there is a read view in trx,
3151
btr search latch has been locked in S-mode. */
3152
static
3153
ulint
3154
row_sel_try_search_shortcut_for_mysql(
3155
/*==================================*/
3156
				/* out: SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
3157
	rec_t**		out_rec,/* out: record if found */
3158
	row_prebuilt_t*	prebuilt,/* in: prebuilt struct */
3159
	ulint**		offsets,/* in/out: for rec_get_offsets(*out_rec) */
3160
	mem_heap_t**	heap,	/* in/out: heap for rec_get_offsets() */
3161
	mtr_t*		mtr)	/* in: started mtr */
3162
{
3163
	dict_index_t*	index		= prebuilt->index;
3164
	dtuple_t*	search_tuple	= prebuilt->search_tuple;
3165
	btr_pcur_t*	pcur		= prebuilt->pcur;
3166
	trx_t*		trx		= prebuilt->trx;
3167
	rec_t*		rec;
3168
3169
	ut_ad(index->type & DICT_CLUSTERED);
3170
	ut_ad(!prebuilt->templ_contains_blob);
3171
3172
	btr_pcur_open_with_no_init(index, search_tuple, PAGE_CUR_GE,
3173
				   BTR_SEARCH_LEAF, pcur,
3174
#ifndef UNIV_SEARCH_DEBUG
3175
				   RW_S_LATCH,
3176
#else
3177
				   0,
3178
#endif
3179
				   mtr);
3180
	rec = btr_pcur_get_rec(pcur);
3181
3182
	if (!page_rec_is_user_rec(rec)) {
3183
3184
		return(SEL_RETRY);
3185
	}
3186
3187
	/* As the cursor is now placed on a user record after a search with
3188
	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
3189
	fields in the user record matched to the search tuple */
3190
3191
	if (btr_pcur_get_up_match(pcur) < dtuple_get_n_fields(search_tuple)) {
3192
3193
		return(SEL_EXHAUSTED);
3194
	}
3195
3196
	/* This is a non-locking consistent read: if necessary, fetch
3197
	a previous version of the record */
3198
3199
	*offsets = rec_get_offsets(rec, index, *offsets,
3200
				   ULINT_UNDEFINED, heap);
3201
3202
	if (!lock_clust_rec_cons_read_sees(rec, index,
3203
					   *offsets, trx->read_view)) {
3204
3205
		return(SEL_RETRY);
3206
	}
3207
3208
	if (rec_get_deleted_flag(rec, dict_table_is_comp(index->table))) {
3209
3210
		return(SEL_EXHAUSTED);
3211
	}
3212
3213
	*out_rec = rec;
3214
3215
	return(SEL_FOUND);
3216
}
3217
3218
/************************************************************************
3219
Searches for rows in the database. This is used in the interface to
3220
MySQL. This function opens a cursor, and also implements fetch next
3221
and fetch prev. NOTE that if we do a search with a full key value
3222
from a unique index (ROW_SEL_EXACT), then we will not store the cursor
3223
position and fetch next or fetch prev must not be tried to the cursor! */
3224
3225
ulint
3226
row_search_for_mysql(
3227
/*=================*/
3228
					/* out: DB_SUCCESS,
3229
					DB_RECORD_NOT_FOUND,
3230
					DB_END_OF_INDEX, DB_DEADLOCK,
3231
					DB_LOCK_TABLE_FULL, DB_CORRUPTION,
3232
					or DB_TOO_BIG_RECORD */
3233
	byte*		buf,		/* in/out: buffer for the fetched
3234
					row in the MySQL format */
3235
	ulint		mode,		/* in: search mode PAGE_CUR_L, ... */
3236
	row_prebuilt_t*	prebuilt,	/* in: prebuilt struct for the
3237
					table handle; this contains the info
3238
					of search_tuple, index; if search
3239
					tuple contains 0 fields then we
3240
					position the cursor at the start or
3241
					the end of the index, depending on
3242
					'mode' */
3243
	ulint		match_mode,	/* in: 0 or ROW_SEL_EXACT or
3244
					ROW_SEL_EXACT_PREFIX */
3245
	ulint		direction)	/* in: 0 or ROW_SEL_NEXT or
3246
					ROW_SEL_PREV; NOTE: if this is != 0,
3247
					then prebuilt must have a pcur
3248
					with stored position! In opening of a
3249
					cursor 'direction' should be 0. */
3250
{
3251
	dict_index_t*	index		= prebuilt->index;
3252
	ibool		comp		= dict_table_is_comp(index->table);
3253
	dtuple_t*	search_tuple	= prebuilt->search_tuple;
3254
	btr_pcur_t*	pcur		= prebuilt->pcur;
3255
	trx_t*		trx		= prebuilt->trx;
3256
	dict_index_t*	clust_index;
3257
	que_thr_t*	thr;
3258
	rec_t*		rec;
3259
	rec_t*		result_rec;
3260
	rec_t*		clust_rec;
3261
	ulint		err				= DB_SUCCESS;
3262
	ibool		unique_search			= FALSE;
3263
	ibool		unique_search_from_clust_index	= FALSE;
3264
	ibool		mtr_has_extra_clust_latch	= FALSE;
3265
	ibool		moves_up			= FALSE;
3266
	ibool		set_also_gap_locks		= TRUE;
3267
	/* if the query is a plain locking SELECT, and the isolation level
3268
	is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
3269
	ibool		did_semi_consistent_read	= FALSE;
3270
	/* if the returned record was locked and we did a semi-consistent
3271
	read (fetch the newest committed version), then this is set to
3272
	TRUE */
3273
#ifdef UNIV_SEARCH_DEBUG
3274
	ulint		cnt				= 0;
3275
#endif /* UNIV_SEARCH_DEBUG */
3276
	ulint		next_offs;
3277
	ibool		same_user_rec;
3278
	mtr_t		mtr;
3279
	mem_heap_t*	heap				= NULL;
3280
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
3281
	ulint*		offsets				= offsets_;
3282
        ibool           some_fields_in_buffer;
3283
        ibool           get_clust_rec= 0;
3284
3285
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;
3286
3287
	ut_ad(index && pcur && search_tuple);
3288
	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
3289
3290
	if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
3291
		ut_print_timestamp(stderr);
3292
		fprintf(stderr, "  InnoDB: Error:\n"
3293
			"InnoDB: MySQL is trying to use a table handle"
3294
			" but the .ibd file for\n"
3295
			"InnoDB: table %s does not exist.\n"
3296
			"InnoDB: Have you deleted the .ibd file"
3297
			" from the database directory under\n"
3298
			"InnoDB: the MySQL datadir, or have you used"
3299
			" DISCARD TABLESPACE?\n"
3300
			"InnoDB: Look from\n"
3301
			"InnoDB: http://dev.mysql.com/doc/refman/5.1/en/"
3302
			"innodb-troubleshooting.html\n"
3303
			"InnoDB: how you can resolve the problem.\n",
3304
			prebuilt->table->name);
3305
3306
		return(DB_ERROR);
3307
	}
3308
3309
	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
3310
		fprintf(stderr,
3311
			"InnoDB: Error: trying to free a corrupt\n"
3312
			"InnoDB: table handle. Magic n %lu, table name ",
3313
			(ulong) prebuilt->magic_n);
3314
		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
3315
		putc('\n', stderr);
3316
3317
		mem_analyze_corruption(prebuilt);
3318
3319
		ut_error;
3320
	}
3321
3322
#if 0
3323
	/* August 19, 2005 by Heikki: temporarily disable this error
3324
	print until the cursor lock count is done correctly.
3325
	See bugs #12263 and #12456!*/
3326
3327
	if (trx->n_mysql_tables_in_use == 0
3328
	    && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
3329
		/* Note that if MySQL uses an InnoDB temp table that it
3330
		created inside LOCK TABLES, then n_mysql_tables_in_use can
3331
		be zero; in that case select_lock_type is set to LOCK_X in
3332
		::start_stmt. */
3333
3334
		fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
3335
		      "InnoDB: but it has not locked"
3336
		      " any tables in ::external_lock()!\n",
3337
		      stderr);
3338
		trx_print(stderr, trx, 600);
3339
		fputc('\n', stderr);
3340
	}
3341
#endif
3342
3343
#if 0
3344
	fprintf(stderr, "Match mode %lu\n search tuple ",
3345
		(ulong) match_mode);
3346
	dtuple_print(search_tuple);
3347
	fprintf(stderr, "N tables locked %lu\n",
3348
		(ulong) trx->mysql_n_tables_locked);
3349
#endif
3350
	/*-------------------------------------------------------------*/
3351
	/* PHASE 0: Release a possible s-latch we are holding on the
3352
	adaptive hash index latch if there is someone waiting behind */
3353
3354
	if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED)
3355
	    && trx->has_search_latch) {
3356
3357
		/* There is an x-latch request on the adaptive hash index:
3358
		release the s-latch to reduce starvation and wait for
3359
		BTR_SEA_TIMEOUT rounds before trying to keep it again over
3360
		calls from MySQL */
3361
3362
		rw_lock_s_unlock(&btr_search_latch);
3363
		trx->has_search_latch = FALSE;
3364
3365
		trx->search_latch_timeout = BTR_SEA_TIMEOUT;
3366
	}
3367
3368
	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
3369
	is set or session is using a READ COMMITTED isolation level. Then
3370
	we are able to remove the record locks set here on an individual
3371
	row. */
3372
3373
	if ((srv_locks_unsafe_for_binlog
3374
	     || trx->isolation_level == TRX_ISO_READ_COMMITTED)
3375
	    && prebuilt->select_lock_type != LOCK_NONE) {
3376
3377
		trx_reset_new_rec_lock_info(trx);
3378
	}
3379
3380
	/*-------------------------------------------------------------*/
3381
	/* PHASE 1: Try to pop the row from the prefetch cache */
3382
3383
	if (UNIV_UNLIKELY(direction == 0)) {
3384
		trx->op_info = "starting index read";
3385
3386
		prebuilt->n_rows_fetched = 0;
3387
		prebuilt->n_fetch_cached = 0;
3388
		prebuilt->fetch_cache_first = 0;
3389
3390
		if (prebuilt->sel_graph == NULL) {
3391
			/* Build a dummy select query graph */
3392
			row_prebuild_sel_graph(prebuilt);
3393
		}
3394
	} else {
3395
		trx->op_info = "fetching rows";
3396
3397
		if (prebuilt->n_rows_fetched == 0) {
3398
			prebuilt->fetch_direction = direction;
3399
		}
3400
3401
		if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
3402
			if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
3403
				ut_error;
3404
				/* TODO: scrollable cursor: restore cursor to
3405
				the place of the latest returned row,
3406
				or better: prevent caching for a scroll
3407
				cursor! */
3408
			}
3409
3410
			prebuilt->n_rows_fetched = 0;
3411
			prebuilt->n_fetch_cached = 0;
3412
			prebuilt->fetch_cache_first = 0;
3413
3414
		} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
3415
			row_sel_pop_cached_row_for_mysql(buf, prebuilt);
3416
3417
			prebuilt->n_rows_fetched++;
3418
3419
			srv_n_rows_read++;
3420
			err = DB_SUCCESS;
3421
			goto func_exit;
3422
		}
3423
3424
		if (prebuilt->fetch_cache_first > 0
3425
		    && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
3426
3427
			/* The previous returned row was popped from the fetch
3428
			cache, but the cache was not full at the time of the
3429
			popping: no more rows can exist in the result set */
3430
3431
			err = DB_RECORD_NOT_FOUND;
3432
			goto func_exit;
3433
		}
3434
3435
		prebuilt->n_rows_fetched++;
3436
3437
		if (prebuilt->n_rows_fetched > 1000000000) {
3438
			/* Prevent wrap-over */
3439
			prebuilt->n_rows_fetched = 500000000;
3440
		}
3441
3442
		mode = pcur->search_mode;
3443
	}
3444
3445
	/* In a search where at most one record in the index may match, we
3446
	can use a LOCK_REC_NOT_GAP type record lock when locking a
3447
	non-delete-marked matching record.
3448
3449
	Note that in a unique secondary index there may be different
3450
	delete-marked versions of a record where only the primary key
3451
	values differ: thus in a secondary index we must use next-key
3452
	locks when locking delete-marked records. */
3453
3454
	if (match_mode == ROW_SEL_EXACT
3455
	    && index->type & DICT_UNIQUE
3456
	    && dtuple_get_n_fields(search_tuple)
3457
	    == dict_index_get_n_unique(index)
3458
	    && (index->type & DICT_CLUSTERED
3459
		|| !dtuple_contains_null(search_tuple))) {
3460
3461
		/* Note above that a UNIQUE secondary index can contain many
3462
		rows with the same key value if one of the columns is the SQL
3463
		null. A clustered index under MySQL can never contain null
3464
		columns because we demand that all the columns in primary key
3465
		are non-null. */
3466
3467
		unique_search = TRUE;
3468
3469
		/* Even if the condition is unique, MySQL seems to try to
3470
		retrieve also a second row if a primary key contains more than
3471
		1 column. Return immediately if this is not a HANDLER
3472
		command. */
3473
3474
		if (UNIV_UNLIKELY(direction != 0
3475
				  && !prebuilt->used_in_HANDLER)) {
3476
3477
			err = DB_RECORD_NOT_FOUND;
3478
			goto func_exit;
3479
		}
3480
	}
3481
3482
	mtr_start(&mtr);
3483
3484
	/*-------------------------------------------------------------*/
3485
	/* PHASE 2: Try fast adaptive hash index search if possible */
3486
3487
	/* Next test if this is the special case where we can use the fast
3488
	adaptive hash index to try the search. Since we must release the
3489
	search system latch when we retrieve an externally stored field, we
3490
	cannot use the adaptive hash index in a search in the case the row
3491
	may be long and there may be externally stored fields */
3492
3493
	if (UNIV_UNLIKELY(direction == 0)
3494
	    && unique_search
3495
	    && index->type & DICT_CLUSTERED
3496
	    && !prebuilt->templ_contains_blob
3497
	    && !prebuilt->used_in_HANDLER
3498
	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
3499
3500
		mode = PAGE_CUR_GE;
3501
3502
		unique_search_from_clust_index = TRUE;
3503
3504
		if (trx->mysql_n_tables_locked == 0
3505
		    && prebuilt->select_lock_type == LOCK_NONE
3506
		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
3507
		    && trx->read_view) {
3508
3509
			/* This is a SELECT query done as a consistent read,
3510
			and the read view has already been allocated:
3511
			let us try a search shortcut through the hash
3512
			index.
3513
			NOTE that we must also test that
3514
			mysql_n_tables_locked == 0, because this might
3515
			also be INSERT INTO ... SELECT ... or
3516
			CREATE TABLE ... SELECT ... . Our algorithm is
3517
			NOT prepared to inserts interleaved with the SELECT,
3518
			and if we try that, we can deadlock on the adaptive
3519
			hash index semaphore! */
3520
3521
#ifndef UNIV_SEARCH_DEBUG
3522
			if (!trx->has_search_latch) {
3523
				rw_lock_s_lock(&btr_search_latch);
3524
				trx->has_search_latch = TRUE;
3525
			}
3526
#endif
3527
			switch (row_sel_try_search_shortcut_for_mysql(
3528
					&rec, prebuilt, &offsets, &heap,
3529
					&mtr)) {
3530
			case SEL_FOUND:
3531
#ifdef UNIV_SEARCH_DEBUG
3532
				ut_a(0 == cmp_dtuple_rec(search_tuple,
3533
							 rec, offsets));
3534
#endif
3535
				if (!row_sel_store_mysql_rec(buf, prebuilt,
3536
							     rec, offsets, 0, 
3537
                                                             prebuilt->n_template)) {
3538
					err = DB_TOO_BIG_RECORD;
3539
3540
					/* We let the main loop to do the
3541
					error handling */
3542
					goto shortcut_fails_too_big_rec;
3543
				}
3544
3545
				mtr_commit(&mtr);
3546
3547
				/* ut_print_name(stderr, index->name);
3548
				fputs(" shortcut\n", stderr); */
3549
3550
				srv_n_rows_read++;
3551
3552
				if (trx->search_latch_timeout > 0
3553
				    && trx->has_search_latch) {
3554
3555
					trx->search_latch_timeout--;
3556
3557
					rw_lock_s_unlock(&btr_search_latch);
3558
					trx->has_search_latch = FALSE;
3559
				}
3560
3561
				/* NOTE that we do NOT store the cursor
3562
				position */
3563
				err = DB_SUCCESS;
3564
				goto func_exit;
3565
3566
			case SEL_EXHAUSTED:
3567
				mtr_commit(&mtr);
3568
3569
				/* ut_print_name(stderr, index->name);
3570
				fputs(" record not found 2\n", stderr); */
3571
3572
				if (trx->search_latch_timeout > 0
3573
				    && trx->has_search_latch) {
3574
3575
					trx->search_latch_timeout--;
3576
3577
					rw_lock_s_unlock(&btr_search_latch);
3578
					trx->has_search_latch = FALSE;
3579
				}
3580
3581
				/* NOTE that we do NOT store the cursor
3582
				position */
3583
3584
				err = DB_RECORD_NOT_FOUND;
3585
				goto func_exit;
3586
			}
3587
shortcut_fails_too_big_rec:
3588
			mtr_commit(&mtr);
3589
			mtr_start(&mtr);
3590
		}
3591
	}
3592
3593
	/*-------------------------------------------------------------*/
3594
	/* PHASE 3: Open or restore index cursor position */
3595
3596
	if (trx->has_search_latch) {
3597
		rw_lock_s_unlock(&btr_search_latch);
3598
		trx->has_search_latch = FALSE;
3599
	}
3600
3601
	trx_start_if_not_started(trx);
3602
3603
	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
3604
	    && prebuilt->select_lock_type != LOCK_NONE
3605
	    && trx->mysql_query_str != NULL
3606
	    && *trx->mysql_query_str != NULL
3607
	    && trx->mysql_thd != NULL) {
3608
3609
		/* Scan the MySQL query string; check if SELECT is the first
3610
		word there */
3611
3612
		if (dict_str_starts_with_keyword(
3613
			    trx->mysql_thd, *trx->mysql_query_str, "SELECT")) {
3614
			/* It is a plain locking SELECT and the isolation
3615
			level is low: do not lock gaps */
3616
3617
			set_also_gap_locks = FALSE;
3618
		}
3619
	}
3620
3621
	/* Note that if the search mode was GE or G, then the cursor
3622
	naturally moves upward (in fetch next) in alphabetical order,
3623
	otherwise downward */
3624
3625
	if (UNIV_UNLIKELY(direction == 0)) {
3626
		if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
3627
			moves_up = TRUE;
3628
		}
3629
	} else if (direction == ROW_SEL_NEXT) {
3630
		moves_up = TRUE;
3631
	}
3632
3633
	thr = que_fork_get_first_thr(prebuilt->sel_graph);
3634
3635
	que_thr_move_to_run_state_for_mysql(thr, trx);
3636
3637
	clust_index = dict_table_get_first_index(index->table);
3638
3639
	if (UNIV_LIKELY(direction != 0)) {
3640
		ibool	need_to_process = sel_restore_position_for_mysql(
3641
			&same_user_rec, BTR_SEARCH_LEAF,
3642
			pcur, moves_up, &mtr);
3643
3644
		if (UNIV_UNLIKELY(need_to_process)) {
3645
			if (UNIV_UNLIKELY(prebuilt->row_read_type
3646
					  == ROW_READ_DID_SEMI_CONSISTENT)) {
3647
				/* We did a semi-consistent read,
3648
				but the record was removed in
3649
				the meantime. */
3650
				prebuilt->row_read_type
3651
					= ROW_READ_TRY_SEMI_CONSISTENT;
3652
			}
3653
		} else if (UNIV_LIKELY(prebuilt->row_read_type
3654
				       != ROW_READ_DID_SEMI_CONSISTENT)) {
3655
3656
			/* The cursor was positioned on the record
3657
			that we returned previously.  If we need
3658
			to repeat a semi-consistent read as a
3659
			pessimistic locking read, the record
3660
			cannot be skipped. */
3661
3662
			goto next_rec;
3663
		}
3664
3665
	} else if (dtuple_get_n_fields(search_tuple) > 0) {
3666
3667
		btr_pcur_open_with_no_init(index, search_tuple, mode,
3668
					   BTR_SEARCH_LEAF,
3669
					   pcur, 0, &mtr);
3670
3671
		pcur->trx_if_known = trx;
3672
3673
		rec = btr_pcur_get_rec(pcur);
3674
3675
		if (!moves_up
3676
		    && !page_rec_is_supremum(rec)
3677
		    && set_also_gap_locks
3678
		    && !(srv_locks_unsafe_for_binlog
3679
			 || trx->isolation_level == TRX_ISO_READ_COMMITTED)
3680
		    && prebuilt->select_lock_type != LOCK_NONE) {
3681
3682
			/* Try to place a gap lock on the next index record
3683
			to prevent phantoms in ORDER BY ... DESC queries */
3684
3685
			offsets = rec_get_offsets(page_rec_get_next(rec),
3686
						  index, offsets,
3687
						  ULINT_UNDEFINED, &heap);
3688
			err = sel_set_rec_lock(page_rec_get_next(rec),
3689
					       index, offsets,
3690
					       prebuilt->select_lock_type,
3691
					       LOCK_GAP, thr);
3692
3693
			if (err != DB_SUCCESS) {
3694
3695
				goto lock_wait_or_error;
3696
			}
3697
		}
3698
	} else {
3699
		if (mode == PAGE_CUR_G) {
3700
			btr_pcur_open_at_index_side(
3701
				TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3702
				&mtr);
3703
		} else if (mode == PAGE_CUR_L) {
3704
			btr_pcur_open_at_index_side(
3705
				FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
3706
				&mtr);
3707
		}
3708
	}
3709
3710
	if (!prebuilt->sql_stat_start) {
3711
		/* No need to set an intention lock or assign a read view */
3712
3713
		if (trx->read_view == NULL
3714
		    && prebuilt->select_lock_type == LOCK_NONE) {
3715
3716
			fputs("InnoDB: Error: MySQL is trying to"
3717
			      " perform a consistent read\n"
3718
			      "InnoDB: but the read view is not assigned!\n",
3719
			      stderr);
3720
			trx_print(stderr, trx, 600);
3721
			fputc('\n', stderr);
3722
			ut_a(0);
3723
		}
3724
	} else if (prebuilt->select_lock_type == LOCK_NONE) {
3725
		/* This is a consistent read */
3726
		/* Assign a read view for the query */
3727
3728
		trx_assign_read_view(trx);
3729
		prebuilt->sql_stat_start = FALSE;
3730
	} else {
3731
		ulint	lock_mode;
3732
		if (prebuilt->select_lock_type == LOCK_S) {
3733
			lock_mode = LOCK_IS;
3734
		} else {
3735
			lock_mode = LOCK_IX;
3736
		}
3737
		err = lock_table(0, index->table, lock_mode, thr);
3738
3739
		if (err != DB_SUCCESS) {
3740
3741
			goto lock_wait_or_error;
3742
		}
3743
		prebuilt->sql_stat_start = FALSE;
3744
	}
3745
3746
rec_loop:
3747
	/*-------------------------------------------------------------*/
3748
	/* PHASE 4: Look for matching records in a loop */
3749
3750
	rec = btr_pcur_get_rec(pcur);
3751
	ut_ad(!!page_rec_is_comp(rec) == comp);
3752
#ifdef UNIV_SEARCH_DEBUG
3753
	/*
3754
	fputs("Using ", stderr);
3755
	dict_index_name_print(stderr, index);
3756
	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
3757
	buf_frame_get_page_no(buf_frame_align(rec)));
3758
	rec_print(rec);
3759
	*/
3760
#endif /* UNIV_SEARCH_DEBUG */
3761
3762
	if (page_rec_is_infimum(rec)) {
3763
3764
		/* The infimum record on a page cannot be in the result set,
3765
		and neither can a record lock be placed on it: we skip such
3766
		a record. */
3767
3768
		goto next_rec;
3769
	}
3770
3771
	if (page_rec_is_supremum(rec)) {
3772
3773
		if (set_also_gap_locks
3774
		    && !(srv_locks_unsafe_for_binlog
3775
			 || trx->isolation_level == TRX_ISO_READ_COMMITTED)
3776
		    && prebuilt->select_lock_type != LOCK_NONE) {
3777
3778
			/* Try to place a lock on the index record */
3779
3780
			/* If innodb_locks_unsafe_for_binlog option is used
3781
			or this session is using a READ COMMITTED isolation
3782
			level we do not lock gaps. Supremum record is really
3783
			a gap and therefore we do not set locks there. */
3784
3785
			offsets = rec_get_offsets(rec, index, offsets,
3786
						  ULINT_UNDEFINED, &heap);
3787
			err = sel_set_rec_lock(rec, index, offsets,
3788
					       prebuilt->select_lock_type,
3789
					       LOCK_ORDINARY, thr);
3790
3791
			if (err != DB_SUCCESS) {
3792
3793
				goto lock_wait_or_error;
3794
			}
3795
		}
3796
		/* A page supremum record cannot be in the result set: skip
3797
		it now that we have placed a possible lock on it */
3798
3799
		goto next_rec;
3800
	}
3801
3802
	/*-------------------------------------------------------------*/
3803
	/* Do sanity checks in case our cursor has bumped into page
3804
	corruption */
3805
3806
	if (comp) {
3807
		next_offs = rec_get_next_offs(rec, TRUE);
3808
		if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
3809
3810
			goto wrong_offs;
3811
		}
3812
	} else {
3813
		next_offs = rec_get_next_offs(rec, FALSE);
3814
		if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
3815
3816
			goto wrong_offs;
3817
		}
3818
	}
3819
3820
	if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
3821
3822
wrong_offs:
3823
		if (srv_force_recovery == 0 || moves_up == FALSE) {
3824
			ut_print_timestamp(stderr);
3825
			buf_page_print(buf_frame_align(rec));
3826
			fprintf(stderr,
3827
				"\nInnoDB: rec address %p, first"
3828
				" buffer frame %p\n"
3829
				"InnoDB: buffer pool high end %p,"
3830
				" buf block fix count %lu\n",
3831
				(void*) rec, (void*) buf_pool->frame_zero,
3832
				(void*) buf_pool->high_end,
3833
				(ulong)buf_block_align(rec)->buf_fix_count);
3834
			fprintf(stderr,
3835
				"InnoDB: Index corruption: rec offs %lu"
3836
				" next offs %lu, page no %lu,\n"
3837
				"InnoDB: ",
3838
				(ulong) page_offset(rec),
3839
				(ulong) next_offs,
3840
				(ulong) buf_frame_get_page_no(rec));
3841
			dict_index_name_print(stderr, trx, index);
3842
			fputs(". Run CHECK TABLE. You may need to\n"
3843
			      "InnoDB: restore from a backup, or"
3844
			      " dump + drop + reimport the table.\n",
3845
			      stderr);
3846
3847
			err = DB_CORRUPTION;
3848
3849
			goto lock_wait_or_error;
3850
		} else {
3851
			/* The user may be dumping a corrupt table. Jump
3852
			over the corruption to recover as much as possible. */
3853
3854
			fprintf(stderr,
3855
				"InnoDB: Index corruption: rec offs %lu"
3856
				" next offs %lu, page no %lu,\n"
3857
				"InnoDB: ",
3858
				(ulong) page_offset(rec),
3859
				(ulong) next_offs,
3860
				(ulong) buf_frame_get_page_no(rec));
3861
			dict_index_name_print(stderr, trx, index);
3862
			fputs(". We try to skip the rest of the page.\n",
3863
			      stderr);
3864
3865
			btr_pcur_move_to_last_on_page(pcur, &mtr);
3866
3867
			goto next_rec;
3868
		}
3869
	}
3870
	/*-------------------------------------------------------------*/
3871
3872
	/* Calculate the 'offsets' associated with 'rec' */
3873
3874
	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
3875
3876
	if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
3877
		if (!rec_validate(rec, offsets)
3878
		    || !btr_index_rec_validate(rec, index, FALSE)) {
3879
			fprintf(stderr,
3880
				"InnoDB: Index corruption: rec offs %lu"
3881
				" next offs %lu, page no %lu,\n"
3882
				"InnoDB: ",
3883
				(ulong) page_offset(rec),
3884
				(ulong) next_offs,
3885
				(ulong) buf_frame_get_page_no(rec));
3886
			dict_index_name_print(stderr, trx, index);
3887
			fputs(". We try to skip the record.\n",
3888
			      stderr);
3889
3890
			goto next_rec;
3891
		}
3892
	}
3893
3894
	/* Note that we cannot trust the up_match value in the cursor at this
3895
	place because we can arrive here after moving the cursor! Thus
3896
	we have to recompare rec and search_tuple to determine if they
3897
	match enough. */
3898
3899
	if (match_mode == ROW_SEL_EXACT) {
3900
		/* Test if the index record matches completely to search_tuple
3901
		in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
3902
3903
		/* fputs("Comparing rec and search tuple\n", stderr); */
3904
3905
		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
3906
3907
			if (set_also_gap_locks
3908
			    && !(srv_locks_unsafe_for_binlog
3909
				 || trx->isolation_level
3910
				 == TRX_ISO_READ_COMMITTED)
3911
			    && prebuilt->select_lock_type != LOCK_NONE) {
3912
3913
				/* Try to place a gap lock on the index
3914
				record only if innodb_locks_unsafe_for_binlog
3915
				option is not set or this session is not
3916
				using a READ COMMITTED isolation level. */
3917
3918
				err = sel_set_rec_lock(
3919
					rec, index, offsets,
3920
					prebuilt->select_lock_type, LOCK_GAP,
3921
					thr);
3922
3923
				if (err != DB_SUCCESS) {
3924
3925
					goto lock_wait_or_error;
3926
				}
3927
			}
3928
3929
			btr_pcur_store_position(pcur, &mtr);
3930
3931
			err = DB_RECORD_NOT_FOUND;
3932
			/* ut_print_name(stderr, index->name);
3933
			fputs(" record not found 3\n", stderr); */
3934
3935
			goto normal_return;
3936
		}
3937
3938
	} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
3939
3940
		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
3941
3942
			if (set_also_gap_locks
3943
			    && !(srv_locks_unsafe_for_binlog
3944
				 || trx->isolation_level
3945
				 == TRX_ISO_READ_COMMITTED)
3946
			    && prebuilt->select_lock_type != LOCK_NONE) {
3947
3948
				/* Try to place a gap lock on the index
3949
				record only if innodb_locks_unsafe_for_binlog
3950
				option is not set or this session is not
3951
				using a READ COMMITTED isolation level. */
3952
3953
				err = sel_set_rec_lock(
3954
					rec, index, offsets,
3955
					prebuilt->select_lock_type, LOCK_GAP,
3956
					thr);
3957
3958
				if (err != DB_SUCCESS) {
3959
3960
					goto lock_wait_or_error;
3961
				}
3962
			}
3963
3964
			btr_pcur_store_position(pcur, &mtr);
3965
3966
			err = DB_RECORD_NOT_FOUND;
3967
			/* ut_print_name(stderr, index->name);
3968
			fputs(" record not found 4\n", stderr); */
3969
3970
			goto normal_return;
3971
		}
3972
	}
3973
3974
	/* We are ready to look at a possible new index entry in the result
3975
	set: the cursor is now placed on a user record */
3976
3977
	if (prebuilt->select_lock_type != LOCK_NONE) {
3978
		/* Try to place a lock on the index record; note that delete
3979
		marked records are a special case in a unique search. If there
3980
		is a non-delete marked record, then it is enough to lock its
3981
		existence with LOCK_REC_NOT_GAP. */
3982
3983
		/* If innodb_locks_unsafe_for_binlog option is used
3984
		or this session is using a READ COMMITTED isolation
3985
		level we lock only the record, i.e., next-key locking is
3986
		not used. */
3987
3988
		ulint	lock_type;
3989
3990
		if (!set_also_gap_locks
3991
		    || srv_locks_unsafe_for_binlog
3992
		    || trx->isolation_level == TRX_ISO_READ_COMMITTED
3993
		    || (unique_search
3994
			&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
3995
3996
			goto no_gap_lock;
3997
		} else {
3998
			lock_type = LOCK_ORDINARY;
3999
		}
4000
4001
		/* If we are doing a 'greater or equal than a primary key
4002
		value' search from a clustered index, and we find a record
4003
		that has that exact primary key value, then there is no need
4004
		to lock the gap before the record, because no insert in the
4005
		gap can be in our search range. That is, no phantom row can
4006
		appear that way.
4007
4008
		An example: if col1 is the primary key, the search is WHERE
4009
		col1 >= 100, and we find a record where col1 = 100, then no
4010
		need to lock the gap before that record. */
4011
4012
		if (index == clust_index
4013
		    && mode == PAGE_CUR_GE
4014
		    && direction == 0
4015
		    && dtuple_get_n_fields_cmp(search_tuple)
4016
		    == dict_index_get_n_unique(index)
4017
		    && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
4018
no_gap_lock:
4019
			lock_type = LOCK_REC_NOT_GAP;
4020
		}
4021
4022
		err = sel_set_rec_lock(rec, index, offsets,
4023
				       prebuilt->select_lock_type,
4024
				       lock_type, thr);
4025
4026
		switch (err) {
4027
			rec_t*	old_vers;
4028
		case DB_SUCCESS:
4029
			break;
4030
		case DB_LOCK_WAIT:
4031
			if (UNIV_LIKELY(prebuilt->row_read_type
4032
					!= ROW_READ_TRY_SEMI_CONSISTENT)
4033
			    || index != clust_index) {
4034
4035
				goto lock_wait_or_error;
4036
			}
4037
4038
			/* The following call returns 'offsets'
4039
			associated with 'old_vers' */
4040
			err = row_sel_build_committed_vers_for_mysql(
4041
				clust_index, prebuilt, rec,
4042
				&offsets, &heap, &old_vers, &mtr);
4043
4044
			if (err != DB_SUCCESS) {
4045
4046
				goto lock_wait_or_error;
4047
			}
4048
4049
			mutex_enter(&kernel_mutex);
4050
			if (trx->was_chosen_as_deadlock_victim) {
4051
				mutex_exit(&kernel_mutex);
4052
				err = DB_DEADLOCK;
4053
4054
				goto lock_wait_or_error;
4055
			}
4056
			if (UNIV_LIKELY(trx->wait_lock != NULL)) {
4057
				lock_cancel_waiting_and_release(
4058
					trx->wait_lock);
4059
				trx_reset_new_rec_lock_info(trx);
4060
			} else {
4061
				mutex_exit(&kernel_mutex);
4062
4063
				/* The lock was granted while we were
4064
				searching for the last committed version.
4065
				Do a normal locking read. */
4066
4067
				offsets = rec_get_offsets(rec, index, offsets,
4068
							  ULINT_UNDEFINED,
4069
							  &heap);
4070
				err = DB_SUCCESS;
4071
				break;
4072
			}
4073
			mutex_exit(&kernel_mutex);
4074
4075
			if (old_vers == NULL) {
4076
				/* The row was not yet committed */
4077
4078
				goto next_rec;
4079
			}
4080
4081
			did_semi_consistent_read = TRUE;
4082
			rec = old_vers;
4083
			break;
4084
		default:
4085
4086
			goto lock_wait_or_error;
4087
		}
4088
	} else {
4089
		/* This is a non-locking consistent read: if necessary, fetch
4090
		a previous version of the record */
4091
4092
		if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
4093
4094
			/* Do nothing: we let a non-locking SELECT read the
4095
			latest version of the record */
4096
4097
		} else if (index == clust_index) {
4098
4099
			/* Fetch a previous version of the row if the current
4100
			one is not visible in the snapshot; if we have a very
4101
			high force recovery level set, we try to avoid crashes
4102
			by skipping this lookup */
4103
4104
			if (UNIV_LIKELY(srv_force_recovery < 5)
4105
			    && !lock_clust_rec_cons_read_sees(
4106
				    rec, index, offsets, trx->read_view)) {
4107
4108
				rec_t*	old_vers;
4109
				/* The following call returns 'offsets'
4110
				associated with 'old_vers' */
4111
				err = row_sel_build_prev_vers_for_mysql(
4112
					trx->read_view, clust_index,
4113
					prebuilt, rec, &offsets, &heap,
4114
					&old_vers, &mtr);
4115
4116
				if (err != DB_SUCCESS) {
4117
4118
					goto lock_wait_or_error;
4119
				}
4120
4121
				if (old_vers == NULL) {
4122
					/* The row did not exist yet in
4123
					the read view */
4124
4125
					goto next_rec;
4126
				}
4127
4128
				rec = old_vers;
4129
			}
4130
		} else if (!lock_sec_rec_cons_read_sees(rec, index,
4131
							trx->read_view)) {
4132
			/* We are looking into a non-clustered index,
4133
			and to get the right version of the record we
4134
			have to look also into the clustered index: this
4135
			is necessary, because we can only get the undo
4136
			information via the clustered index record. */
4137
4138
			ut_ad(index != clust_index);
4139
                        get_clust_rec= TRUE;
4140
			goto idx_cond_check;
4141
		}
4142
	}
4143
4144
	/* NOTE that at this point rec can be an old version of a clustered
4145
	index record built for a consistent read. We cannot assume after this
4146
	point that rec is on a buffer pool page. Functions like
4147
	page_rec_is_comp() cannot be used! */
4148
4149
	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
4150
4151
		/* The record is delete-marked: we can skip it */
4152
4153
		if ((srv_locks_unsafe_for_binlog
4154
		     || trx->isolation_level == TRX_ISO_READ_COMMITTED)
4155
		    && prebuilt->select_lock_type != LOCK_NONE
4156
		    && !did_semi_consistent_read) {
4157
4158
			/* No need to keep a lock on a delete-marked record
4159
			if we do not want to use next-key locking. */
4160
4161
			row_unlock_for_mysql(prebuilt, TRUE);
4162
		}
4163
4164
		/* This is an optimization to skip setting the next key lock
4165
		on the record that follows this delete-marked record. This
4166
		optimization works because of the unique search criteria
4167
		which precludes the presence of a range lock between this
4168
		delete marked record and the record following it.
4169
4170
		For now this is applicable only to clustered indexes while
4171
		doing a unique search. There is scope for further optimization
4172
		applicable to unique secondary indexes. Current behaviour is
4173
		to widen the scope of a lock on an already delete marked record
4174
		if the same record is deleted twice by the same transaction */
4175
		if (index == clust_index && unique_search) {
4176
			err = DB_RECORD_NOT_FOUND;
4177
4178
			goto normal_return;
4179
		}
4180
4181
		goto next_rec;
4182
	}
4183
4184
4185
idx_cond_check:
4186
        if (prebuilt->idx_cond_func)
4187
        {
4188
          int res;
4189
          ut_ad(prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE);
4190
          offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4191
          row_sel_store_mysql_rec(buf, prebuilt, rec,
4192
                                  offsets, 0, prebuilt->n_index_fields);
4193
          res= prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
4194
          if (res == 0)
4195
            goto next_rec;
4196
          if (res == 2)
4197
          {
4198
            err = DB_RECORD_NOT_FOUND;
4199
            goto idx_cond_failed;
4200
          }
4201
        }
4202
4203
	/* Get the clustered index record if needed, if we did not do the
4204
	search using the clustered index. */
4205
	if (get_clust_rec || (index != clust_index &&
4206
            prebuilt->need_to_access_clustered)) {
4207
4208
		/* We use a 'goto' to the preceding label if a consistent
4209
		read of a secondary index record requires us to look up old
4210
		versions of the associated clustered index record. */
4211
4212
		ut_ad(rec_offs_validate(rec, index, offsets));
4213
                
4214
		/* It was a non-clustered index and we must fetch also the
4215
		clustered index record */
4216
4217
		mtr_has_extra_clust_latch = TRUE;
4218
4219
		/* The following call returns 'offsets' associated with
4220
		'clust_rec'. Note that 'clust_rec' can be an old version
4221
		built for a consistent read. */
4222
4223
		err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
4224
						      thr, &clust_rec,
4225
						      &offsets, &heap, &mtr);
4226
		if (err != DB_SUCCESS) {
4227
4228
			goto lock_wait_or_error;
4229
		}
4230
4231
		if (clust_rec == NULL) {
4232
			/* The record did not exist in the read view */
4233
			ut_ad(prebuilt->select_lock_type == LOCK_NONE);
4234
4235
			goto next_rec;
4236
		}
4237
4238
		if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
4239
4240
			/* The record is delete marked: we can skip it */
4241
4242
			if ((srv_locks_unsafe_for_binlog
4243
			     || trx->isolation_level == TRX_ISO_READ_COMMITTED)
4244
			    && prebuilt->select_lock_type != LOCK_NONE) {
4245
4246
				/* No need to keep a lock on a delete-marked
4247
				record if we do not want to use next-key
4248
				locking. */
4249
4250
				row_unlock_for_mysql(prebuilt, TRUE);
4251
			}
4252
4253
			goto next_rec;
4254
		}
4255
4256
		if (prebuilt->need_to_access_clustered) {
4257
4258
			result_rec = clust_rec;
4259
4260
			ut_ad(rec_offs_validate(result_rec, clust_index,
4261
						offsets));
4262
		} else {
4263
			/* We used 'offsets' for the clust rec, recalculate
4264
			them for 'rec' */
4265
			offsets = rec_get_offsets(rec, index, offsets,
4266
						  ULINT_UNDEFINED, &heap);
4267
			result_rec = rec;
4268
		}
4269
	} else {
4270
		result_rec = rec;
4271
	}
4272
4273
	/* We found a qualifying record 'result_rec'. At this point,
4274
	'offsets' are associated with 'result_rec'. */
4275
4276
	ut_ad(rec_offs_validate(result_rec,
4277
				result_rec != rec ? clust_index : index,
4278
				offsets));
4279
4280
	if ((match_mode == ROW_SEL_EXACT
4281
	     || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
4282
	    && prebuilt->select_lock_type == LOCK_NONE
4283
	    && !prebuilt->templ_contains_blob
4284
	    && !prebuilt->clust_index_was_generated
4285
	    && !prebuilt->used_in_HANDLER
4286
	    && prebuilt->template_type
4287
	    != ROW_MYSQL_DUMMY_TEMPLATE) {
4288
4289
		/* Inside an update, for example, we do not cache rows,
4290
		since we may use the cursor position to do the actual
4291
		update, that is why we require ...lock_type == LOCK_NONE.
4292
		Since we keep space in prebuilt only for the BLOBs of
4293
		a single row, we cannot cache rows in the case there
4294
		are BLOBs in the fields to be fetched. In HANDLER we do
4295
		not cache rows because there the cursor is a scrollable
4296
		cursor. */
4297
                some_fields_in_buffer= (index != clust_index &&
4298
                                        prebuilt->idx_cond_func);
4299
4300
		row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
4301
						 offsets, 
4302
                                                 some_fields_in_buffer? 
4303
                                                 prebuilt->n_index_fields: 0,
4304
                                                 buf);
4305
		if (prebuilt->n_fetch_cached == MYSQL_FETCH_CACHE_SIZE) {
4306
4307
			goto got_row;
4308
		}
4309
4310
		goto next_rec;
4311
	} else {
4312
		if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
4313
			memcpy(buf + 4, result_rec
4314
			       - rec_offs_extra_size(offsets),
4315
			       rec_offs_size(offsets));
4316
			mach_write_to_4(buf,
4317
					rec_offs_extra_size(offsets) + 4);
4318
		} else {
4319
			if (!row_sel_store_mysql_rec(buf, prebuilt,
4320
						     result_rec, offsets,
4321
                                                     prebuilt->idx_cond_func? 
4322
                                                     prebuilt->n_index_fields: 0,
4323
                                                     prebuilt->n_template)) {
4324
				err = DB_TOO_BIG_RECORD;
4325
4326
				goto lock_wait_or_error;
4327
			}
4328
		}
4329
4330
		if (prebuilt->clust_index_was_generated) {
4331
			if (result_rec != rec) {
4332
				offsets = rec_get_offsets(
4333
					rec, index, offsets, ULINT_UNDEFINED,
4334
					&heap);
4335
			}
4336
			row_sel_store_row_id_to_prebuilt(prebuilt, rec,
4337
							 index, offsets);
4338
		}
4339
	}
4340
4341
	/* From this point on, 'offsets' are invalid. */
4342
4343
got_row:
4344
	/* We have an optimization to save CPU time: if this is a consistent
4345
	read on a unique condition on the clustered index, then we do not
4346
	store the pcur position, because any fetch next or prev will anyway
4347
	return 'end of file'. Exceptions are locking reads and the MySQL
4348
	HANDLER command where the user can move the cursor with PREV or NEXT
4349
	even after a unique search. */
4350
4351
	err = DB_SUCCESS;
4352
4353
idx_cond_failed:
4354
	if (!unique_search_from_clust_index
4355
	    || prebuilt->select_lock_type != LOCK_NONE
4356
	    || prebuilt->used_in_HANDLER) {
4357
4358
		/* Inside an update always store the cursor position */
4359
4360
		btr_pcur_store_position(pcur, &mtr);
4361
	}
4362
4363
	goto normal_return;
4364
4365
next_rec:
4366
	/* Reset the old and new "did semi-consistent read" flags. */
4367
        get_clust_rec= FALSE;
4368
	if (UNIV_UNLIKELY(prebuilt->row_read_type
4369
			  == ROW_READ_DID_SEMI_CONSISTENT)) {
4370
		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4371
	}
4372
	did_semi_consistent_read = FALSE;
4373
4374
	if (UNIV_UNLIKELY(srv_locks_unsafe_for_binlog
4375
			  || trx->isolation_level == TRX_ISO_READ_COMMITTED)
4376
	    && prebuilt->select_lock_type != LOCK_NONE) {
4377
4378
		trx_reset_new_rec_lock_info(trx);
4379
	}
4380
4381
	/*-------------------------------------------------------------*/
4382
	/* PHASE 5: Move the cursor to the next index record */
4383
4384
	if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
4385
		/* We must commit mtr if we are moving to the next
4386
		non-clustered index record, because we could break the
4387
		latching order if we would access a different clustered
4388
		index page right away without releasing the previous. */
4389
4390
		btr_pcur_store_position(pcur, &mtr);
4391
4392
		mtr_commit(&mtr);
4393
		mtr_has_extra_clust_latch = FALSE;
4394
4395
		mtr_start(&mtr);
4396
		if (sel_restore_position_for_mysql(&same_user_rec,
4397
						   BTR_SEARCH_LEAF,
4398
						   pcur, moves_up, &mtr)) {
4399
#ifdef UNIV_SEARCH_DEBUG
4400
			cnt++;
4401
#endif /* UNIV_SEARCH_DEBUG */
4402
4403
			goto rec_loop;
4404
		}
4405
	}
4406
4407
	if (moves_up) {
4408
		if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
4409
not_moved:
4410
			btr_pcur_store_position(pcur, &mtr);
4411
4412
			if (match_mode != 0) {
4413
				err = DB_RECORD_NOT_FOUND;
4414
			} else {
4415
				err = DB_END_OF_INDEX;
4416
			}
4417
4418
			goto normal_return;
4419
		}
4420
	} else {
4421
		if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
4422
			goto not_moved;
4423
		}
4424
	}
4425
4426
#ifdef UNIV_SEARCH_DEBUG
4427
	cnt++;
4428
#endif /* UNIV_SEARCH_DEBUG */
4429
4430
	goto rec_loop;
4431
4432
lock_wait_or_error:
4433
	/* Reset the old and new "did semi-consistent read" flags. */
4434
	if (UNIV_UNLIKELY(prebuilt->row_read_type
4435
			  == ROW_READ_DID_SEMI_CONSISTENT)) {
4436
		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4437
	}
4438
	did_semi_consistent_read = FALSE;
4439
4440
	/*-------------------------------------------------------------*/
4441
4442
	btr_pcur_store_position(pcur, &mtr);
4443
4444
	mtr_commit(&mtr);
4445
	mtr_has_extra_clust_latch = FALSE;
4446
4447
	trx->error_state = err;
4448
4449
	/* The following is a patch for MySQL */
4450
4451
	que_thr_stop_for_mysql(thr);
4452
4453
	thr->lock_state = QUE_THR_LOCK_ROW;
4454
4455
	if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
4456
		/* It was a lock wait, and it ended */
4457
4458
		thr->lock_state = QUE_THR_LOCK_NOLOCK;
4459
		mtr_start(&mtr);
4460
4461
		sel_restore_position_for_mysql(&same_user_rec,
4462
					       BTR_SEARCH_LEAF, pcur,
4463
					       moves_up, &mtr);
4464
4465
		if ((srv_locks_unsafe_for_binlog
4466
		     || trx->isolation_level == TRX_ISO_READ_COMMITTED)
4467
		    && !same_user_rec) {
4468
4469
			/* Since we were not able to restore the cursor
4470
			on the same user record, we cannot use
4471
			row_unlock_for_mysql() to unlock any records, and
4472
			we must thus reset the new rec lock info. Since
4473
			in lock0lock.c we have blocked the inheriting of gap
4474
			X-locks, we actually do not have any new record locks
4475
			set in this case.
4476
4477
			Note that if we were able to restore on the 'same'
4478
			user record, it is still possible that we were actually
4479
			waiting on a delete-marked record, and meanwhile
4480
			it was removed by purge and inserted again by some
4481
			other user. But that is no problem, because in
4482
			rec_loop we will again try to set a lock, and
4483
			new_rec_lock_info in trx will be right at the end. */
4484
4485
			trx_reset_new_rec_lock_info(trx);
4486
		}
4487
4488
		mode = pcur->search_mode;
4489
4490
		goto rec_loop;
4491
	}
4492
4493
	thr->lock_state = QUE_THR_LOCK_NOLOCK;
4494
4495
#ifdef UNIV_SEARCH_DEBUG
4496
	/*	fputs("Using ", stderr);
4497
	dict_index_name_print(stderr, index);
4498
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4499
#endif /* UNIV_SEARCH_DEBUG */
4500
	goto func_exit;
4501
4502
normal_return:
4503
	/*-------------------------------------------------------------*/
4504
	que_thr_stop_for_mysql_no_error(thr, trx);
4505
4506
	mtr_commit(&mtr);
4507
4508
	if (prebuilt->n_fetch_cached > 0) {
4509
		row_sel_pop_cached_row_for_mysql(buf, prebuilt);
4510
4511
		err = DB_SUCCESS;
4512
	}
4513
4514
#ifdef UNIV_SEARCH_DEBUG
4515
	/*	fputs("Using ", stderr);
4516
	dict_index_name_print(stderr, index);
4517
	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
4518
#endif /* UNIV_SEARCH_DEBUG */
4519
	if (err == DB_SUCCESS) {
4520
		srv_n_rows_read++;
4521
	}
4522
4523
func_exit:
4524
	trx->op_info = "";
4525
	if (UNIV_LIKELY_NULL(heap)) {
4526
		mem_heap_free(heap);
4527
	}
4528
4529
	/* Set or reset the "did semi-consistent read" flag on return.
4530
	The flag did_semi_consistent_read is set if and only if
4531
	the record being returned was fetched with a semi-consistent read. */
4532
	ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
4533
	      || !did_semi_consistent_read);
4534
4535
	if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
4536
		if (UNIV_UNLIKELY(did_semi_consistent_read)) {
4537
			prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
4538
		} else {
4539
			prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
4540
		}
4541
	}
4542
	return(err);
4543
}
4544
4545
/***********************************************************************
4546
Checks if MySQL at the moment is allowed for this table to retrieve a
4547
consistent read result, or store it to the query cache. */
4548
4549
ibool
4550
row_search_check_if_query_cache_permitted(
4551
/*======================================*/
4552
					/* out: TRUE if storing or retrieving
4553
					from the query cache is permitted */
4554
	trx_t*		trx,		/* in: transaction object */
4555
	const char*	norm_name)	/* in: concatenation of database name,
4556
					'/' char, table name */
4557
{
4558
	dict_table_t*	table;
4559
	ibool		ret	= FALSE;
4560
4561
	table = dict_table_get(norm_name, FALSE);
4562
4563
	if (table == NULL) {
4564
4565
		return(FALSE);
4566
	}
4567
4568
	mutex_enter(&kernel_mutex);
4569
4570
	/* Start the transaction if it is not started yet */
4571
4572
	trx_start_if_not_started_low(trx);
4573
4574
	/* If there are locks on the table or some trx has invalidated the
4575
	cache up to our trx id, then ret = FALSE.
4576
	We do not check what type locks there are on the table, though only
4577
	IX type locks actually would require ret = FALSE. */
4578
4579
	if (UT_LIST_GET_LEN(table->locks) == 0
4580
	    && ut_dulint_cmp(trx->id,
4581
			     table->query_cache_inv_trx_id) >= 0) {
4582
4583
		ret = TRUE;
4584
4585
		/* If the isolation level is high, assign a read view for the
4586
		transaction if it does not yet have one */
4587
4588
		if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
4589
		    && !trx->read_view) {
4590
4591
			trx->read_view = read_view_open_now(
4592
				trx->id, trx->global_read_view_heap);
4593
			trx->global_read_view = trx->read_view;
4594
		}
4595
	}
4596
4597
	mutex_exit(&kernel_mutex);
4598
4599
	return(ret);
4600
}
4601
4602
/***********************************************************************
4603
Read the AUTOINC column from the current row. If the value is less than
4604
0 and the type is not unsigned then we reset the value to 0. */
4605
static
4606
ib_longlong
4607
row_search_autoinc_read_column(
4608
/*===========================*/
4609
					/* out: value read from the column */
4610
	dict_index_t*	index,		/* in: index to read from */
4611
	const rec_t*	rec,		/* in: current rec */
4612
	ulint		col_no,		/* in: column number */
4613
	ibool		unsigned_type)	/* in: signed or unsigned flag */
4614
{
4615
	ulint		len;
4616
	const byte*	data;
4617
	ib_longlong	value;
4618
	mem_heap_t*	heap = NULL;
4619
	/* Our requirement is that dest should be word aligned. */
4620
	byte		dest[sizeof(value)];
4621
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4622
	ulint*		offsets	= offsets_;
4623
4624
	*offsets_ = sizeof offsets_ / sizeof *offsets_;
4625
4626
	/* TODO: We have to cast away the const of rec for now.  This needs
4627
	to be fixed later.*/
4628
	offsets = rec_get_offsets(
4629
		(rec_t*) rec, index, offsets, ULINT_UNDEFINED, &heap);
4630
4631
	/* TODO: We have to cast away the const of rec for now.  This needs
4632
	to be fixed later.*/
4633
	data = rec_get_nth_field((rec_t*)rec, offsets, col_no, &len);
4634
4635
	ut_a(len != UNIV_SQL_NULL);
4636
	ut_a(len <= sizeof value);
4637
4638
	mach_read_int_type(dest, data, len, unsigned_type);
4639
4640
	/* The assumption here is that the AUTOINC value can't be negative
4641
	and that dest is word aligned. */
4642
	switch (len) {
4643
	case 8:
4644
		value = *(ib_longlong*) dest;
4645
		break;
4646
4647
	case 4:
4648
		value = *(ib_uint32_t*) dest;
4649
		break;
4650
4651
	case 3:
4652
		value = *(ib_uint32_t*) dest;
4653
		value &= 0xFFFFFF;
4654
		break;
4655
4656
	case 2:
4657
		value = *(uint16 *) dest;
4658
		break;
4659
4660
	case 1:
4661
		value = *dest;
4662
		break;
4663
4664
	default:
4665
		ut_error;
4666
	}
4667
4668
	if (UNIV_LIKELY_NULL(heap)) {
4669
		mem_heap_free(heap);
4670
	}
4671
4672
	if (!unsigned_type && value < 0) {
4673
		value = 0;
4674
	}
4675
4676
	return(value);
4677
}
4678
4679
/***********************************************************************
4680
Get the last row. */
4681
static
4682
const rec_t*
4683
row_search_autoinc_get_rec(
4684
/*=======================*/
4685
					/* out: current rec or NULL */
4686
	btr_pcur_t*	pcur,		/* in: the current cursor */
4687
	mtr_t*		mtr)		/* in: mini transaction */
4688
{
4689
	do {
4690
		const rec_t* rec = btr_pcur_get_rec(pcur);
4691
4692
		if (page_rec_is_user_rec(rec)) {
4693
			return(rec);
4694
		}
4695
	} while (btr_pcur_move_to_prev(pcur, mtr));
4696
4697
	return(NULL);
4698
}
4699
4700
/***********************************************************************
4701
Read the max AUTOINC value from an index. */
4702
4703
ulint
4704
row_search_max_autoinc(
4705
/*===================*/
4706
					/* out: DB_SUCCESS if all OK else
4707
					error code, DB_RECORD_NOT_FOUND if
4708
					column name can't be found in index */
4709
	dict_index_t*	index,		/* in: index to search */
4710
	const char*	col_name,	/* in: name of autoinc column */
4711
	ib_longlong*	value)		/* out: AUTOINC value read */
4712
{
4713
	ulint		i;
4714
	ulint		n_cols;
4715
	dict_field_t*	dfield = NULL;
4716
	ulint		error = DB_SUCCESS;
4717
4718
	n_cols = dict_index_get_n_ordering_defined_by_user(index);
4719
4720
	/* Search the index for the AUTOINC column name */
4721
	for (i = 0; i < n_cols; ++i) {
4722
		dfield = dict_index_get_nth_field(index, i);
4723
4724
		if (strcmp(col_name, dfield->name) == 0) {
4725
			break;
4726
		}
4727
	}
4728
4729
	*value = 0;
4730
4731
	/* Must find the AUTOINC column name */
4732
	if (i < n_cols && dfield) {
4733
		mtr_t		mtr;
4734
		btr_pcur_t	pcur;
4735
4736
		mtr_start(&mtr);
4737
4738
		/* Open at the high/right end (FALSE), and INIT
4739
		cursor (TRUE) */
4740
		btr_pcur_open_at_index_side(
4741
			FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
4742
4743
		if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
4744
			const rec_t*	rec;
4745
4746
			rec = row_search_autoinc_get_rec(&pcur, &mtr);
4747
4748
			if (rec != NULL) {
4749
				ibool unsigned_type = (
4750
					dfield->col->prtype & DATA_UNSIGNED);
4751
4752
				*value = row_search_autoinc_read_column(
4753
					index, rec, i, unsigned_type);
4754
			}
4755
		}
4756
4757
		btr_pcur_close(&pcur);
4758
4759
		mtr_commit(&mtr);
4760
	} else {
4761
		error = DB_RECORD_NOT_FOUND;
4762
	}
4763
4764
	return(error);
4765
}