~drizzle-trunk/drizzle/development

641.2.2 by Monty Taylor
InnoDB Plugin 1.0.3
1
/*****************************************************************************
2
3
Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
4
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
8
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15
Place, Suite 330, Boston, MA 02111-1307 USA
16
17
*****************************************************************************/
18
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
19
/******************************************************
20
New index creation routines using a merge sort
21
22
Created 12/4/2005 Jan Lindstrom
23
Completed by Sunny Bains and Marko Makela
24
*******************************************************/
25
26
#include "row0merge.h"
27
#include "row0ext.h"
28
#include "row0row.h"
29
#include "row0upd.h"
30
#include "row0ins.h"
31
#include "row0sel.h"
32
#include "dict0dict.h"
33
#include "dict0mem.h"
34
#include "dict0boot.h"
35
#include "dict0crea.h"
36
#include "dict0load.h"
37
#include "btr0btr.h"
38
#include "mach0data.h"
39
#include "trx0rseg.h"
40
#include "trx0trx.h"
41
#include "trx0roll.h"
42
#include "trx0undo.h"
43
#include "trx0purge.h"
44
#include "trx0rec.h"
45
#include "que0que.h"
46
#include "rem0cmp.h"
47
#include "read0read.h"
48
#include "os0file.h"
49
#include "lock0lock.h"
50
#include "data0data.h"
51
#include "data0type.h"
52
#include "que0que.h"
53
#include "pars0pars.h"
54
#include "mem0mem.h"
55
#include "log0log.h"
56
#include "ut0sort.h"
57
#include "handler0alter.h"
58
59
#ifdef UNIV_DEBUG
60
/* Set these in order ot enable debug printout. */
61
static ibool	row_merge_print_cmp;
62
static ibool	row_merge_print_read;
63
static ibool	row_merge_print_write;
64
#endif /* UNIV_DEBUG */
65
66
/* Block size for I/O operations in merge sort.  The minimum is
UNIV_PAGE_SIZE, or page_get_free_space_of_empty() rounded to a power of 2.

When not creating a PRIMARY KEY that contains column prefixes, this
can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
ut_ad(data_size < sizeof(row_merge_block_t)). */

typedef byte	row_merge_block_t[1048576];

/* Secondary buffer for I/O operations of merge records.  This buffer
is used for writing or reading a record that spans two row_merge_block_t.
Thus, it must be able to hold one merge record, whose maximum size is
the same as the minimum size of row_merge_block_t. */

typedef byte	mrec_buf_t[UNIV_PAGE_SIZE];

/* Merge record in row_merge_block_t.  The format is the same as a
record in ROW_FORMAT=COMPACT with the exception that the
REC_N_NEW_EXTRA_BYTES are omitted. */
typedef byte	mrec_t;
86
87
/* Buffer for sorting in main memory.  Tuples are accumulated here
until the buffer is full or the corresponding output block would
overflow; the buffer is then sorted and written out. */
struct row_merge_buf_struct {
	mem_heap_t*	heap;		/* memory heap where allocated */
	dict_index_t*	index;		/* the index the tuples belong to */
	ulint		total_size;	/* total amount of data bytes,
					including the per-record overhead
					of the on-disk format */
	ulint		n_tuples;	/* number of data tuples */
	ulint		max_tuples;	/* maximum number of data tuples */
	const dfield_t**tuples;		/* array of pointers to
					arrays of fields that form
					the data tuples */
	const dfield_t**tmp_tuples;	/* temporary copy of tuples,
					for sorting */
};

typedef struct row_merge_buf_struct row_merge_buf_t;
102
103
/* Information about temporary files used in merge sort are stored
to this structure */

struct merge_file_struct {
	int	fd;		/* File descriptor */
	ulint	offset;		/* File offset, in units of
				row_merge_block_t (not bytes) */
};

typedef struct merge_file_struct merge_file_t;
112
113
#ifdef UNIV_DEBUG
/**********************************************************
Display a merge tuple: one line per field, each prefixed by
'E' for externally stored fields, truncated to 20 bytes. */
static
void
row_merge_tuple_print(
/*==================*/
	FILE*		f,	/* in: output stream */
	const dfield_t*	entry,	/* in: tuple to print */
	ulint		n_fields)/* in: number of fields in the tuple */
{
	ulint	i;

	for (i = 0; i < n_fields; i++) {
		const dfield_t*	field = &entry[i];
		ulint		field_len;
		ulint		len;

		if (dfield_is_null(field)) {
			fputs("\n NULL;", f);
			continue;
		}

		field_len = dfield_get_len(field);
		/* Only dump a prefix of long fields. */
		len = ut_min(field_len, 20);

		fputs(dfield_is_ext(field) ? "\nE" : "\n ", f);

		ut_print_buf(f, dfield_get_data(field), len);

		if (len != field_len) {
			fprintf(f, " (total %lu bytes)", field_len);
		}
	}
	putc('\n', f);
}
#endif /* UNIV_DEBUG */
148
149
/**********************************************************
Allocate a sort buffer. */
static
row_merge_buf_t*
row_merge_buf_create_low(
/*=====================*/
					/* out,own: sort buffer */
	mem_heap_t*	heap,		/* in: heap where allocated */
	dict_index_t*	index,		/* in: secondary index */
	ulint		max_tuples,	/* in: maximum number of data tuples */
	ulint		buf_size)	/* in: size of the buffer, in bytes */
{
	row_merge_buf_t*	buf;

	ut_ad(max_tuples > 0);
	ut_ad(max_tuples <= sizeof(row_merge_block_t));
	ut_ad(max_tuples < buf_size);

	/* Zero-fill, so that total_size and n_tuples start at 0. */
	buf = mem_heap_zalloc(heap, buf_size);
	buf->heap = heap;
	buf->index = index;
	buf->max_tuples = max_tuples;
	/* Allocate the tuple array and the merge-sort work area in
	one chunk; the second half is tmp_tuples. */
	buf->tuples = mem_heap_alloc(heap,
				     2 * max_tuples * sizeof *buf->tuples);
	buf->tmp_tuples = buf->tuples + max_tuples;

	return(buf);
}
177
178
/**********************************************************
Allocate a sort buffer, sized so that the worst case number of
minimum-size index entries still fits in one row_merge_block_t. */
static
row_merge_buf_t*
row_merge_buf_create(
/*=================*/
				/* out,own: sort buffer */
	dict_index_t*	index)	/* in: secondary index */
{
	row_merge_buf_t*	buf;
	ulint			max_tuples;
	ulint			buf_size;
	mem_heap_t*		heap;

	/* One output block can hold at most this many minimum-size
	entries; ut_max() guards against a zero divisor. */
	max_tuples = sizeof(row_merge_block_t)
		/ ut_max(1, dict_index_get_min_size(index));

	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;

	heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));

	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);

	return(buf);
}
203
204
/**********************************************************
Empty a sort buffer, recycling its heap.  The old buffer object
becomes invalid; use the returned one instead. */
static
row_merge_buf_t*
row_merge_buf_empty(
/*================*/
					/* out: sort buffer */
	row_merge_buf_t*	buf)	/* in,own: sort buffer */
{
	ulint		buf_size;
	/* Save these before mem_heap_empty() frees the memory that
	buf itself lives in. */
	ulint		max_tuples	= buf->max_tuples;
	mem_heap_t*	heap		= buf->heap;
	dict_index_t*	index		= buf->index;

	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;

	mem_heap_empty(heap);

	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
}
224
225
/**********************************************************
Deallocate a sort buffer.  The buffer was allocated inside its own
heap, so freeing the heap releases everything. */
static
void
row_merge_buf_free(
/*===============*/
	row_merge_buf_t*	buf)	/* in,own: sort buffer, to be freed */
{
	mem_heap_free(buf->heap);
}
235
236
/**********************************************************
Insert a data tuple into a sort buffer.  Builds the index entry from
the clustered-index row, accounts for its size in the on-disk merge
format, and fails (without side effects on the totals) when the
buffer is full. */
static
ibool
row_merge_buf_add(
/*==============*/
					/* out: TRUE if added,
					FALSE if out of space */
	row_merge_buf_t*	buf,	/* in/out: sort buffer */
	const dtuple_t*		row,	/* in: row in clustered index */
	const row_ext_t*	ext)	/* in: cache of externally stored
					column prefixes, or NULL */
{
	ulint			i;
	ulint			n_fields;
	ulint			data_size;	/* size in merge format */
	ulint			extra_size;	/* record header size */
	const dict_index_t*	index;
	dfield_t*		entry;
	dfield_t*		field;

	if (buf->n_tuples >= buf->max_tuples) {
		return(FALSE);
	}

	UNIV_PREFETCH_R(row->fields);

	index = buf->index;

	n_fields = dict_index_get_n_fields(index);

	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
	buf->tuples[buf->n_tuples] = entry;
	field = entry;

	data_size = 0;
	/* The NULL-flag bitmap occupies one bit per nullable column. */
	extra_size = UT_BITS_IN_BYTES(index->n_nullable);

	for (i = 0; i < n_fields; i++, field++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			col_no;
		const dfield_t*		row_field;
		ulint			len;

		ifield = dict_index_get_nth_field(index, i);
		col = ifield->col;
		col_no = dict_col_get_no(col);
		row_field = dtuple_get_nth_field(row, col_no);
		dfield_copy(field, row_field);
		len = dfield_get_len(field);

		if (dfield_is_null(field)) {
			ut_ad(!(col->prtype & DATA_NOT_NULL));
			continue;
		} else if (UNIV_LIKELY(!ext)) {
			/* No externally stored columns to resolve. */
		} else if (dict_index_is_clust(index)) {
			/* Flag externally stored fields. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				if (i < dict_index_get_n_unique(index)) {
					/* Key fields: store the cached
					prefix inline. */
					dfield_set_data(field, buf, len);
				} else {
					/* Non-key fields: keep the
					external reference. */
					dfield_set_ext(field);
					len = dfield_get_len(field);
				}
			}
		} else {
			/* Secondary index: always inline the cached
			column prefix. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				dfield_set_data(field, buf, len);
			}
		}

		/* If a column prefix index, take only the prefix */

		if (ifield->prefix_len) {
			len = dtype_get_at_most_n_mbchars(
				col->prtype,
				col->mbminlen, col->mbmaxlen,
				ifield->prefix_len,
				len, dfield_get_data(field));
			dfield_set_len(field, len);
		}

		ut_ad(len <= col->len || col->mtype == DATA_BLOB);

		/* Account for the per-field length bytes of the
		ROW_FORMAT=COMPACT header. */
		if (ifield->fixed_len) {
			ut_ad(len == ifield->fixed_len);
			ut_ad(!dfield_is_ext(field));
		} else if (dfield_is_ext(field)) {
			extra_size += 2;
		} else if (len < 128
			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
			extra_size++;
		} else {
			/* For variable-length columns, we look up the
			maximum length from the column itself.  If this
			is a prefix index column shorter than 256 bytes,
			this will waste one byte. */
			extra_size += 2;
		}
		data_size += len;
	}

#ifdef UNIV_DEBUG
	{
		ulint	size;
		ulint	extra;

		/* Cross-check our size accounting against the record
		conversion routine. */
		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields, &extra);

		ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
		ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
	}
#endif /* UNIV_DEBUG */

	/* Add to the total size of the record in row_merge_block_t
	the encoded length of extra_size and the extra bytes (extra_size).
	See row_merge_buf_write() for the variable-length encoding
	of extra_size. */
	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);

	/* The following assertion may fail if row_merge_block_t is
	declared very small and a PRIMARY KEY is being created with
	many prefix columns.  In that case, the record may exceed the
	page_zip_rec_needs_ext() limit.  However, no further columns
	will be moved to external storage until the record is inserted
	to the clustered index B-tree. */
	ut_ad(data_size < sizeof(row_merge_block_t));

	/* Reserve one byte for the end marker of row_merge_block_t. */
	if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
		return(FALSE);
	}

	buf->total_size += data_size;
	buf->n_tuples++;

	field = entry;

	/* Copy the data fields. */

	do {
		dfield_dup(field++, buf->heap);
	} while (--n_fields);

	return(TRUE);
}
391
392
/* Structure for reporting duplicate records. */
struct row_merge_dup_struct {
	const dict_index_t*	index;		/* index being sorted */
	TABLE*			table;		/* MySQL table object */
	ulint			n_dup;		/* number of duplicates;
						only the first one is
						reported to MySQL */
};

typedef struct row_merge_dup_struct row_merge_dup_t;
400
401
/*****************************************************************
Report a duplicate key to MySQL.  Only the first duplicate is
converted and reported; subsequent calls merely bump the counter. */
static
void
row_merge_dup_report(
/*=================*/
	row_merge_dup_t*	dup,	/* in/out: for reporting duplicates */
	const dfield_t*		entry)	/* in: duplicate index entry */
{
	mrec_buf_t 		buf;	/* stack buffer for the record image */
	const dtuple_t*		tuple;
	dtuple_t		tuple_store;
	const rec_t*		rec;
	const dict_index_t*	index	= dup->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	mem_heap_t*		heap	= NULL;
	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*			offsets;
	ulint			n_ext;

	if (dup->n_dup++) {
		/* Only report the first duplicate record,
		but count all duplicate records. */
		return;
	}

	rec_offs_init(offsets_);

	/* Convert the tuple to a record and then to MySQL format. */

	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;

	rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
	offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
				  &heap);

	innobase_rec_to_mysql(dup->table, rec, index, offsets);

	/* rec_get_offsets() allocates a heap only if offsets_ was
	too small. */
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
444
445
/*****************************************************************
Compare two tuples.  As a side effect, reports the first duplicate
pair encountered when dup != NULL and the tuples are logically equal. */
static
int
row_merge_tuple_cmp(
/*================*/
					/* out: 1, 0, -1 if a is greater,
					equal, less, respectively, than b */
	ulint			n_field,/* in: number of fields */
	const dfield_t*		a,	/* in: first tuple to be compared */
	const dfield_t*		b,	/* in: second tuple to be compared */
	row_merge_dup_t*	dup)	/* in/out: for reporting duplicates */
{
	int		cmp;
	const dfield_t*	field	= a;	/* remember the start of a */

	/* Compare the fields of the tuples until a difference is
	found or we run out of fields to compare.  If !cmp at the
	end, the tuples are equal. */
	do {
		cmp = cmp_dfield_dfield(a++, b++);
	} while (!cmp && --n_field);

	if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
		/* Report a duplicate value error if the tuples are
		logically equal.  NULL columns are logically inequal,
		although they are equal in the sorting order.  Find
		out if any of the fields are NULL. */
		for (b = field; b != a; b++) {
			if (dfield_is_null(b)) {

				goto func_exit;
			}
		}

		row_merge_dup_report(dup, field);
	}

func_exit:
	return(cmp);
}
486
487
/**************************************************************************
Merge sort the tuple buffer in main memory. */
static
void
row_merge_tuple_sort(
/*=================*/
	ulint			n_field,/* in: number of fields */
	row_merge_dup_t*	dup,	/* in/out: for reporting duplicates */
	const dfield_t**	tuples,	/* in/out: tuples */
	const dfield_t**	aux,	/* in/out: work area */
	ulint			low,	/* in: lower bound of the
					sorting area, inclusive */
	ulint			high)	/* in: upper bound of the
					sorting area, exclusive */
{
	/* These macros close over n_field and dup, so that the generic
	UT_SORT_FUNCTION_BODY template can recurse and compare with the
	extra context arguments. */
#define row_merge_tuple_sort_ctx(a,b,c,d) \
	row_merge_tuple_sort(n_field, dup, a, b, c, d)
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)

	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
}
509
510
/**********************************************************
Sort a buffer.  Only the unique prefix of the index fields is
compared; ties are reported via dup when it is non-NULL. */
static
void
row_merge_buf_sort(
/*===============*/
	row_merge_buf_t*	buf,	/* in/out: sort buffer */
	row_merge_dup_t*	dup)	/* in/out: for reporting duplicates */
{
	row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
			     buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
}
522
523
/**********************************************************
Write a buffer to a block, serializing each tuple in the merge
format: a 1- or 2-byte encoded (extra_size + 1) prefix followed by a
ROW_FORMAT=COMPACT record without its REC_N_NEW_EXTRA_BYTES, then a
0 end-of-chunk marker. */
static
void
row_merge_buf_write(
/*================*/
	const row_merge_buf_t*	buf,	/* in: sorted buffer */
#ifdef UNIV_DEBUG
	const merge_file_t*	of,	/* in: output file */
#endif /* UNIV_DEBUG */
	row_merge_block_t*	block)	/* out: buffer for writing to file */
#ifndef UNIV_DEBUG
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
#endif /* !UNIV_DEBUG */
{
	const dict_index_t*	index	= buf->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	byte*			b	= &(*block)[0];

	ulint		i;

	for (i = 0; i < buf->n_tuples; i++) {
		ulint		size;
		ulint		extra_size;
		const dfield_t*	entry		= buf->tuples[i];

		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields,
						   &extra_size);
		ut_ad(size > extra_size);
		ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
		/* The merge format omits REC_N_NEW_EXTRA_BYTES. */
		extra_size -= REC_N_NEW_EXTRA_BYTES;
		size -= REC_N_NEW_EXTRA_BYTES;

		/* Encode extra_size + 1 */
		if (extra_size + 1 < 0x80) {
			*b++ = (byte) (extra_size + 1);
		} else {
			/* Two-byte form: high bit set in the first byte. */
			ut_ad((extra_size + 1) < 0x8000);
			*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
			*b++ = (byte) (extra_size + 1);
		}

		ut_ad(b + size < block[1]);

		rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
					       REC_STATUS_ORDINARY,
					       entry, n_fields);

		b += size;

#ifdef UNIV_DEBUG
		if (row_merge_print_write) {
			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
				(void*) b, of->fd, (ulong) of->offset,
				(ulong) i);
			row_merge_tuple_print(stderr, entry, n_fields);
		}
#endif /* UNIV_DEBUG */
	}

	/* Write an "end-of-chunk" marker. */
	ut_a(b < block[1]);
	ut_a(b == block[0] + buf->total_size);
	*b++ = 0;
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
			(void*) b, of->fd, (ulong) of->offset);
	}
#endif /* UNIV_DEBUG */
}
601
602
/**********************************************************
603
Create a memory heap and allocate space for row_merge_rec_offsets(). */
604
static
605
mem_heap_t*
606
row_merge_heap_create(
607
/*==================*/
608
						/* out: memory heap */
609
	const dict_index_t*	index,		/* in: record descriptor */
610
	ulint**			offsets1,	/* out: offsets */
611
	ulint**			offsets2)	/* out: offsets */
612
{
613
	ulint		i	= 1 + REC_OFFS_HEADER_SIZE
614
		+ dict_index_get_n_fields(index);
615
	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);
616
617
	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
618
	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
619
620
	(*offsets1)[0] = (*offsets2)[0] = i;
621
	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
622
623
	return(heap);
624
}
625
626
/**************************************************************************
627
Search an index object by name and column names.  If several indexes match,
628
return the index with the max id. */
629
static
630
dict_index_t*
631
row_merge_dict_table_get_index(
632
/*===========================*/
633
						/* out: matching index,
634
						NULL if not found */
635
	dict_table_t*		table,		/* in: table */
636
	const merge_index_def_t*index_def)	/* in: index definition */
637
{
638
	ulint		i;
639
	dict_index_t*	index;
640
	const char**	column_names;
641
642
	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
643
644
	for (i = 0; i < index_def->n_fields; ++i) {
645
		column_names[i] = index_def->fields[i].field_name;
646
	}
647
648
	index = dict_table_get_index_by_max_id(
649
		table, index_def->name, column_names, index_def->n_fields);
650
651
	mem_free((void*) column_names);
652
653
	return(index);
654
}
655
656
/************************************************************************
Read a merge block from the file system.  The offset is in units of
row_merge_block_t, converted here to a 64-bit byte offset split into
low and high 32-bit halves for the os0file API. */
static
ibool
row_merge_read(
/*===========*/
					/* out: TRUE if request was
					successful, FALSE if fail */
	int			fd,	/* in: file descriptor */
	ulint			offset,	/* in: offset where to read */
	row_merge_block_t*	buf)	/* out: data */
{
	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
	ibool		success;

	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
						 (ulint) (ofs & 0xFFFFFFFF),
						 (ulint) (ofs >> 32),
						 sizeof *buf);
	if (UNIV_UNLIKELY(!success)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: failed to read merge block at %"PRIu64"\n", ofs);
	}

	return(UNIV_LIKELY(success));
}
683
684
/************************************************************************
Write a merge block to the file system.  (The original banner comment
said "Read", copy-pasted from row_merge_read().)  The offset is in
units of row_merge_block_t. */
static
ibool
row_merge_write(
/*============*/
				/* out: TRUE if request was
				successful, FALSE if fail */
	int		fd,	/* in: file descriptor */
	ulint		offset,	/* in: offset where to write */
	const void*	buf)	/* in: data */
{
	/* 64-bit byte offset, split into low/high halves below. */
	ib_uint64_t	ofs = ((ib_uint64_t) offset)
		* sizeof(row_merge_block_t);

	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
					 (ulint) (ofs & 0xFFFFFFFF),
					 (ulint) (ofs >> 32),
					 sizeof(row_merge_block_t))));
}
704
705
/************************************************************************
Read a merge record.  Decodes the variable-length (extra_size + 1)
prefix, then locates the record either directly in the block or, when
it straddles a block boundary, reassembled in the secondary buffer. */
static
const byte*
row_merge_read_rec(
/*===============*/
					/* out: pointer to next record,
					or NULL on I/O error
					or end of list */
	row_merge_block_t*	block,	/* in/out: file buffer */
	mrec_buf_t*		buf,	/* in/out: secondary buffer */
	const byte*		b,	/* in: pointer to record */
	const dict_index_t*	index,	/* in: index of the record */
	int			fd,	/* in: file descriptor */
	ulint*			foffs,	/* in/out: file offset */
	const mrec_t**		mrec,	/* out: pointer to merge record,
					or NULL on end of list
					(non-NULL on I/O error) */
	ulint*			offsets)/* out: offsets of mrec */
{
	ulint	extra_size;
	ulint	data_size;
	ulint	avail_size;

	ut_ad(block);
	ut_ad(buf);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(index);
	ut_ad(foffs);
	ut_ad(mrec);
	ut_ad(offsets);

	/* offsets[] must have been sized by row_merge_heap_create(). */
	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
	      + dict_index_get_n_fields(index));

	extra_size = *b++;

	if (UNIV_UNLIKELY(!extra_size)) {
		/* End of list */
		*mrec = NULL;
#ifdef UNIV_DEBUG
		if (row_merge_print_read) {
			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
				(const void*) b, (const void*) block,
				fd, (ulong) *foffs);
		}
#endif /* UNIV_DEBUG */
		return(NULL);
	}

	if (extra_size >= 0x80) {
		/* Read another byte of extra_size. */

		if (UNIV_UNLIKELY(b >= block[1])) {
			if (!row_merge_read(fd, ++(*foffs), block)) {
err_exit:
				/* Signal I/O error. */
				*mrec = b;
				return(NULL);
			}

			/* Wrap around to the beginning of the buffer. */
			b = block[0];
		}

		extra_size = (extra_size & 0x7f) << 8;
		extra_size |= *b++;
	}

	/* Normalize extra_size.  Above, value 0 signals "end of list". */
	extra_size--;

	/* Read the extra bytes. */

	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
		/* The record spans two blocks.  Copy the entire record
		to the auxiliary buffer and handle this as a special
		case. */

		avail_size = block[1] - b;

		memcpy(*buf, b, avail_size);

		if (!row_merge_read(fd, ++(*foffs), block)) {

			goto err_exit;
		}

		/* Wrap around to the beginning of the buffer. */
		b = block[0];

		/* Copy the record. */
		memcpy(*buf + avail_size, b, extra_size - avail_size);
		b += extra_size - avail_size;

		*mrec = *buf + extra_size;

		rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

		data_size = rec_offs_data_size(offsets);

		/* These overflows should be impossible given that
		records are much smaller than either buffer, and
		the record starts near the beginning of each buffer. */
		ut_a(extra_size + data_size < sizeof *buf);
		ut_a(b + data_size < block[1]);

		/* Copy the data bytes. */
		memcpy(*buf + extra_size, b, data_size);
		b += data_size;

		goto func_exit;
	}

	*mrec = b + extra_size;

	rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

	data_size = rec_offs_data_size(offsets);
	ut_ad(extra_size + data_size < sizeof *buf);

	b += extra_size + data_size;

	if (UNIV_LIKELY(b < block[1])) {
		/* The record fits entirely in the block.
		This is the normal case. */
		goto func_exit;
	}

	/* The record spans two blocks.  Copy it to buf. */

	b -= extra_size + data_size;
	avail_size = block[1] - b;
	memcpy(*buf, b, avail_size);
	*mrec = *buf + extra_size;
	/* The offsets were computed against the old location of the
	record; re-anchor them to the copy in *buf. */
	rec_offs_make_valid(*mrec, index, offsets);

	if (!row_merge_read(fd, ++(*foffs), block)) {

		goto err_exit;
	}

	/* Wrap around to the beginning of the buffer. */
	b = block[0];

	/* Copy the rest of the record. */
	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
	b += extra_size + data_size - avail_size;

func_exit:
#ifdef UNIV_DEBUG
	if (row_merge_print_read) {
		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
			(const void*) b, (const void*) block,
			fd, (ulong) *foffs);
		rec_print_comp(stderr, *mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	return(b);
}
868
869
/************************************************************************
Write a merge record to a buffer that is known to have enough room:
the encoded (extra_size + 1) prefix followed by the record bytes. */
static
void
row_merge_write_rec_low(
/*====================*/
	byte*		b,	/* out: buffer */
	ulint		e,	/* in: encoded extra_size */
#ifdef UNIV_DEBUG
	ulint		size,	/* in: total size to write */
	int		fd,	/* in: file descriptor */
	ulint		foffs,	/* in: file offset */
#endif /* UNIV_DEBUG */
	const mrec_t*	mrec,	/* in: record to write */
	const ulint*	offsets)/* in: offsets of mrec */
#ifndef UNIV_DEBUG
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
	row_merge_write_rec_low(b, e, mrec, offsets)
#endif /* !UNIV_DEBUG */
{
#ifdef UNIV_DEBUG
	const byte* const end = b + size;
	ut_ad(e == rec_offs_extra_size(offsets) + 1);

	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%d,%lu ",
			(void*) b, fd, (ulong) foffs);
		rec_print_comp(stderr, mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	/* One-byte form for e < 0x80, else two bytes with the high
	bit of the first byte set; see row_merge_read_rec(). */
	if (e < 0x80) {
		*b++ = (byte) e;
	} else {
		*b++ = (byte) (0x80 | (e >> 8));
		*b++ = (byte) e;
	}

	/* mrec points past the extra bytes; copy header and data. */
	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
	ut_ad(b + rec_offs_size(offsets) == end);
}
911
912
/************************************************************************
Write a merge record, flushing the block to disk and starting a new
one when the record does not fit in the space remaining. */
static
byte*
row_merge_write_rec(
/*================*/
					/* out: pointer to end of block,
					or NULL on error */
	row_merge_block_t*	block,	/* in/out: file buffer */
	mrec_buf_t*		buf,	/* in/out: secondary buffer */
	byte*			b,	/* in: pointer to end of block */
	int			fd,	/* in: file descriptor */
	ulint*			foffs,	/* in/out: file offset */
	const mrec_t*		mrec,	/* in: record to write */
	const ulint*		offsets)/* in: offsets of mrec */
{
	ulint	extra_size;
	ulint	size;
	ulint	avail_size;

	ut_ad(block);
	ut_ad(buf);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(mrec);
	ut_ad(foffs);
	/* The record must not live inside the buffers we may overwrite. */
	ut_ad(mrec < block[0] || mrec > block[1]);
	ut_ad(mrec < buf[0] || mrec > buf[1]);

	/* Normalize extra_size.  Value 0 signals "end of list". */
	extra_size = rec_offs_extra_size(offsets) + 1;

	/* Total on-disk size: 1 or 2 prefix bytes plus the record. */
	size = extra_size + (extra_size >= 0x80)
		+ rec_offs_data_size(offsets);

	if (UNIV_UNLIKELY(b + size >= block[1])) {
		/* The record spans two blocks.
		Copy it to the temporary buffer first. */
		avail_size = block[1] - b;

		row_merge_write_rec_low(buf[0],
					extra_size, size, fd, *foffs,
					mrec, offsets);

		/* Copy the head of the temporary buffer, write
		the completed block, and copy the tail of the
		record to the head of the new block. */
		memcpy(b, buf[0], avail_size);

		if (!row_merge_write(fd, (*foffs)++, block)) {
			return(NULL);
		}

		UNIV_MEM_INVALID(block[0], sizeof block[0]);

		/* Copy the rest. */
		b = block[0];
		memcpy(b, buf[0] + avail_size, size - avail_size);
		b += size - avail_size;
	} else {
		row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
					mrec, offsets);
		b += size;
	}

	return(b);
}
979
980
/************************************************************************
Write an end-of-list marker and flush the block to disk. */
static
byte*
row_merge_write_eof(
/*================*/
					/* out: pointer to the beginning
					of the (reset) block,
					or NULL on error */
	row_merge_block_t*	block,	/* in/out: file buffer */
	byte*			b,	/* in: pointer to end of block */
	int			fd,	/* in: file descriptor */
	ulint*			foffs)	/* in/out: file offset */
{
	ut_ad(block);
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(foffs);
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
			(void*) b, (void*) block, fd, (ulong) *foffs);
	}
#endif /* UNIV_DEBUG */

	/* 0 in place of an encoded extra_size means "end of list". */
	*b++ = 0;
	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */

	if (!row_merge_write(fd, (*foffs)++, block)) {
		return(NULL);
	}

	UNIV_MEM_INVALID(block[0], sizeof block[0]);
	return(block[0]);
}
1020
1021
/*****************************************************************
1022
Compare two merge records. */
1023
static
1024
int
1025
row_merge_cmp(
1026
/*==========*/
1027
						/* out: 1, 0, -1 if
1028
						mrec1 is greater, equal, less,
1029
						respectively, than mrec2 */
1030
	const mrec_t*		mrec1,		/* in: first merge
1031
						record to be compared */
1032
	const mrec_t*		mrec2,		/* in: second merge
1033
						record to be compared */
1034
	const ulint*		offsets1,	/* in: first record offsets */
1035
	const ulint*		offsets2,	/* in: second record offsets */
1036
	const dict_index_t*	index)		/* in: index */
1037
{
1038
	int	cmp;
1039
1040
	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
1041
1042
#ifdef UNIV_DEBUG
1043
	if (row_merge_print_cmp) {
1044
		fputs("row_merge_cmp1 ", stderr);
1045
		rec_print_comp(stderr, mrec1, offsets1);
1046
		fputs("\nrow_merge_cmp2 ", stderr);
1047
		rec_print_comp(stderr, mrec2, offsets2);
1048
		fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
1049
	}
1050
#endif /* UNIV_DEBUG */
1051
1052
	return(cmp);
1053
}
1054
1055
/************************************************************************
Reads clustered index of the table and create temporary files
containing the index entries for the indexes to be built.
Performs a single full scan of the old clustered index; for every
user record it builds one entry per index being created, buffers the
entries in memory, and spills each buffer to its temporary merge file
(sorted a block at a time) whenever the buffer fills up. */
static
ulint
row_merge_read_clustered_index(
/*===========================*/
					/* out: DB_SUCCESS or error */
	trx_t*			trx,	/* in: transaction */
	TABLE*			table,	/* in/out: MySQL table object,
					for reporting erroneous records */
	const dict_table_t*	old_table,/* in: table where rows are
					read from */
	const dict_table_t*	new_table,/* in: table where indexes are
					created; identical to old_table
					unless creating a PRIMARY KEY */
	dict_index_t**		index,	/* in: indexes to be created */
	merge_file_t*		files,	/* in: temporary files */
	ulint			n_index,/* in: number of indexes to create */
	row_merge_block_t*	block)	/* in/out: file buffer */
{
	dict_index_t*		clust_index;	/* Clustered index */
	mem_heap_t*		row_heap;	/* Heap memory to create
						clustered index records */
	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
	btr_pcur_t		pcur;		/* Persistent cursor on the
						clustered index */
	mtr_t			mtr;		/* Mini transaction */
	ulint			err = DB_SUCCESS;/* Return code */
	ulint			i;
	ulint			n_nonnull = 0;	/* number of columns
						changed to NOT NULL */
	ulint*			nonnull = NULL;	/* NOT NULL columns */

	trx->op_info = "reading clustered index";

	ut_ad(trx);
	ut_ad(old_table);
	ut_ad(new_table);
	ut_ad(index);
	ut_ad(files);

	/* Create and initialize memory for record buffers */

	merge_buf = mem_alloc(n_index * sizeof *merge_buf);

	for (i = 0; i < n_index; i++) {
		merge_buf[i] = row_merge_buf_create(index[i]);
	}

	mtr_start(&mtr);

	/* Find the clustered index and create a persistent cursor
	based on that. */

	clust_index = dict_table_get_first_index(old_table);

	btr_pcur_open_at_index_side(
		TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);

	if (UNIV_UNLIKELY(old_table != new_table)) {
		ulint	n_cols = dict_table_get_n_cols(old_table);

		/* A primary key will be created.  Identify the
		columns that were flagged NOT NULL in the new table,
		so that we can quickly check that the records in the
		(old) clustered index do not violate the added NOT
		NULL constraints. */

		ut_a(n_cols == dict_table_get_n_cols(new_table));

		nonnull = mem_alloc(n_cols * sizeof *nonnull);

		for (i = 0; i < n_cols; i++) {
			/* Columns that were already NOT NULL in the
			old table cannot violate the new constraint. */
			if (dict_table_get_nth_col(old_table, i)->prtype
			    & DATA_NOT_NULL) {

				continue;
			}

			if (dict_table_get_nth_col(new_table, i)->prtype
			    & DATA_NOT_NULL) {

				nonnull[n_nonnull++] = i;
			}
		}

		if (!n_nonnull) {
			mem_free(nonnull);
			nonnull = NULL;
		}
	}

	row_heap = mem_heap_create(sizeof(mrec_buf_t));

	/* Scan the clustered index. */
	for (;;) {
		const rec_t*	rec;
		ulint*		offsets;
		dtuple_t*	row		= NULL;
		row_ext_t*	ext;
		ibool		has_next	= TRUE;

		btr_pcur_move_to_next_on_page(&pcur);

		/* When switching pages, commit the mini-transaction
		in order to release the latch on the old page. */

		if (btr_pcur_is_after_last_on_page(&pcur)) {
			btr_pcur_store_position(&pcur, &mtr);
			mtr_commit(&mtr);
			mtr_start(&mtr);
			btr_pcur_restore_position(BTR_SEARCH_LEAF,
						  &pcur, &mtr);
			has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
		}

		/* When !has_next, row stays NULL and the loop below
		flushes whatever remains in the in-memory buffers. */
		if (UNIV_LIKELY(has_next)) {
			rec = btr_pcur_get_rec(&pcur);
			offsets = rec_get_offsets(rec, clust_index, NULL,
						  ULINT_UNDEFINED, &row_heap);

			/* Skip delete marked records. */
			if (rec_get_deleted_flag(
				    rec, dict_table_is_comp(old_table))) {
				continue;
			}

			srv_n_rows_inserted++;

			/* Build a row based on the clustered index. */

			row = row_build(ROW_COPY_POINTERS, clust_index,
					rec, offsets,
					new_table, &ext, row_heap);

			if (UNIV_LIKELY_NULL(nonnull)) {
				for (i = 0; i < n_nonnull; i++) {
					dfield_t*	field
						= &row->fields[nonnull[i]];
					dtype_t*	field_type
						= dfield_get_type(field);

					ut_a(!(field_type->prtype
					       & DATA_NOT_NULL));

					if (dfield_is_null(field)) {
						err = DB_PRIMARY_KEY_IS_NULL;
						/* i = 0 so that err_exit
						reports the PRIMARY KEY
						(index 0) as the culprit. */
						i = 0;
						goto err_exit;
					}

					field_type->prtype |= DATA_NOT_NULL;
				}
			}
		}

		/* Build all entries for all the indexes to be created
		in a single scan of the clustered index. */

		for (i = 0; i < n_index; i++) {
			row_merge_buf_t*	buf	= merge_buf[i];
			merge_file_t*		file	= &files[i];
			const dict_index_t*	index	= buf->index;

			if (UNIV_LIKELY
			    (row && row_merge_buf_add(buf, row, ext))) {
				continue;
			}

			/* The buffer must be sufficiently large
			to hold at least one record. */
			ut_ad(buf->n_tuples || !has_next);

			/* We have enough data tuples to form a block.
			Sort them and write to disk. */

			if (buf->n_tuples) {
				if (dict_index_is_unique(index)) {
					row_merge_dup_t	dup;
					dup.index = buf->index;
					dup.table = table;
					dup.n_dup = 0;

					row_merge_buf_sort(buf, &dup);

					if (dup.n_dup) {
						err = DB_DUPLICATE_KEY;
err_exit:
						trx->error_key_num = i;
						goto func_exit;
					}
				} else {
					row_merge_buf_sort(buf, NULL);
				}
			}

			row_merge_buf_write(buf, file, block);

			if (!row_merge_write(file->fd, file->offset++,
					     block)) {
				err = DB_OUT_OF_FILE_SPACE;
				goto err_exit;
			}

			UNIV_MEM_INVALID(block[0], sizeof block[0]);
			merge_buf[i] = row_merge_buf_empty(buf);

			/* Try writing the record again, now that
			the buffer has been written out and emptied. */

			if (UNIV_UNLIKELY
			    (row && !row_merge_buf_add(buf, row, ext))) {
				/* An empty buffer should have enough
				room for at least one record. */
				ut_error;
			}
		}

		mem_heap_empty(row_heap);

		if (UNIV_UNLIKELY(!has_next)) {
			goto func_exit;
		}
	}

func_exit:
	btr_pcur_close(&pcur);
	mtr_commit(&mtr);
	mem_heap_free(row_heap);

	if (UNIV_LIKELY_NULL(nonnull)) {
		mem_free(nonnull);
	}

	for (i = 0; i < n_index; i++) {
		row_merge_buf_free(merge_buf[i]);
	}

	mem_free(merge_buf);

	trx->op_info = "";

	return(err);
}
/*****************************************************************
Merge two blocks of linked lists on disk and write a bigger block.
Reads one sorted run starting at *foffs0 and one starting at *foffs1,
merges them record by record, and appends the result to the output
file, terminating it with an end-of-list marker.  Reports duplicates
for unique indexes. */
static
ulint
row_merge_blocks(
/*=============*/
					/* out: DB_SUCCESS or error code */
	const dict_index_t*	index,	/* in: index being created */
	merge_file_t*		file,	/* in/out: file containing
					index entries */
	row_merge_block_t*	block,	/* in/out: 3 buffers */
	ulint*			foffs0,	/* in/out: offset of first
					source list in the file */
	ulint*			foffs1,	/* in/out: offset of second
					source list in the file */
	merge_file_t*		of,	/* in/out: output file */
	TABLE*			table)	/* in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	mem_heap_t*	heap;	/* memory heap for offsets0, offsets1 */

	mrec_buf_t	buf[3];	/* buffer for handling split mrec in block[] */
	const byte*	b0;	/* pointer to block[0] */
	const byte*	b1;	/* pointer to block[1] */
	byte*		b2;	/* pointer to block[2] */
	const mrec_t*	mrec0;	/* merge rec, points to block[0] or buf[0] */
	const mrec_t*	mrec1;	/* merge rec, points to block[1] or buf[1] */
	ulint*		offsets0;/* offsets of mrec0 */
	ulint*		offsets1;/* offsets of mrec1 */

	heap = row_merge_heap_create(index, &offsets0, &offsets1);

	/* Write a record and read the next record.  Split the output
	file in two halves, which can be merged on the following pass. */
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
	do {								\
		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
					 of->fd, &of->offset,		\
					 mrec##N, offsets##N);		\
		if (UNIV_UNLIKELY(!b2)) {				\
			goto corrupt;					\
		}							\
		b##N = row_merge_read_rec(&block[N], &buf[N],		\
					  b##N, index,			\
					  file->fd, foffs##N,		\
					  &mrec##N, offsets##N);	\
		if (UNIV_UNLIKELY(!b##N)) {				\
			if (mrec##N) {					\
				goto corrupt;				\
			}						\
			AT_END;						\
		}							\
	} while (0)

	if (!row_merge_read(file->fd, *foffs0, &block[0])
	    || !row_merge_read(file->fd, *foffs1, &block[1])) {
corrupt:
		mem_heap_free(heap);
		return(DB_CORRUPTION);
	}

	b0 = block[0];
	b1 = block[1];
	b2 = block[2];

	/* Fetch the first record of each input run.  A NULL b with a
	non-NULL mrec signals an I/O error; a NULL b with a NULL mrec
	signals a normal end of list. */
	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
				foffs0, &mrec0, offsets0);
	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
				foffs1, &mrec1, offsets1);
	if (UNIV_UNLIKELY(!b0 && mrec0)
	    || UNIV_UNLIKELY(!b1 && mrec1)) {

		goto corrupt;
	}

	/* Standard two-way merge: emit the smaller record and advance
	its run, until one run is exhausted. */
	while (mrec0 && mrec1) {
		switch (row_merge_cmp(mrec0, mrec1,
				      offsets0, offsets1, index)) {
		case 0:
			if (UNIV_UNLIKELY
			    (dict_index_is_unique(index))) {
				/* Report the duplicate key value to
				the client before failing. */
				innobase_rec_to_mysql(table, mrec0,
						      index, offsets0);
				mem_heap_free(heap);
				return(DB_DUPLICATE_KEY);
			}
			/* fall through */
		case -1:
			ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
			break;
		case 1:
			ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
			break;
		default:
			ut_error;
		}

	}

merged:
	if (mrec0) {
		/* append all mrec0 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
		}
	}
done0:
	if (mrec1) {
		/* append all mrec1 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
		}
	}
done1:

	mem_heap_free(heap);
	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
}
/*****************************************************************
1423
Merge disk files. */
1424
static
1425
ulint
1426
row_merge(
1427
/*======*/
1428
					/* out: DB_SUCCESS or error code */
1429
	const dict_index_t*	index,	/* in: index being created */
1430
	merge_file_t*		file,	/* in/out: file containing
1431
					index entries */
1432
	ulint			half,	/* in: half the file */
1433
	row_merge_block_t*	block,	/* in/out: 3 buffers */
1434
	int*			tmpfd,	/* in/out: temporary file handle */
1435
	TABLE*			table)	/* in/out: MySQL table, for
1436
					reporting erroneous key value
1437
					if applicable */
1438
{
1439
	ulint		foffs0;	/* first input offset */
1440
	ulint		foffs1;	/* second input offset */
1441
	ulint		error;	/* error code */
1442
	merge_file_t	of;	/* output file */
1443
1444
	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
1445
	ut_ad(half > 0);
1446
1447
	of.fd = *tmpfd;
1448
	of.offset = 0;
1449
1450
	/* Merge blocks to the output file. */
1451
	foffs0 = 0;
1452
	foffs1 = half;
1453
1454
	for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
1455
		error = row_merge_blocks(index, file, block,
1456
					 &foffs0, &foffs1, &of, table);
1457
1458
		if (error != DB_SUCCESS) {
1459
			return(error);
1460
		}
1461
	}
1462
1463
	/* Copy the last block, if there is one. */
1464
	while (foffs0 < half) {
1465
		if (!row_merge_read(file->fd, foffs0++, block)
1466
		    || !row_merge_write(of.fd, of.offset++, block)) {
1467
			return(DB_CORRUPTION);
1468
		}
1469
	}
1470
	while (foffs1 < file->offset) {
1471
		if (!row_merge_read(file->fd, foffs1++, block)
1472
		    || !row_merge_write(of.fd, of.offset++, block)) {
1473
			return(DB_CORRUPTION);
1474
		}
1475
	}
1476
1477
	/* Swap file descriptors for the next pass. */
1478
	*tmpfd = file->fd;
1479
	*file = of;
1480
1481
	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
1482
1483
	return(DB_SUCCESS);
1484
}
1485
1486
/*****************************************************************
1487
Merge disk files. */
1488
static
1489
ulint
1490
row_merge_sort(
1491
/*===========*/
1492
					/* out: DB_SUCCESS or error code */
1493
	const dict_index_t*	index,	/* in: index being created */
1494
	merge_file_t*		file,	/* in/out: file containing
1495
					index entries */
1496
	row_merge_block_t*	block,	/* in/out: 3 buffers */
1497
	int*			tmpfd,	/* in/out: temporary file handle */
1498
	TABLE*			table)	/* in/out: MySQL table, for
1499
					reporting erroneous key value
1500
					if applicable */
1501
{
1502
	ulint	blksz;	/* block size */
1503
1504
	for (blksz = 1; blksz < file->offset; blksz *= 2) {
1505
		ulint	half;
1506
		ulint	error;
1507
1508
		ut_ad(ut_is_2pow(blksz));
1509
		half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
1510
		error = row_merge(index, file, half, block, tmpfd, table);
1511
1512
		if (error != DB_SUCCESS) {
1513
			return(error);
1514
		}
1515
	}
1516
1517
	return(DB_SUCCESS);
1518
}
1519
1520
/*****************************************************************
1521
Copy externally stored columns to the data tuple. */
1522
static
1523
void
1524
row_merge_copy_blobs(
1525
/*=================*/
1526
	const mrec_t*	mrec,	/* in: merge record */
1527
	const ulint*	offsets,/* in: offsets of mrec */
1528
	ulint		zip_size,/* in: compressed page size in bytes, or 0 */
1529
	dtuple_t*	tuple,	/* in/out: data tuple */
1530
	mem_heap_t*	heap)	/* in/out: memory heap */
1531
{
1532
	ulint	i;
1533
	ulint	n_fields = dtuple_get_n_fields(tuple);
1534
1535
	for (i = 0; i < n_fields; i++) {
1536
		ulint		len;
1537
		const void*	data;
1538
		dfield_t*	field = dtuple_get_nth_field(tuple, i);
1539
1540
		if (!dfield_is_ext(field)) {
1541
			continue;
1542
		}
1543
1544
		ut_ad(!dfield_is_null(field));
1545
1546
		/* The table is locked during index creation.
1547
		Therefore, externally stored columns cannot possibly
1548
		be freed between the time the BLOB pointers are read
1549
		(row_merge_read_clustered_index()) and dereferenced
1550
		(below). */
1551
		data = btr_rec_copy_externally_stored_field(
1552
			mrec, offsets, zip_size, i, &len, heap);
1553
1554
		dfield_set_data(field, data, len);
1555
	}
1556
}
1557
1558
/************************************************************************
Read sorted file containing index data tuples and insert these data
tuples to the index.  Uses a dummy insert query graph so that the
generic row-insert and error-handling machinery can be reused. */
static
ulint
row_merge_insert_index_tuples(
/*==========================*/
					/* out: DB_SUCCESS or error number */
	trx_t*			trx,	/* in: transaction */
	dict_index_t*		index,	/* in: index */
	dict_table_t*		table,	/* in: new table */
	ulint			zip_size,/* in: compressed page size of
					 the old table, or 0 if uncompressed */
	int			fd,	/* in: file descriptor */
	row_merge_block_t*	block)	/* in/out: file buffer */
{
	mrec_buf_t		buf;
	const byte*		b;
	que_thr_t*		thr;
	ins_node_t*		node;
	mem_heap_t*		tuple_heap;
	mem_heap_t*		graph_heap;
	ulint			error = DB_SUCCESS;
	ulint			foffs = 0;
	ulint*			offsets;

	ut_ad(trx);
	ut_ad(index);
	ut_ad(table);

	/* We use the insert query graph as the dummy graph
	needed in the row module call */

	trx->op_info = "inserting index entries";

	graph_heap = mem_heap_create(500);
	node = ins_node_create(INS_DIRECT, table, graph_heap);

	thr = pars_complete_graph_for_exec(node, trx, graph_heap);

	que_thr_move_to_run_state_for_mysql(thr, trx);

	tuple_heap = mem_heap_create(1000);

	{
		/* Preallocate a fixed-size offsets array; all merge
		records of this index have the same field count. */
		ulint i	= 1 + REC_OFFS_HEADER_SIZE
			+ dict_index_get_n_fields(index);
		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
		offsets[0] = i;
		offsets[1] = dict_index_get_n_fields(index);
	}

	b = *block;

	if (!row_merge_read(fd, foffs, block)) {
		error = DB_CORRUPTION;
	} else {
		for (;;) {
			const mrec_t*	mrec;
			dtuple_t*	dtuple;
			ulint		n_ext;

			b = row_merge_read_rec(block, &buf, b, index,
					       fd, &foffs, &mrec, offsets);
			if (UNIV_UNLIKELY(!b)) {
				/* End of list, or I/O error */
				if (mrec) {
					error = DB_CORRUPTION;
				}
				break;
			}

			dtuple = row_rec_to_index_entry_low(
				mrec, index, offsets, &n_ext, tuple_heap);

			/* Fetch externally stored column values
			before inserting, if the record has any. */
			if (UNIV_UNLIKELY(n_ext)) {
				row_merge_copy_blobs(mrec, offsets, zip_size,
						     dtuple, tuple_heap);
			}

			node->row = dtuple;
			node->table = table;
			node->trx_id = trx->id;

			ut_ad(dtuple_validate(dtuple));

			/* Retry the insert as long as the error
			handler resolves the error (e.g. lock wait). */
			do {
				thr->run_node = thr;
				thr->prev_node = thr->common.parent;

				error = row_ins_index_entry(index, dtuple,
							    0, FALSE, thr);

				if (UNIV_LIKELY(error == DB_SUCCESS)) {

					goto next_rec;
				}

				thr->lock_state = QUE_THR_LOCK_ROW;
				trx->error_state = error;
				que_thr_stop_for_mysql(thr);
				thr->lock_state = QUE_THR_LOCK_NOLOCK;
			} while (row_mysql_handle_errors(&error, trx,
							 thr, NULL));

			goto err_exit;
next_rec:
			mem_heap_empty(tuple_heap);
		}
	}

	que_thr_stop_for_mysql_no_error(thr, trx);
err_exit:
	/* NOTE(review): graph_heap (holding node and offsets) appears
	to be released by que_graph_free() — confirm its ownership. */
	que_graph_free(thr->graph);

	trx->op_info = "";

	mem_heap_free(tuple_heap);

	return(error);
}
/*************************************************************************
Sets an exclusive lock on a table, for the duration of creating indexes.
Retries on lock wait until the lock is granted or a hard error occurs. */
UNIV_INTERN
ulint
row_merge_lock_table(
/*=================*/
					/* out: error code or DB_SUCCESS */
	trx_t*		trx,		/* in/out: transaction */
	dict_table_t*	table,		/* in: table to lock */
	enum lock_mode	mode)		/* in: LOCK_X or LOCK_S */
{
	mem_heap_t*	heap;
	que_thr_t*	thr;
	ulint		err;
	sel_node_t*	node;

	ut_ad(trx);
	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
	ut_ad(mode == LOCK_X || mode == LOCK_S);

	heap = mem_heap_create(512);

	trx->op_info = "setting table lock for creating or dropping index";

	node = sel_node_create(heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);
	thr->graph->state = QUE_FORK_ACTIVE;

	/* We use the select query graph as the dummy graph needed
	in the lock module call */

	thr = que_fork_get_first_thr(que_node_get_parent(thr));
	que_thr_move_to_run_state_for_mysql(thr, trx);

run_again:
	thr->run_node = thr;
	thr->prev_node = thr->common.parent;

	err = lock_table(0, table, mode, thr);

	trx->error_state = err;

	if (UNIV_LIKELY(err == DB_SUCCESS)) {
		que_thr_stop_for_mysql_no_error(thr, trx);
	} else {
		que_thr_stop_for_mysql(thr);

		if (err != DB_QUE_THR_SUSPENDED) {
			ibool	was_lock_wait;

			/* Wait for the lock (or fail); retry when the
			wait ended with the lock still obtainable. */
			was_lock_wait = row_mysql_handle_errors(
				&err, trx, thr, NULL);

			if (was_lock_wait) {
				goto run_again;
			}
		} else {
			que_thr_t*	run_thr;
			que_node_t*	parent;

			parent = que_node_get_parent(thr);
			run_thr = que_fork_start_command(parent);

			ut_a(run_thr == thr);

			/* There was a lock wait but the thread was not
			in a ready to run or running state. */
			trx->error_state = DB_LOCK_WAIT;

			goto run_again;
		}
	}

	/* NOTE(review): heap appears to be owned by the graph and
	released by que_graph_free() — confirm. */
	que_graph_free(thr->graph);
	trx->op_info = "";

	return(err);
}
/*************************************************************************
Drop an index from the InnoDB system tables.  The data dictionary must
have been locked exclusively by the caller, because the transaction
will not be committed. */
UNIV_INTERN
void
row_merge_drop_index(
/*=================*/
	dict_index_t*	index,	/* in: index to be removed */
	dict_table_t*	table,	/* in: table */
	trx_t*		trx)	/* in: transaction handle */
{
	ulint		err;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase. Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */

	static const char str1[] =
		"PROCEDURE DROP_INDEX_PROC () IS\n"
		"BEGIN\n"
		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
		"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
		"		AND TABLE_ID = :tableid;\n"
		"END;\n";

	ut_ad(index && table && trx);

	pars_info_add_dulint_literal(info, "indexid", index->id);
	pars_info_add_dulint_literal(info, "tableid", table->id);

	trx_start_if_not_started(trx);
	trx->op_info = "dropping index";

	/* The caller must hold the dictionary latch exclusively. */
	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	err = que_eval_sql(info, str1, FALSE, trx);

	ut_a(err == DB_SUCCESS);

	/* Replace this index with another equivalent index for all
	foreign key constraints on this table where this index is used */

	dict_table_replace_index_in_foreign_list(table, index);
	dict_index_remove_from_cache(table, index);

	trx->op_info = "";
}
/*************************************************************************
641.2.1 by Monty Taylor
InnoDB Plugin 1.0.2
1811
Drop those indexes which were created before an error occurred when
1812
building an index.  The data dictionary must have been locked
1813
exclusively by the caller, because the transaction will not be
1814
committed. */
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
1815
UNIV_INTERN
1816
void
1817
row_merge_drop_indexes(
1818
/*===================*/
1819
	trx_t*		trx,		/* in: transaction */
1820
	dict_table_t*	table,		/* in: table containing the indexes */
1821
	dict_index_t**	index,		/* in: indexes to drop */
1822
	ulint		num_created)	/* in: number of elements in index[] */
1823
{
1824
	ulint	key_num;
1825
1826
	for (key_num = 0; key_num < num_created; key_num++) {
1827
		row_merge_drop_index(index[key_num], table, trx);
1828
	}
1829
}
1830
1831
/*************************************************************************
Drop all partially created indexes during crash recovery.  Partially
created indexes are recognizable by the TEMP_INDEX_PREFIX byte at the
start of their name in SYS_INDEXES. */
UNIV_INTERN
void
row_merge_drop_temp_indexes(void)
/*=============================*/
{
	trx_t*		trx;
	ulint		err;

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase. Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */
#if TEMP_INDEX_PREFIX != '\377'
# error "TEMP_INDEX_PREFIX != '\377'"
#endif
	static const char drop_temp_indexes[] =
		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
		"indexid CHAR;\n"
		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
		"WHERE SUBSTR(NAME,0,1)='\377';\n"
		"BEGIN\n"
		"\tOPEN c;\n"
		"\tWHILE 1=1 LOOP\n"
		"\t\tFETCH c INTO indexid;\n"
		"\t\tIF (SQL % NOTFOUND) THEN\n"
		"\t\t\tEXIT;\n"
		"\t\tEND IF;\n"
		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
		"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
		"\tEND LOOP;\n"
		"\tCLOSE c;\n"
		"\tCOMMIT WORK;\n"
		"END;\n";

	trx = trx_allocate_for_background();
	trx->op_info = "dropping partially created indexes";
	row_mysql_lock_data_dictionary(trx);

	/* Incomplete transactions may be holding some locks on the
	data dictionary tables.  However, they should never have been
	able to lock the records corresponding to the partially
	created indexes that we are attempting to delete, because the
	table was locked when the indexes were being created.  We will
	drop the partially created indexes before the rollback of
	incomplete transactions is initiated.  Thus, this should not
	interfere with the incomplete transactions. */
	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
	err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
	ut_a(err == DB_SUCCESS);

	row_mysql_unlock_data_dictionary(trx);
	trx_free_for_background(trx);
}
/*************************************************************************
1888
Create a merge file. */
1889
static
1890
void
1891
row_merge_file_create(
1892
/*==================*/
1893
	merge_file_t*	merge_file)	/* out: merge file structure */
1894
{
1895
	merge_file->fd = innobase_mysql_tmpfile();
1896
	merge_file->offset = 0;
1897
}
1898
1899
/*************************************************************************
1900
Destroy a merge file. */
1901
static
1902
void
1903
row_merge_file_destroy(
1904
/*===================*/
1905
	merge_file_t*	merge_file)	/* out: merge file structure */
1906
{
1907
	if (merge_file->fd != -1) {
1908
		close(merge_file->fd);
1909
		merge_file->fd = -1;
1910
	}
1911
}
1912
1913
/*************************************************************************
1914
Determine the precise type of a column that is added to a tem
1915
if a column must be constrained NOT NULL. */
1916
UNIV_INLINE
1917
ulint
1918
row_merge_col_prtype(
1919
/*=================*/
1920
						/* out: col->prtype, possibly
1921
						ORed with DATA_NOT_NULL */
1922
	const dict_col_t*	col,		/* in: column */
1923
	const char*		col_name,	/* in: name of the column */
1924
	const merge_index_def_t*index_def)	/* in: the index definition
1925
						of the primary key */
1926
{
1927
	ulint	prtype = col->prtype;
1928
	ulint	i;
1929
1930
	ut_ad(index_def->ind_type & DICT_CLUSTERED);
1931
1932
	if (prtype & DATA_NOT_NULL) {
1933
1934
		return(prtype);
1935
	}
1936
1937
	/* All columns that are included
1938
	in the PRIMARY KEY must be NOT NULL. */
1939
1940
	for (i = 0; i < index_def->n_fields; i++) {
1941
		if (!strcmp(col_name, index_def->fields[i].field_name)) {
1942
			return(prtype | DATA_NOT_NULL);
1943
		}
1944
	}
1945
1946
	return(prtype);
1947
}
1948
1949
/*************************************************************************
1950
Create a temporary table for creating a primary key, using the definition
1951
of an existing table. */
1952
UNIV_INTERN
1953
dict_table_t*
1954
row_merge_create_temporary_table(
1955
/*=============================*/
1956
						/* out: table,
1957
						or NULL on error */
1958
	const char*		table_name,	/* in: new table name */
1959
	const merge_index_def_t*index_def,	/* in: the index definition
1960
						of the primary key */
1961
	const dict_table_t*	table,		/* in: old table definition */
1962
	trx_t*			trx)		/* in/out: transaction
1963
						(sets error_state) */
1964
{
1965
	ulint		i;
1966
	dict_table_t*	new_table = NULL;
1967
	ulint		n_cols = dict_table_get_n_user_cols(table);
1968
	ulint		error;
1969
	mem_heap_t*	heap = mem_heap_create(1000);
1970
1971
	ut_ad(table_name);
1972
	ut_ad(index_def);
1973
	ut_ad(table);
1974
	ut_ad(mutex_own(&dict_sys->mutex));
1975
1976
	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
1977
1978
	for (i = 0; i < n_cols; i++) {
1979
		const dict_col_t*	col;
1980
		const char*		col_name;
1981
1982
		col = dict_table_get_nth_col(table, i);
1983
		col_name = dict_table_get_col_name(table, i);
1984
1985
		dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
1986
				       row_merge_col_prtype(col, col_name,
1987
							    index_def),
1988
				       col->len);
1989
	}
1990
1991
	error = row_create_table_for_mysql(new_table, trx);
1992
	mem_heap_free(heap);
1993
1994
	if (error != DB_SUCCESS) {
1995
		trx->error_state = error;
1996
		new_table = NULL;
1997
	}
1998
1999
	return(new_table);
2000
}
2001
2002
/*************************************************************************
Rename the temporary indexes in the dictionary to permanent ones.  The
data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed. */
UNIV_INTERN
ulint
row_merge_rename_indexes(
/*=====================*/
					/* out: DB_SUCCESS if all OK */
	trx_t*		trx,		/* in/out: transaction */
	dict_table_t*	table)		/* in/out: table with new indexes */
{
	ulint		err = DB_SUCCESS;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in renaming indexes. */

#if TEMP_INDEX_PREFIX != '\377'
# error "TEMP_INDEX_PREFIX != '\377'"
#endif

	/* Strip the leading TEMP_INDEX_PREFIX byte from the names of
	this table's temporary indexes in SYS_INDEXES. */
	static const char rename_indexes[] =
		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
		"BEGIN\n"
		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
		"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='\377';\n"
		"END;\n";

	ut_ad(table);
	ut_ad(trx);
	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	trx->op_info = "renaming indexes";

	pars_info_add_dulint_literal(info, "tableid", table->id);

	err = que_eval_sql(info, rename_indexes, FALSE, trx);

	if (err == DB_SUCCESS) {
		/* Keep the dictionary cache in sync: skip the prefix
		byte in the cached index names as well. */
		dict_index_t*	index = dict_table_get_first_index(table);
		do {
			if (*index->name == TEMP_INDEX_PREFIX) {
				index->name++;
			}
			index = dict_table_get_next_index(index);
		} while (index);
	}

	trx->op_info = "";

	return(err);
}
/*************************************************************************
641.2.1 by Monty Taylor
InnoDB Plugin 1.0.2
2057
Rename the tables in the data dictionary.  The data dictionary must
2058
have been locked exclusively by the caller, because the transaction
2059
will not be committed. */
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
2060
UNIV_INTERN
2061
ulint
2062
row_merge_rename_tables(
2063
/*====================*/
2064
					/* out: error code or DB_SUCCESS */
2065
	dict_table_t*	old_table,	/* in/out: old table, renamed to
2066
					tmp_name */
2067
	dict_table_t*	new_table,	/* in/out: new table, renamed to
2068
					old_table->name */
2069
	const char*	tmp_name,	/* in: new name for old_table */
2070
	trx_t*		trx)		/* in: transaction handle */
2071
{
2072
	ulint		err	= DB_ERROR;
2073
	pars_info_t*	info;
2074
	const char*	old_name= old_table->name;
2075
2076
	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
2077
	ut_ad(old_table != new_table);
2078
	ut_ad(mutex_own(&dict_sys->mutex));
2079
641.2.1 by Monty Taylor
InnoDB Plugin 1.0.2
2080
	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2081
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
2082
	trx->op_info = "renaming tables";
2083
2084
	/* We use the private SQL parser of Innobase to generate the query
2085
	graphs needed in updating the dictionary data in system tables. */
2086
2087
	info = pars_info_create();
2088
2089
	pars_info_add_str_literal(info, "new_name", new_table->name);
2090
	pars_info_add_str_literal(info, "old_name", old_name);
2091
	pars_info_add_str_literal(info, "tmp_name", tmp_name);
2092
2093
	err = que_eval_sql(info,
2094
			   "PROCEDURE RENAME_TABLES () IS\n"
2095
			   "BEGIN\n"
2096
			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
2097
			   " WHERE NAME = :old_name;\n"
2098
			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
2099
			   " WHERE NAME = :new_name;\n"
2100
			   "END;\n", FALSE, trx);
2101
2102
	if (err != DB_SUCCESS) {
2103
2104
		goto err_exit;
2105
	}
2106
2107
	/* The following calls will also rename the .ibd data files if
2108
	the tables are stored in a single-table tablespace */
2109
2110
	if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
2111
	    || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
2112
2113
		err = DB_ERROR;
2114
		goto err_exit;
2115
	}
2116
2117
	err = dict_load_foreigns(old_name, TRUE);
2118
2119
	if (err != DB_SUCCESS) {
2120
err_exit:
2121
		trx->error_state = DB_SUCCESS;
2122
		trx_general_rollback_for_mysql(trx, FALSE, NULL);
2123
		trx->error_state = DB_SUCCESS;
2124
	}
2125
2126
	trx->op_info = "";
2127
2128
	return(err);
2129
}
2130
2131
/*************************************************************************
2132
Create and execute a query graph for creating an index. */
2133
static
2134
ulint
2135
row_merge_create_index_graph(
2136
/*=========================*/
2137
					/* out: DB_SUCCESS or error code */
2138
	trx_t*		trx,		/* in: trx */
2139
	dict_table_t*	table,		/* in: table */
2140
	dict_index_t*	index)		/* in: index */
2141
{
2142
	ind_node_t*	node;		/* Index creation node */
2143
	mem_heap_t*	heap;		/* Memory heap */
2144
	que_thr_t*	thr;		/* Query thread */
2145
	ulint		err;
2146
2147
	ut_ad(trx);
2148
	ut_ad(table);
2149
	ut_ad(index);
2150
2151
	heap = mem_heap_create(512);
2152
2153
	index->table = table;
2154
	node = ind_create_graph_create(index, heap);
2155
	thr = pars_complete_graph_for_exec(node, trx, heap);
2156
2157
	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
2158
2159
	que_run_threads(thr);
2160
2161
	err = trx->error_state;
2162
2163
	que_graph_free((que_t*) que_node_get_parent(thr));
2164
2165
	return(err);
2166
}
2167
2168
/*************************************************************************
2169
Create the index and load in to the dictionary. */
2170
UNIV_INTERN
2171
dict_index_t*
2172
row_merge_create_index(
2173
/*===================*/
2174
					/* out: index, or NULL on error */
2175
	trx_t*		trx,		/* in/out: trx (sets error_state) */
2176
	dict_table_t*	table,		/* in: the index is on this table */
2177
	const merge_index_def_t*	/* in: the index definition */
2178
			index_def)
2179
{
2180
	dict_index_t*	index;
2181
	ulint		err;
2182
	ulint		n_fields = index_def->n_fields;
2183
	ulint		i;
2184
2185
	/* Create the index prototype, using the passed in def, this is not
2186
	a persistent operation. We pass 0 as the space id, and determine at
2187
	a lower level the space id where to store the table. */
2188
2189
	index = dict_mem_index_create(table->name, index_def->name,
2190
				      0, index_def->ind_type, n_fields);
2191
2192
	ut_a(index);
2193
2194
	for (i = 0; i < n_fields; i++) {
2195
		merge_index_field_t*	ifield = &index_def->fields[i];
2196
2197
		dict_mem_index_add_field(index, ifield->field_name,
2198
					 ifield->prefix_len);
2199
	}
2200
641.2.1 by Monty Taylor
InnoDB Plugin 1.0.2
2201
	/* Add the index to SYS_INDEXES, using the index prototype. */
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
2202
	err = row_merge_create_index_graph(trx, table, index);
2203
2204
	if (err == DB_SUCCESS) {
2205
2206
		index = row_merge_dict_table_get_index(
2207
			table, index_def);
2208
2209
		ut_a(index);
2210
2211
#ifdef ROW_MERGE_IS_INDEX_USABLE
2212
		/* Note the id of the transaction that created this
2213
		index, we use it to restrict readers from accessing
2214
		this index, to ensure read consistency. */
2215
		index->trx_id = trx->id;
2216
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2217
	} else {
2218
		index = NULL;
2219
	}
2220
2221
	return(index);
2222
}
2223
2224
#ifdef ROW_MERGE_IS_INDEX_USABLE
/*************************************************************************
Check if a transaction can use an index. */
UNIV_INTERN
ibool
row_merge_is_index_usable(
/*======================*/
	const trx_t*		trx,	/* in: transaction */
	const dict_index_t*	index)	/* in: index to check */
{
	/* Without a read view the transaction sees the latest state,
	so any index is usable. */
	if (!trx->read_view) {
		return(TRUE);
	}

	/* The index is usable only if it was created by a transaction
	that had committed before this read view was opened. */
	return(ut_dulint_cmp(index->trx_id, trx->read_view->low_limit_id) < 0);
}
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2241
2242
/*************************************************************************
2243
Drop the old table. */
2244
UNIV_INTERN
2245
ulint
2246
row_merge_drop_table(
2247
/*=================*/
2248
					/* out: DB_SUCCESS or error code */
2249
	trx_t*		trx,		/* in: transaction */
2250
	dict_table_t*	table)		/* in: table to drop */
2251
{
2252
	/* There must be no open transactions on the table. */
2253
	ut_a(table->n_mysql_handles_opened == 0);
2254
641.2.1 by Monty Taylor
InnoDB Plugin 1.0.2
2255
	return(row_drop_table_for_mysql(table->name, trx, FALSE));
641.1.2 by Monty Taylor
Imported 1.0.1 with clean - with no changes.
2256
}
2257
2258
/*************************************************************************
2259
Build indexes on a table by reading a clustered index,
2260
creating a temporary file containing index entries, merge sorting
2261
these index entries and inserting sorted index entries to indexes. */
2262
UNIV_INTERN
2263
ulint
2264
row_merge_build_indexes(
2265
/*====================*/
2266
					/* out: DB_SUCCESS or error code */
2267
	trx_t*		trx,		/* in: transaction */
2268
	dict_table_t*	old_table,	/* in: table where rows are
2269
					read from */
2270
	dict_table_t*	new_table,	/* in: table where indexes are
2271
					created; identical to old_table
2272
					unless creating a PRIMARY KEY */
2273
	dict_index_t**	indexes,	/* in: indexes to be created */
2274
	ulint		n_indexes,	/* in: size of indexes[] */
2275
	TABLE*		table)		/* in/out: MySQL table, for
2276
					reporting erroneous key value
2277
					if applicable */
2278
{
2279
	merge_file_t*		merge_files;
2280
	row_merge_block_t*	block;
2281
	ulint			block_size;
2282
	ulint			i;
2283
	ulint			error;
2284
	int			tmpfd;
2285
2286
	ut_ad(trx);
2287
	ut_ad(old_table);
2288
	ut_ad(new_table);
2289
	ut_ad(indexes);
2290
	ut_ad(n_indexes);
2291
2292
	trx_start_if_not_started(trx);
2293
2294
	/* Allocate memory for merge file data structure and initialize
2295
	fields */
2296
2297
	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
2298
	block_size = 3 * sizeof *block;
2299
	block = os_mem_alloc_large(&block_size);
2300
2301
	for (i = 0; i < n_indexes; i++) {
2302
2303
		row_merge_file_create(&merge_files[i]);
2304
	}
2305
2306
	tmpfd = innobase_mysql_tmpfile();
2307
2308
	/* Reset the MySQL row buffer that is used when reporting
2309
	duplicate keys. */
2310
	innobase_rec_reset(table);
2311
2312
	/* Read clustered index of the table and create files for
2313
	secondary index entries for merge sort */
2314
2315
	error = row_merge_read_clustered_index(
2316
		trx, table, old_table, new_table, indexes,
2317
		merge_files, n_indexes, block);
2318
2319
	if (error != DB_SUCCESS) {
2320
2321
		goto func_exit;
2322
	}
2323
2324
	/* Now we have files containing index entries ready for
2325
	sorting and inserting. */
2326
2327
	for (i = 0; i < n_indexes; i++) {
2328
		error = row_merge_sort(indexes[i], &merge_files[i],
2329
				       block, &tmpfd, table);
2330
2331
		if (error == DB_SUCCESS) {
2332
			error = row_merge_insert_index_tuples(
2333
				trx, indexes[i], new_table,
2334
				dict_table_zip_size(old_table),
2335
				merge_files[i].fd, block);
2336
		}
2337
2338
		/* Close the temporary file to free up space. */
2339
		row_merge_file_destroy(&merge_files[i]);
2340
2341
		if (error != DB_SUCCESS) {
2342
			trx->error_key_num = i;
2343
			goto func_exit;
2344
		}
2345
	}
2346
2347
func_exit:
2348
	close(tmpfd);
2349
2350
	for (i = 0; i < n_indexes; i++) {
2351
		row_merge_file_destroy(&merge_files[i]);
2352
	}
2353
2354
	mem_free(merge_files);
2355
	os_mem_free_large(block, block_size);
2356
2357
	return(error);
2358
}