1
/*****************************************************************************
3
Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15
St, Fifth Floor, Boston, MA 02110-1301 USA
17
*****************************************************************************/
19
/**************************************************//**
21
New index creation routines using a merge sort
23
Created 12/4/2005 Jan Lindstrom
24
Completed by Sunny Bains and Marko Makela
25
*******************************************************/
27
#include "row0merge.h"
33
#include "dict0dict.h"
35
#include "dict0boot.h"
36
#include "dict0crea.h"
37
#include "dict0load.h"
39
#include "mach0data.h"
44
#include "trx0purge.h"
48
#include "read0read.h"
50
#include "lock0lock.h"
51
#include "data0data.h"
52
#include "data0type.h"
54
#include "pars0pars.h"
58
#include "handler0alter.h"
62
/** Set these in order ot enable debug printout. */
64
/** Log the outcome of each row_merge_cmp() call, comparing records. */
65
static ibool row_merge_print_cmp;
66
/** Log each record read from temporary file. */
67
static ibool row_merge_print_read;
68
/** Log each record write to temporary file. */
69
static ibool row_merge_print_write;
70
/** Log each row_merge_blocks() call, merging two blocks of records to
72
static ibool row_merge_print_block;
73
/** Log each block read from temporary file. */
74
static ibool row_merge_print_block_read;
75
/** Log each block read from temporary file. */
76
static ibool row_merge_print_block_write;
78
#endif /* UNIV_DEBUG */
80
/** @brief Block size for I/O operations in merge sort.
82
The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
83
rounded to a power of 2.
85
When not creating a PRIMARY KEY that contains column prefixes, this
86
can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
87
ut_ad(data_size < sizeof(row_merge_block_t)). */
88
typedef byte row_merge_block_t[1048576];
90
/** @brief Secondary buffer for I/O operations of merge records.
92
This buffer is used for writing or reading a record that spans two
93
row_merge_block_t. Thus, it must be able to hold one merge record,
94
whose maximum size is the same as the minimum size of
96
typedef byte mrec_buf_t[UNIV_PAGE_SIZE];
98
/** @brief Merge record in row_merge_block_t.
100
The format is the same as a record in ROW_FORMAT=COMPACT with the
101
exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
104
/** Buffer for sorting in main memory. */
105
struct row_merge_buf_struct {
106
mem_heap_t* heap; /*!< memory heap where allocated */
107
dict_index_t* index; /*!< the index the tuples belong to */
108
ulint total_size; /*!< total amount of data bytes */
109
ulint n_tuples; /*!< number of data tuples */
110
ulint max_tuples; /*!< maximum number of data tuples */
111
const dfield_t**tuples; /*!< array of pointers to
112
arrays of fields that form
114
const dfield_t**tmp_tuples; /*!< temporary copy of tuples,
118
/** Buffer for sorting in main memory. */
119
typedef struct row_merge_buf_struct row_merge_buf_t;
121
/** Information about temporary files used in merge sort */
122
struct merge_file_struct {
123
int fd; /*!< file descriptor */
124
ulint offset; /*!< file offset (end of file) */
125
ib_uint64_t n_rec; /*!< number of records in the file */
128
/** Information about temporary files used in merge sort */
129
typedef struct merge_file_struct merge_file_t;
132
/******************************************************//**
133
Display a merge tuple. */
136
row_merge_tuple_print(
137
/*==================*/
138
FILE* f, /*!< in: output stream */
139
const dfield_t* entry, /*!< in: tuple to print */
140
ulint n_fields)/*!< in: number of fields in the tuple */
144
for (j = 0; j < n_fields; j++) {
145
const dfield_t* field = &entry[j];
147
if (dfield_is_null(field)) {
148
fputs("\n NULL;", f);
150
ulint field_len = dfield_get_len(field);
151
ulint len = ut_min(field_len, 20);
152
if (dfield_is_ext(field)) {
157
ut_print_buf(f, dfield_get_data(field), len);
158
if (len != field_len) {
159
fprintf(f, " (total %lu bytes)", field_len);
165
#endif /* UNIV_DEBUG */
167
/******************************************************//**
168
Allocate a sort buffer.
169
@return own: sort buffer */
172
row_merge_buf_create_low(
173
/*=====================*/
174
mem_heap_t* heap, /*!< in: heap where allocated */
175
dict_index_t* index, /*!< in: secondary index */
176
ulint max_tuples, /*!< in: maximum number of data tuples */
177
ulint buf_size) /*!< in: size of the buffer, in bytes */
179
row_merge_buf_t* buf;
181
ut_ad(max_tuples > 0);
182
ut_ad(max_tuples <= sizeof(row_merge_block_t));
183
ut_ad(max_tuples < buf_size);
185
buf = mem_heap_zalloc(heap, buf_size);
188
buf->max_tuples = max_tuples;
189
buf->tuples = mem_heap_alloc(heap,
190
2 * max_tuples * sizeof *buf->tuples);
191
buf->tmp_tuples = buf->tuples + max_tuples;
196
/******************************************************//**
197
Allocate a sort buffer.
198
@return own: sort buffer */
201
row_merge_buf_create(
202
/*=================*/
203
dict_index_t* index) /*!< in: secondary index */
205
row_merge_buf_t* buf;
210
max_tuples = sizeof(row_merge_block_t)
211
/ ut_max(1, dict_index_get_min_size(index));
213
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
215
heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
217
buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
222
/******************************************************//**
224
@return sort buffer */
229
row_merge_buf_t* buf) /*!< in,own: sort buffer */
232
ulint max_tuples = buf->max_tuples;
233
mem_heap_t* heap = buf->heap;
234
dict_index_t* index = buf->index;
236
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
238
mem_heap_empty(heap);
240
return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
243
/******************************************************//**
244
Deallocate a sort buffer. */
249
row_merge_buf_t* buf) /*!< in,own: sort buffer, to be freed */
251
mem_heap_free(buf->heap);
254
/******************************************************//**
255
Insert a data tuple into a sort buffer.
256
@return TRUE if added, FALSE if out of space */
261
row_merge_buf_t* buf, /*!< in/out: sort buffer */
262
const dtuple_t* row, /*!< in: row in clustered index */
263
const row_ext_t* ext) /*!< in: cache of externally stored
264
column prefixes, or NULL */
270
const dict_index_t* index;
274
if (buf->n_tuples >= buf->max_tuples) {
278
UNIV_PREFETCH_R(row->fields);
282
n_fields = dict_index_get_n_fields(index);
284
entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
285
buf->tuples[buf->n_tuples] = entry;
289
extra_size = UT_BITS_IN_BYTES(index->n_nullable);
291
for (i = 0; i < n_fields; i++, field++) {
292
const dict_field_t* ifield;
293
const dict_col_t* col;
295
const dfield_t* row_field;
298
ifield = dict_index_get_nth_field(index, i);
300
col_no = dict_col_get_no(col);
301
row_field = dtuple_get_nth_field(row, col_no);
302
dfield_copy(field, row_field);
303
len = dfield_get_len(field);
305
if (dfield_is_null(field)) {
306
ut_ad(!(col->prtype & DATA_NOT_NULL));
308
} else if (UNIV_LIKELY(!ext)) {
309
} else if (dict_index_is_clust(index)) {
310
/* Flag externally stored fields. */
311
const byte* buf = row_ext_lookup(ext, col_no,
313
if (UNIV_LIKELY_NULL(buf)) {
314
ut_a(buf != field_ref_zero);
315
if (i < dict_index_get_n_unique(index)) {
316
dfield_set_data(field, buf, len);
318
dfield_set_ext(field);
319
len = dfield_get_len(field);
323
const byte* buf = row_ext_lookup(ext, col_no,
325
if (UNIV_LIKELY_NULL(buf)) {
326
ut_a(buf != field_ref_zero);
327
dfield_set_data(field, buf, len);
331
/* If a column prefix index, take only the prefix */
333
if (ifield->prefix_len) {
334
len = dtype_get_at_most_n_mbchars(
336
col->mbminlen, col->mbmaxlen,
338
len, dfield_get_data(field));
339
dfield_set_len(field, len);
342
ut_ad(len <= col->len || col->mtype == DATA_BLOB);
344
if (ifield->fixed_len) {
345
ut_ad(len == ifield->fixed_len);
346
ut_ad(!dfield_is_ext(field));
347
} else if (dfield_is_ext(field)) {
350
|| (col->len < 256 && col->mtype != DATA_BLOB)) {
353
/* For variable-length columns, we look up the
354
maximum length from the column itself. If this
355
is a prefix index column shorter than 256 bytes,
356
this will waste one byte. */
367
size = rec_get_converted_size_comp(index,
369
entry, n_fields, &extra);
371
ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
372
ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
374
#endif /* UNIV_DEBUG */
376
/* Add to the total size of the record in row_merge_block_t
377
the encoded length of extra_size and the extra bytes (extra_size).
378
See row_merge_buf_write() for the variable-length encoding
380
data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
382
/* The following assertion may fail if row_merge_block_t is
383
declared very small and a PRIMARY KEY is being created with
384
many prefix columns. In that case, the record may exceed the
385
page_zip_rec_needs_ext() limit. However, no further columns
386
will be moved to external storage until the record is inserted
387
to the clustered index B-tree. */
388
ut_ad(data_size < sizeof(row_merge_block_t));
390
/* Reserve one byte for the end marker of row_merge_block_t. */
391
if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
395
buf->total_size += data_size;
400
/* Copy the data fields. */
403
dfield_dup(field++, buf->heap);
404
} while (--n_fields);
409
/** Structure for reporting duplicate records. */
410
struct row_merge_dup_struct {
411
const dict_index_t* index; /*!< index being sorted */
412
TABLE* table; /*!< MySQL table object */
413
ulint n_dup; /*!< number of duplicates */
416
/** Structure for reporting duplicate records. */
417
typedef struct row_merge_dup_struct row_merge_dup_t;
419
/*************************************************************//**
420
Report a duplicate key. */
423
row_merge_dup_report(
424
/*=================*/
425
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
426
const dfield_t* entry) /*!< in: duplicate index entry */
429
const dtuple_t* tuple;
430
dtuple_t tuple_store;
432
const dict_index_t* index = dup->index;
433
ulint n_fields= dict_index_get_n_fields(index);
439
/* Only report the first duplicate record,
440
but count all duplicate records. */
444
/* Convert the tuple to a record and then to MySQL format. */
445
heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
449
buf = mem_heap_alloc(heap, sizeof *buf);
451
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
452
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
454
rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
455
offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
457
innobase_rec_to_mysql(dup->table, rec, index, offsets);
462
/*************************************************************//**
464
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
469
ulint n_field,/*!< in: number of fields */
470
const dfield_t* a, /*!< in: first tuple to be compared */
471
const dfield_t* b, /*!< in: second tuple to be compared */
472
row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
475
const dfield_t* field = a;
477
/* Compare the fields of the tuples until a difference is
478
found or we run out of fields to compare. If !cmp at the
479
end, the tuples are equal. */
481
cmp = cmp_dfield_dfield(a++, b++);
482
} while (!cmp && --n_field);
484
if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
485
/* Report a duplicate value error if the tuples are
486
logically equal. NULL columns are logically inequal,
487
although they are equal in the sorting order. Find
488
out if any of the fields are NULL. */
489
for (b = field; b != a; b++) {
490
if (dfield_is_null(b)) {
496
row_merge_dup_report(dup, field);
503
/** Wrapper for row_merge_tuple_sort() to inject some more context to
504
UT_SORT_FUNCTION_BODY().
505
@param a array of tuples that being sorted
506
@param b aux (work area), same size as tuples[]
507
@param c lower bound of the sorting area, inclusive
508
@param d upper bound of the sorting area, inclusive */
509
#define row_merge_tuple_sort_ctx(a,b,c,d) \
510
row_merge_tuple_sort(n_field, dup, a, b, c, d)
511
/** Wrapper for row_merge_tuple_cmp() to inject some more context to
512
UT_SORT_FUNCTION_BODY().
513
@param a first tuple to be compared
514
@param b second tuple to be compared
515
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
516
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
518
/**********************************************************************//**
519
Merge sort the tuple buffer in main memory. */
522
row_merge_tuple_sort(
523
/*=================*/
524
ulint n_field,/*!< in: number of fields */
525
row_merge_dup_t* dup, /*!< in/out: for reporting duplicates */
526
const dfield_t** tuples, /*!< in/out: tuples */
527
const dfield_t** aux, /*!< in/out: work area */
528
ulint low, /*!< in: lower bound of the
529
sorting area, inclusive */
530
ulint high) /*!< in: upper bound of the
531
sorting area, exclusive */
533
UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
534
tuples, aux, low, high, row_merge_tuple_cmp_ctx);
537
/******************************************************//**
543
row_merge_buf_t* buf, /*!< in/out: sort buffer */
544
row_merge_dup_t* dup) /*!< in/out: for reporting duplicates */
546
row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
547
buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
550
/******************************************************//**
551
Write a buffer to a block. */
556
const row_merge_buf_t* buf, /*!< in: sorted buffer */
558
const merge_file_t* of, /*!< in: output file */
559
#endif /* UNIV_DEBUG */
560
row_merge_block_t* block) /*!< out: buffer for writing to file */
562
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
563
#endif /* !UNIV_DEBUG */
565
const dict_index_t* index = buf->index;
566
ulint n_fields= dict_index_get_n_fields(index);
567
byte* b = &(*block)[0];
571
for (i = 0; i < buf->n_tuples; i++) {
574
const dfield_t* entry = buf->tuples[i];
576
size = rec_get_converted_size_comp(index,
580
ut_ad(size > extra_size);
581
ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
582
extra_size -= REC_N_NEW_EXTRA_BYTES;
583
size -= REC_N_NEW_EXTRA_BYTES;
585
/* Encode extra_size + 1 */
586
if (extra_size + 1 < 0x80) {
587
*b++ = (byte) (extra_size + 1);
589
ut_ad((extra_size + 1) < 0x8000);
590
*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
591
*b++ = (byte) (extra_size + 1);
594
ut_ad(b + size < block[1]);
596
rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
603
if (row_merge_print_write) {
604
fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
605
(void*) b, of->fd, (ulong) of->offset,
607
row_merge_tuple_print(stderr, entry, n_fields);
609
#endif /* UNIV_DEBUG */
612
/* Write an "end-of-chunk" marker. */
614
ut_a(b == block[0] + buf->total_size);
616
#ifdef UNIV_DEBUG_VALGRIND
617
/* The rest of the block is uninitialized. Initialize it
618
to avoid bogus warnings. */
619
memset(b, 0xff, block[1] - b);
620
#endif /* UNIV_DEBUG_VALGRIND */
622
if (row_merge_print_write) {
623
fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
624
(void*) b, of->fd, (ulong) of->offset);
626
#endif /* UNIV_DEBUG */
629
/******************************************************//**
630
Create a memory heap and allocate space for row_merge_rec_offsets()
632
@return memory heap */
635
row_merge_heap_create(
636
/*==================*/
637
const dict_index_t* index, /*!< in: record descriptor */
638
mrec_buf_t** buf, /*!< out: 3 buffers */
639
ulint** offsets1, /*!< out: offsets */
640
ulint** offsets2) /*!< out: offsets */
642
ulint i = 1 + REC_OFFS_HEADER_SIZE
643
+ dict_index_get_n_fields(index);
644
mem_heap_t* heap = mem_heap_create(2 * i * sizeof **offsets1
647
*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
648
*offsets1 = mem_heap_alloc(heap, i * sizeof **offsets1);
649
*offsets2 = mem_heap_alloc(heap, i * sizeof **offsets2);
651
(*offsets1)[0] = (*offsets2)[0] = i;
652
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
657
/**********************************************************************//**
658
Search an index object by name and column names. If several indexes match,
659
return the index with the max id.
660
@return matching index, NULL if not found */
663
row_merge_dict_table_get_index(
664
/*===========================*/
665
dict_table_t* table, /*!< in: table */
666
const merge_index_def_t*index_def) /*!< in: index definition */
670
const char** column_names;
672
column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
674
for (i = 0; i < index_def->n_fields; ++i) {
675
column_names[i] = index_def->fields[i].field_name;
678
index = dict_table_get_index_by_max_id(
679
table, index_def->name, column_names, index_def->n_fields);
681
mem_free((void*) column_names);
686
/********************************************************************//**
687
Read a merge block from the file system.
688
@return TRUE if request was successful, FALSE if fail */
693
int fd, /*!< in: file descriptor */
694
ulint offset, /*!< in: offset where to read */
695
row_merge_block_t* buf) /*!< out: data */
697
ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
701
if (row_merge_print_block_read) {
702
fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
705
#endif /* UNIV_DEBUG */
707
success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
708
(ulint) (ofs & 0xFFFFFFFF),
711
if (UNIV_UNLIKELY(!success)) {
712
ut_print_timestamp(stderr);
714
" InnoDB: failed to read merge block at %"PRIu64"\n", ofs);
717
return(UNIV_LIKELY(success));
720
/********************************************************************//**
721
Write a merge block to the file system.
722
@return TRUE if request was successful, FALSE if fail */
727
int fd, /*!< in: file descriptor */
728
ulint offset, /*!< in: offset where to read
729
in number of row_merge_block_t
731
const void* buf) /*!< in: data */
733
ib_uint64_t ofs = ((ib_uint64_t) offset)
734
* sizeof(row_merge_block_t);
737
if (row_merge_print_block_write) {
738
fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
741
#endif /* UNIV_DEBUG */
743
return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
744
(ulint) (ofs & 0xFFFFFFFF),
746
sizeof(row_merge_block_t))));
749
/********************************************************************//**
751
@return pointer to next record, or NULL on I/O error or end of list */
752
static __attribute__((nonnull))
756
row_merge_block_t* block, /*!< in/out: file buffer */
757
mrec_buf_t* buf, /*!< in/out: secondary buffer */
758
const byte* b, /*!< in: pointer to record */
759
const dict_index_t* index, /*!< in: index of the record */
760
int fd, /*!< in: file descriptor */
761
ulint* foffs, /*!< in/out: file offset */
762
const mrec_t** mrec, /*!< out: pointer to merge record,
763
or NULL on end of list
764
(non-NULL on I/O error) */
765
ulint* offsets)/*!< out: offsets of mrec */
773
ut_ad(b >= block[0]);
780
ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
781
+ dict_index_get_n_fields(index));
785
if (UNIV_UNLIKELY(!extra_size)) {
789
if (row_merge_print_read) {
790
fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
791
(const void*) b, (const void*) block,
794
#endif /* UNIV_DEBUG */
798
if (extra_size >= 0x80) {
799
/* Read another byte of extra_size. */
801
if (UNIV_UNLIKELY(b >= block[1])) {
802
if (!row_merge_read(fd, ++(*foffs), block)) {
804
/* Signal I/O error. */
809
/* Wrap around to the beginning of the buffer. */
813
extra_size = (extra_size & 0x7f) << 8;
817
/* Normalize extra_size. Above, value 0 signals "end of list". */
820
/* Read the extra bytes. */
822
if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
823
/* The record spans two blocks. Copy the entire record
824
to the auxiliary buffer and handle this as a special
827
avail_size = block[1] - b;
829
memcpy(*buf, b, avail_size);
831
if (!row_merge_read(fd, ++(*foffs), block)) {
836
/* Wrap around to the beginning of the buffer. */
839
/* Copy the record. */
840
memcpy(*buf + avail_size, b, extra_size - avail_size);
841
b += extra_size - avail_size;
843
*mrec = *buf + extra_size;
845
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
847
data_size = rec_offs_data_size(offsets);
849
/* These overflows should be impossible given that
850
records are much smaller than either buffer, and
851
the record starts near the beginning of each buffer. */
852
ut_a(extra_size + data_size < sizeof *buf);
853
ut_a(b + data_size < block[1]);
855
/* Copy the data bytes. */
856
memcpy(*buf + extra_size, b, data_size);
862
*mrec = b + extra_size;
864
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
866
data_size = rec_offs_data_size(offsets);
867
ut_ad(extra_size + data_size < sizeof *buf);
869
b += extra_size + data_size;
871
if (UNIV_LIKELY(b < block[1])) {
872
/* The record fits entirely in the block.
873
This is the normal case. */
877
/* The record spans two blocks. Copy it to buf. */
879
b -= extra_size + data_size;
880
avail_size = block[1] - b;
881
memcpy(*buf, b, avail_size);
882
*mrec = *buf + extra_size;
884
/* We cannot invoke rec_offs_make_valid() here, because there
885
are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
886
Similarly, rec_offs_validate() would fail, because it invokes
888
offsets[2] = (ulint) *mrec;
889
offsets[3] = (ulint) index;
890
#endif /* UNIV_DEBUG */
892
if (!row_merge_read(fd, ++(*foffs), block)) {
897
/* Wrap around to the beginning of the buffer. */
900
/* Copy the rest of the record. */
901
memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
902
b += extra_size + data_size - avail_size;
906
if (row_merge_print_read) {
907
fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
908
(const void*) b, (const void*) block,
910
rec_print_comp(stderr, *mrec, offsets);
913
#endif /* UNIV_DEBUG */
918
/********************************************************************//**
919
Write a merge record. */
922
row_merge_write_rec_low(
923
/*====================*/
924
byte* b, /*!< out: buffer */
925
ulint e, /*!< in: encoded extra_size */
927
ulint size, /*!< in: total size to write */
928
int fd, /*!< in: file descriptor */
929
ulint foffs, /*!< in: file offset */
930
#endif /* UNIV_DEBUG */
931
const mrec_t* mrec, /*!< in: record to write */
932
const ulint* offsets)/*!< in: offsets of mrec */
934
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
935
row_merge_write_rec_low(b, e, mrec, offsets)
936
#endif /* !UNIV_DEBUG */
939
const byte* const end = b + size;
940
ut_ad(e == rec_offs_extra_size(offsets) + 1);
942
if (row_merge_print_write) {
943
fprintf(stderr, "row_merge_write %p,%d,%lu ",
944
(void*) b, fd, (ulong) foffs);
945
rec_print_comp(stderr, mrec, offsets);
948
#endif /* UNIV_DEBUG */
953
*b++ = (byte) (0x80 | (e >> 8));
957
memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
958
ut_ad(b + rec_offs_size(offsets) == end);
961
/********************************************************************//**
962
Write a merge record.
963
@return pointer to end of block, or NULL on error */
968
row_merge_block_t* block, /*!< in/out: file buffer */
969
mrec_buf_t* buf, /*!< in/out: secondary buffer */
970
byte* b, /*!< in: pointer to end of block */
971
int fd, /*!< in: file descriptor */
972
ulint* foffs, /*!< in/out: file offset */
973
const mrec_t* mrec, /*!< in: record to write */
974
const ulint* offsets)/*!< in: offsets of mrec */
982
ut_ad(b >= block[0]);
986
ut_ad(mrec < block[0] || mrec > block[1]);
987
ut_ad(mrec < buf[0] || mrec > buf[1]);
989
/* Normalize extra_size. Value 0 signals "end of list". */
990
extra_size = rec_offs_extra_size(offsets) + 1;
992
size = extra_size + (extra_size >= 0x80)
993
+ rec_offs_data_size(offsets);
995
if (UNIV_UNLIKELY(b + size >= block[1])) {
996
/* The record spans two blocks.
997
Copy it to the temporary buffer first. */
998
avail_size = block[1] - b;
1000
row_merge_write_rec_low(buf[0],
1001
extra_size, size, fd, *foffs,
1004
/* Copy the head of the temporary buffer, write
1005
the completed block, and copy the tail of the
1006
record to the head of the new block. */
1007
memcpy(b, buf[0], avail_size);
1009
if (!row_merge_write(fd, (*foffs)++, block)) {
1013
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1015
/* Copy the rest. */
1017
memcpy(b, buf[0] + avail_size, size - avail_size);
1018
b += size - avail_size;
1020
row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
1028
/********************************************************************//**
1029
Write an end-of-list marker.
1030
@return pointer to end of block, or NULL on error */
1033
row_merge_write_eof(
1034
/*================*/
1035
row_merge_block_t* block, /*!< in/out: file buffer */
1036
byte* b, /*!< in: pointer to end of block */
1037
int fd, /*!< in: file descriptor */
1038
ulint* foffs) /*!< in/out: file offset */
1041
ut_ad(b >= block[0]);
1042
ut_ad(b < block[1]);
1045
if (row_merge_print_write) {
1046
fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
1047
(void*) b, (void*) block, fd, (ulong) *foffs);
1049
#endif /* UNIV_DEBUG */
1052
UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
1053
UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
1054
#ifdef UNIV_DEBUG_VALGRIND
1055
/* The rest of the block is uninitialized. Initialize it
1056
to avoid bogus warnings. */
1057
memset(b, 0xff, block[1] - b);
1058
#endif /* UNIV_DEBUG_VALGRIND */
1060
if (!row_merge_write(fd, (*foffs)++, block)) {
1064
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1068
/*************************************************************//**
1069
Compare two merge records.
1070
@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
1075
const mrec_t* mrec1, /*!< in: first merge
1076
record to be compared */
1077
const mrec_t* mrec2, /*!< in: second merge
1078
record to be compared */
1079
const ulint* offsets1, /*!< in: first record offsets */
1080
const ulint* offsets2, /*!< in: second record offsets */
1081
const dict_index_t* index, /*!< in: index */
1082
ibool* null_eq) /*!< out: set to TRUE if
1083
found matching null values */
1087
cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index,
1091
if (row_merge_print_cmp) {
1092
fputs("row_merge_cmp1 ", stderr);
1093
rec_print_comp(stderr, mrec1, offsets1);
1094
fputs("\nrow_merge_cmp2 ", stderr);
1095
rec_print_comp(stderr, mrec2, offsets2);
1096
fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
1098
#endif /* UNIV_DEBUG */
1103
/********************************************************************//**
1104
Reads clustered index of the table and create temporary files
1105
containing the index entries for the indexes to be built.
1106
@return DB_SUCCESS or error */
1107
static __attribute__((nonnull))
1109
row_merge_read_clustered_index(
1110
/*===========================*/
1111
trx_t* trx, /*!< in: transaction */
1112
TABLE* table, /*!< in/out: MySQL table object,
1113
for reporting erroneous records */
1114
const dict_table_t* old_table,/*!< in: table where rows are
1116
const dict_table_t* new_table,/*!< in: table where indexes are
1117
created; identical to old_table
1118
unless creating a PRIMARY KEY */
1119
dict_index_t** index, /*!< in: indexes to be created */
1120
merge_file_t* files, /*!< in: temporary files */
1121
ulint n_index,/*!< in: number of indexes to create */
1122
row_merge_block_t* block) /*!< in/out: file buffer */
1124
dict_index_t* clust_index; /* Clustered index */
1125
mem_heap_t* row_heap; /* Heap memory to create
1126
clustered index records */
1127
row_merge_buf_t** merge_buf; /* Temporary list for records*/
1128
btr_pcur_t pcur; /* Persistent cursor on the
1130
mtr_t mtr; /* Mini transaction */
1131
ulint err = DB_SUCCESS;/* Return code */
1133
ulint n_nonnull = 0; /* number of columns
1134
changed to NOT NULL */
1135
ulint* nonnull = NULL; /* NOT NULL columns */
1137
trx->op_info = "reading clustered index";
1145
/* Create and initialize memory for record buffers */
1147
merge_buf = mem_alloc(n_index * sizeof *merge_buf);
1149
for (i = 0; i < n_index; i++) {
1150
merge_buf[i] = row_merge_buf_create(index[i]);
1155
/* Find the clustered index and create a persistent cursor
1158
clust_index = dict_table_get_first_index(old_table);
1160
btr_pcur_open_at_index_side(
1161
TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
1163
if (UNIV_UNLIKELY(old_table != new_table)) {
1164
ulint n_cols = dict_table_get_n_cols(old_table);
1166
/* A primary key will be created. Identify the
1167
columns that were flagged NOT NULL in the new table,
1168
so that we can quickly check that the records in the
1169
(old) clustered index do not violate the added NOT
1170
NULL constraints. */
1172
ut_a(n_cols == dict_table_get_n_cols(new_table));
1174
nonnull = mem_alloc(n_cols * sizeof *nonnull);
1176
for (i = 0; i < n_cols; i++) {
1177
if (dict_table_get_nth_col(old_table, i)->prtype
1183
if (dict_table_get_nth_col(new_table, i)->prtype
1186
nonnull[n_nonnull++] = i;
1196
row_heap = mem_heap_create(sizeof(mrec_buf_t));
1198
/* Scan the clustered index. */
1202
dtuple_t* row = NULL;
1204
ibool has_next = TRUE;
1206
btr_pcur_move_to_next_on_page(&pcur);
1208
/* When switching pages, commit the mini-transaction
1209
in order to release the latch on the old page. */
1211
if (btr_pcur_is_after_last_on_page(&pcur)) {
1212
if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
1213
err = DB_INTERRUPTED;
1214
trx->error_key_num = 0;
1218
btr_pcur_store_position(&pcur, &mtr);
1221
btr_pcur_restore_position(BTR_SEARCH_LEAF,
1223
has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
1226
if (UNIV_LIKELY(has_next)) {
1227
rec = btr_pcur_get_rec(&pcur);
1228
offsets = rec_get_offsets(rec, clust_index, NULL,
1229
ULINT_UNDEFINED, &row_heap);
1231
/* Skip delete marked records. */
1232
if (rec_get_deleted_flag(
1233
rec, dict_table_is_comp(old_table))) {
1237
srv_n_rows_inserted++;
1239
/* Build a row based on the clustered index. */
1241
row = row_build(ROW_COPY_POINTERS, clust_index,
1243
new_table, &ext, row_heap);
1245
if (UNIV_LIKELY_NULL(nonnull)) {
1246
for (i = 0; i < n_nonnull; i++) {
1248
= &row->fields[nonnull[i]];
1250
= dfield_get_type(field);
1252
ut_a(!(field_type->prtype
1255
if (dfield_is_null(field)) {
1256
err = DB_PRIMARY_KEY_IS_NULL;
1257
trx->error_key_num = 0;
1261
field_type->prtype |= DATA_NOT_NULL;
1266
/* Build all entries for all the indexes to be created
1267
in a single scan of the clustered index. */
1269
for (i = 0; i < n_index; i++) {
1270
row_merge_buf_t* buf = merge_buf[i];
1271
merge_file_t* file = &files[i];
1272
const dict_index_t* index = buf->index;
1275
(row && row_merge_buf_add(buf, row, ext))) {
1280
/* The buffer must be sufficiently large
1281
to hold at least one record. */
1282
ut_ad(buf->n_tuples || !has_next);
1284
/* We have enough data tuples to form a block.
1285
Sort them and write to disk. */
1287
if (buf->n_tuples) {
1288
if (dict_index_is_unique(index)) {
1289
row_merge_dup_t dup;
1290
dup.index = buf->index;
1294
row_merge_buf_sort(buf, &dup);
1297
err = DB_DUPLICATE_KEY;
1298
trx->error_key_num = i;
1302
row_merge_buf_sort(buf, NULL);
1306
row_merge_buf_write(buf, file, block);
1308
if (!row_merge_write(file->fd, file->offset++,
1310
err = DB_OUT_OF_FILE_SPACE;
1311
trx->error_key_num = i;
1315
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1316
merge_buf[i] = row_merge_buf_empty(buf);
1318
if (UNIV_LIKELY(row != NULL)) {
1319
/* Try writing the record again, now
1320
that the buffer has been written out
1324
(!row_merge_buf_add(buf, row, ext))) {
1325
/* An empty buffer should have enough
1326
room for at least one record. */
1334
mem_heap_empty(row_heap);
1336
if (UNIV_UNLIKELY(!has_next)) {
1342
btr_pcur_close(&pcur);
1344
mem_heap_free(row_heap);
1346
if (UNIV_LIKELY_NULL(nonnull)) {
1350
for (i = 0; i < n_index; i++) {
1351
row_merge_buf_free(merge_buf[i]);
1354
mem_free(merge_buf);
1361
/** Write a record via buffer 2 and read the next record to buffer N.
@param N	number of the buffer (0 or 1)
@param AT_END	statement to execute at end of input */
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
	do {								\
		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
					 of->fd, &of->offset,		\
					 mrec##N, offsets##N);		\
		/* A write failure, or more output records than input	\
		records, indicates a corrupted temporary file. */	\
		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
			goto corrupt;					\
		}							\
		b##N = row_merge_read_rec(&block[N], &buf[N],		\
					  b##N, index,			\
					  file->fd, foffs##N,		\
					  &mrec##N, offsets##N);	\
		if (UNIV_UNLIKELY(!b##N)) {				\
			if (mrec##N) {					\
				goto corrupt;				\
			}						\
			AT_END;						\
		}							\
	} while (0)
/*************************************************************//**
1385
Merge two blocks of records on disk and write a bigger block.
1386
@return DB_SUCCESS or error code */
1391
const dict_index_t* index, /*!< in: index being created */
1392
const merge_file_t* file, /*!< in: file containing
1394
row_merge_block_t* block, /*!< in/out: 3 buffers */
1395
ulint* foffs0, /*!< in/out: offset of first
1396
source list in the file */
1397
ulint* foffs1, /*!< in/out: offset of second
1398
source list in the file */
1399
merge_file_t* of, /*!< in/out: output file */
1400
TABLE* table) /*!< in/out: MySQL table, for
1401
reporting erroneous key value
1404
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
1406
mrec_buf_t* buf; /*!< buffer for handling
1407
split mrec in block[] */
1408
const byte* b0; /*!< pointer to block[0] */
1409
const byte* b1; /*!< pointer to block[1] */
1410
byte* b2; /*!< pointer to block[2] */
1411
const mrec_t* mrec0; /*!< merge rec, points to block[0] or buf[0] */
1412
const mrec_t* mrec1; /*!< merge rec, points to block[1] or buf[1] */
1413
ulint* offsets0;/* offsets of mrec0 */
1414
ulint* offsets1;/* offsets of mrec1 */
1417
if (row_merge_print_block) {
1419
"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
1420
" = fd=%d ofs=%lu\n",
1421
file->fd, (ulong) *foffs0,
1422
file->fd, (ulong) *foffs1,
1423
of->fd, (ulong) of->offset);
1425
#endif /* UNIV_DEBUG */
1427
heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
1429
buf = mem_heap_alloc(heap, sizeof(mrec_buf_t) * 3);
1431
/* Write a record and read the next record. Split the output
1432
file in two halves, which can be merged on the following pass. */
1434
if (!row_merge_read(file->fd, *foffs0, &block[0])
1435
|| !row_merge_read(file->fd, *foffs1, &block[1])) {
1437
mem_heap_free(heap);
1438
return(DB_CORRUPTION);
1445
b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
1446
foffs0, &mrec0, offsets0);
1447
b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
1448
foffs1, &mrec1, offsets1);
1449
if (UNIV_UNLIKELY(!b0 && mrec0)
1450
|| UNIV_UNLIKELY(!b1 && mrec1)) {
1455
while (mrec0 && mrec1) {
1456
ibool null_eq = FALSE;
1457
switch (row_merge_cmp(mrec0, mrec1,
1458
offsets0, offsets1, index,
1462
(dict_index_is_unique(index) && !null_eq)) {
1463
innobase_rec_to_mysql(table, mrec0,
1465
mem_heap_free(heap);
1466
return(DB_DUPLICATE_KEY);
1470
ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
1473
ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
1483
/* append all mrec0 to output */
1485
ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
1490
/* append all mrec1 to output */
1492
ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
1497
mem_heap_free(heap);
1498
b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
1499
return(b2 ? DB_SUCCESS : DB_CORRUPTION);
1502
/*************************************************************//**
1503
Copy a block of index entries.
1504
@return TRUE on success, FALSE on failure */
1505
static __attribute__((nonnull))
1507
row_merge_blocks_copy(
1508
/*==================*/
1509
const dict_index_t* index, /*!< in: index being created */
1510
const merge_file_t* file, /*!< in: input file */
1511
row_merge_block_t* block, /*!< in/out: 3 buffers */
1512
ulint* foffs0, /*!< in/out: input file offset */
1513
merge_file_t* of) /*!< in/out: output file */
1515
mem_heap_t* heap; /*!< memory heap for offsets0, offsets1 */
1517
mrec_buf_t* buf; /*!< buffer for handling
1518
split mrec in block[] */
1519
const byte* b0; /*!< pointer to block[0] */
1520
byte* b2; /*!< pointer to block[2] */
1521
const mrec_t* mrec0; /*!< merge rec, points to block[0] */
1522
ulint* offsets0;/* offsets of mrec0 */
1523
ulint* offsets1;/* dummy offsets */
1526
if (row_merge_print_block) {
1528
"row_merge_blocks_copy fd=%d ofs=%lu"
1529
" = fd=%d ofs=%lu\n",
1530
file->fd, (ulong) foffs0,
1531
of->fd, (ulong) of->offset);
1533
#endif /* UNIV_DEBUG */
1535
heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
1537
buf = mem_heap_alloc(heap, sizeof(mrec_buf_t) * 3);
1539
/* Write a record and read the next record. Split the output
1540
file in two halves, which can be merged on the following pass. */
1542
if (!row_merge_read(file->fd, *foffs0, &block[0])) {
1544
mem_heap_free(heap);
1551
b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
1552
foffs0, &mrec0, offsets0);
1553
if (UNIV_UNLIKELY(!b0 && mrec0)) {
1559
/* append all mrec0 to output */
1561
ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
1566
/* The file offset points to the beginning of the last page
1567
that has been read. Update it to point to the next block. */
1570
mem_heap_free(heap);
1571
return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset)
1575
/*************************************************************//**
1577
@return DB_SUCCESS or error code */
1578
static __attribute__((nonnull))
1582
trx_t* trx, /*!< in: transaction */
1583
const dict_index_t* index, /*!< in: index being created */
1584
merge_file_t* file, /*!< in/out: file containing
1586
ulint* half, /*!< in/out: half the file */
1587
row_merge_block_t* block, /*!< in/out: 3 buffers */
1588
int* tmpfd, /*!< in/out: temporary file handle */
1589
TABLE* table) /*!< in/out: MySQL table, for
1590
reporting erroneous key value
1593
ulint foffs0; /*!< first input offset */
1594
ulint foffs1; /*!< second input offset */
1595
ulint error; /*!< error code */
1596
merge_file_t of; /*!< output file */
1597
const ulint ihalf = *half;
1598
/*!< half the input file */
1599
ulint ohalf; /*!< half the output file */
1601
UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
1602
ut_ad(ihalf < file->offset);
1608
/* Merge blocks to the output file. */
1613
for (; foffs0 < ihalf && foffs1 < file->offset; foffs0++, foffs1++) {
1614
ulint ahalf; /*!< arithmetic half the input file */
1616
if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
1617
return(DB_INTERRUPTED);
1620
error = row_merge_blocks(index, file, block,
1621
&foffs0, &foffs1, &of, table);
1623
if (error != DB_SUCCESS) {
1627
/* Record the offset of the output file when
1628
approximately half the output has been generated. In
1629
this way, the next invocation of row_merge() will
1630
spend most of the time in this loop. The initial
1631
estimate is ohalf==0. */
1632
ahalf = file->offset / 2;
1633
ut_ad(ohalf <= of.offset);
1635
/* Improve the estimate until reaching half the input
1636
file size, or we can not get any closer to it. All
1637
comparands should be non-negative when !(ohalf < ahalf)
1638
because ohalf <= of.offset. */
1639
if (ohalf < ahalf || of.offset - ahalf < ohalf - ahalf) {
1644
/* Copy the last blocks, if there are any. */
1646
while (foffs0 < ihalf) {
1647
if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
1648
return(DB_INTERRUPTED);
1651
if (!row_merge_blocks_copy(index, file, block, &foffs0, &of)) {
1652
return(DB_CORRUPTION);
1656
ut_ad(foffs0 == ihalf);
1658
while (foffs1 < file->offset) {
1659
if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
1660
return(DB_INTERRUPTED);
1663
if (!row_merge_blocks_copy(index, file, block, &foffs1, &of)) {
1664
return(DB_CORRUPTION);
1668
ut_ad(foffs1 == file->offset);
1670
if (UNIV_UNLIKELY(of.n_rec != file->n_rec)) {
1671
return(DB_CORRUPTION);
1674
/* Swap file descriptors for the next pass. */
1679
UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
1684
/*************************************************************//**
1686
@return DB_SUCCESS or error code */
1691
trx_t* trx, /*!< in: transaction */
1692
const dict_index_t* index, /*!< in: index being created */
1693
merge_file_t* file, /*!< in/out: file containing
1695
row_merge_block_t* block, /*!< in/out: 3 buffers */
1696
int* tmpfd, /*!< in/out: temporary file handle */
1697
TABLE* table) /*!< in/out: MySQL table, for
1698
reporting erroneous key value
1701
ulint half = file->offset / 2;
1703
/* The file should always contain at least one byte (the end
1704
of file marker). Thus, it must be at least one block. */
1705
ut_ad(file->offset > 0);
1710
error = row_merge(trx, index, file, &half,
1711
block, tmpfd, table);
1713
if (error != DB_SUCCESS) {
1717
/* half > 0 should hold except when the file consists
1718
of one block. No need to merge further then. */
1719
ut_ad(half > 0 || file->offset == 1);
1720
} while (half < file->offset && half > 0);
1725
/*************************************************************//**
1726
Copy externally stored columns to the data tuple. */
1729
row_merge_copy_blobs(
1730
/*=================*/
1731
const mrec_t* mrec, /*!< in: merge record */
1732
const ulint* offsets,/*!< in: offsets of mrec */
1733
ulint zip_size,/*!< in: compressed page size in bytes, or 0 */
1734
dtuple_t* tuple, /*!< in/out: data tuple */
1735
mem_heap_t* heap) /*!< in/out: memory heap */
1738
ulint n_fields = dtuple_get_n_fields(tuple);
1740
for (i = 0; i < n_fields; i++) {
1743
dfield_t* field = dtuple_get_nth_field(tuple, i);
1745
if (!dfield_is_ext(field)) {
1749
ut_ad(!dfield_is_null(field));
1751
/* The table is locked during index creation.
1752
Therefore, externally stored columns cannot possibly
1753
be freed between the time the BLOB pointers are read
1754
(row_merge_read_clustered_index()) and dereferenced
1756
data = btr_rec_copy_externally_stored_field(
1757
mrec, offsets, zip_size, i, &len, heap);
1759
dfield_set_data(field, data, len);
1763
/********************************************************************//**
1764
Read sorted file containing index data tuples and insert these data
1766
@return DB_SUCCESS or error number */
1769
row_merge_insert_index_tuples(
1770
/*==========================*/
1771
trx_t* trx, /*!< in: transaction */
1772
dict_index_t* index, /*!< in: index */
1773
dict_table_t* table, /*!< in: new table */
1774
ulint zip_size,/*!< in: compressed page size of
1775
the old table, or 0 if uncompressed */
1776
int fd, /*!< in: file descriptor */
1777
row_merge_block_t* block) /*!< in/out: file buffer */
1782
mem_heap_t* tuple_heap;
1783
mem_heap_t* graph_heap;
1784
ulint error = DB_SUCCESS;
1792
/* We use the insert query graph as the dummy graph
1793
needed in the row module call */
1795
trx->op_info = "inserting index entries";
1797
graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
1798
node = ins_node_create(INS_DIRECT, table, graph_heap);
1800
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
1802
que_thr_move_to_run_state_for_mysql(thr, trx);
1804
tuple_heap = mem_heap_create(1000);
1807
ulint i = 1 + REC_OFFS_HEADER_SIZE
1808
+ dict_index_get_n_fields(index);
1809
offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
1811
offsets[1] = dict_index_get_n_fields(index);
1816
if (!row_merge_read(fd, foffs, block)) {
1817
error = DB_CORRUPTION;
1819
mrec_buf_t* buf = mem_heap_alloc(graph_heap, sizeof *buf);
1826
b = row_merge_read_rec(block, buf, b, index,
1827
fd, &foffs, &mrec, offsets);
1828
if (UNIV_UNLIKELY(!b)) {
1829
/* End of list, or I/O error */
1831
error = DB_CORRUPTION;
1836
dtuple = row_rec_to_index_entry_low(
1837
mrec, index, offsets, &n_ext, tuple_heap);
1839
if (UNIV_UNLIKELY(n_ext)) {
1840
row_merge_copy_blobs(mrec, offsets, zip_size,
1841
dtuple, tuple_heap);
1845
node->table = table;
1846
node->trx_id = trx->id;
1848
ut_ad(dtuple_validate(dtuple));
1851
thr->run_node = thr;
1852
thr->prev_node = thr->common.parent;
1854
error = row_ins_index_entry(index, dtuple,
1857
if (UNIV_LIKELY(error == DB_SUCCESS)) {
1862
thr->lock_state = QUE_THR_LOCK_ROW;
1863
trx->error_state = error;
1864
que_thr_stop_for_mysql(thr);
1865
thr->lock_state = QUE_THR_LOCK_NOLOCK;
1866
} while (row_mysql_handle_errors(&error, trx,
1871
mem_heap_empty(tuple_heap);
1875
que_thr_stop_for_mysql_no_error(thr, trx);
1877
que_graph_free(thr->graph);
1881
mem_heap_free(tuple_heap);
1886
/*********************************************************************//**
1887
Sets an exclusive lock on a table, for the duration of creating indexes.
1888
@return error code or DB_SUCCESS */
1891
row_merge_lock_table(
1892
/*=================*/
1893
trx_t* trx, /*!< in/out: transaction */
1894
dict_table_t* table, /*!< in: table to lock */
1895
enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */
1903
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
1904
ut_ad(mode == LOCK_X || mode == LOCK_S);
1906
heap = mem_heap_create(512);
1908
trx->op_info = "setting table lock for creating or dropping index";
1910
node = sel_node_create(heap);
1911
thr = pars_complete_graph_for_exec(node, trx, heap);
1912
thr->graph->state = QUE_FORK_ACTIVE;
1914
/* We use the select query graph as the dummy graph needed
1915
in the lock module call */
1917
thr = que_fork_get_first_thr(que_node_get_parent(thr));
1918
que_thr_move_to_run_state_for_mysql(thr, trx);
1921
thr->run_node = thr;
1922
thr->prev_node = thr->common.parent;
1924
err = lock_table(0, table, mode, thr);
1926
trx->error_state = err;
1928
if (UNIV_LIKELY(err == DB_SUCCESS)) {
1929
que_thr_stop_for_mysql_no_error(thr, trx);
1931
que_thr_stop_for_mysql(thr);
1933
if (err != DB_QUE_THR_SUSPENDED) {
1934
ibool was_lock_wait;
1936
was_lock_wait = row_mysql_handle_errors(
1937
&err, trx, thr, NULL);
1939
if (was_lock_wait) {
1946
parent = que_node_get_parent(thr);
1947
run_thr = que_fork_start_command(parent);
1949
ut_a(run_thr == thr);
1951
/* There was a lock wait but the thread was not
1952
in a ready to run or running state. */
1953
trx->error_state = DB_LOCK_WAIT;
1959
que_graph_free(thr->graph);
1965
/*********************************************************************//**
1966
Drop an index from the InnoDB system tables. The data dictionary must
1967
have been locked exclusively by the caller, because the transaction
1968
will not be committed. */
1971
row_merge_drop_index(
1972
/*=================*/
1973
dict_index_t* index, /*!< in: index to be removed */
1974
dict_table_t* table, /*!< in: table */
1975
trx_t* trx) /*!< in: transaction handle */
1978
pars_info_t* info = pars_info_create();
1980
/* We use the private SQL parser of Innobase to generate the
1981
query graphs needed in deleting the dictionary data from system
1982
tables in Innobase. Deleting a row from SYS_INDEXES table also
1983
frees the file segments of the B-tree associated with the index. */
1985
static const char str1[] =
1986
"PROCEDURE DROP_INDEX_PROC () IS\n"
1988
/* Rename the index, so that it will be dropped by
1989
row_merge_drop_temp_indexes() at crash recovery
1990
if the server crashes before this trx is committed. */
1991
"UPDATE SYS_INDEXES SET NAME=CONCAT('"
1992
TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n"
1994
/* Drop the field definitions of the index. */
1995
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
1996
/* Drop the index definition and the B-tree. */
1997
"DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
2000
ut_ad(index && table && trx);
2002
pars_info_add_dulint_literal(info, "indexid", index->id);
2004
trx_start_if_not_started(trx);
2005
trx->op_info = "dropping index";
2007
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2009
err = que_eval_sql(info, str1, FALSE, trx);
2011
ut_a(err == DB_SUCCESS);
2013
/* Replace this index with another equivalent index for all
2014
foreign key constraints on this table where this index is used */
2016
dict_table_replace_index_in_foreign_list(table, index);
2017
dict_index_remove_from_cache(table, index);
2022
/*********************************************************************//**
2023
Drop those indexes which were created before an error occurred when
2024
building an index. The data dictionary must have been locked
2025
exclusively by the caller, because the transaction will not be
2029
row_merge_drop_indexes(
2030
/*===================*/
2031
trx_t* trx, /*!< in: transaction */
2032
dict_table_t* table, /*!< in: table containing the indexes */
2033
dict_index_t** index, /*!< in: indexes to drop */
2034
ulint num_created) /*!< in: number of elements in index[] */
2038
for (key_num = 0; key_num < num_created; key_num++) {
2039
row_merge_drop_index(index[key_num], table, trx);
2043
/*********************************************************************//**
2044
Drop all partially created indexes during crash recovery. */
2047
row_merge_drop_temp_indexes(void)
2048
/*=============================*/
2054
/* Load the table definitions that contain partially defined
2055
indexes, so that the data dictionary information can be checked
2056
when accessing the tablename.ibd files. */
2058
trx = trx_allocate_for_background();
2059
trx->op_info = "dropping partially created indexes";
2060
row_mysql_lock_data_dictionary(trx);
2064
btr_pcur_open_at_index_side(
2066
dict_table_get_first_index(dict_sys->sys_indexes),
2067
BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
2074
dict_table_t* table;
2076
btr_pcur_move_to_next_user_rec(&pcur, &mtr);
2078
if (!btr_pcur_is_on_user_rec(&pcur)) {
2082
rec = btr_pcur_get_rec(&pcur);
2083
field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
2085
if (len == UNIV_SQL_NULL || len == 0
2086
|| (char) *field != TEMP_INDEX_PREFIX) {
2090
/* This is a temporary index. */
2092
field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
2094
/* Corrupted TABLE_ID */
2098
table_id = mach_read_from_8(field);
2100
btr_pcur_store_position(&pcur, &mtr);
2101
btr_pcur_commit_specify_mtr(&pcur, &mtr);
2103
table = dict_table_get_on_id_low(table_id);
2106
dict_index_t* index;
2107
dict_index_t* next_index;
2109
for (index = dict_table_get_first_index(table);
2110
index; index = next_index) {
2112
next_index = dict_table_get_next_index(index);
2114
if (*index->name == TEMP_INDEX_PREFIX) {
2115
row_merge_drop_index(index, table, trx);
2116
trx_commit_for_mysql(trx);
2122
btr_pcur_restore_position(BTR_SEARCH_LEAF,
2126
btr_pcur_close(&pcur);
2128
row_mysql_unlock_data_dictionary(trx);
2129
trx_free_for_background(trx);
2132
/*********************************************************************//**
2133
Create a merge file. */
2136
row_merge_file_create(
2137
/*==================*/
2138
merge_file_t* merge_file) /*!< out: merge file structure */
2140
merge_file->fd = innobase_mysql_tmpfile();
2141
merge_file->offset = 0;
2142
merge_file->n_rec = 0;
2145
/*********************************************************************//**
2146
Destroy a merge file. */
2149
row_merge_file_destroy(
2150
/*===================*/
2151
merge_file_t* merge_file) /*!< out: merge file structure */
2153
if (merge_file->fd != -1) {
2154
close(merge_file->fd);
2155
merge_file->fd = -1;
2159
/*********************************************************************//**
2160
Determine the precise type of a column that is added to a tem
2161
if a column must be constrained NOT NULL.
2162
@return col->prtype, possibly ORed with DATA_NOT_NULL */
2165
row_merge_col_prtype(
2166
/*=================*/
2167
const dict_col_t* col, /*!< in: column */
2168
const char* col_name, /*!< in: name of the column */
2169
const merge_index_def_t*index_def) /*!< in: the index definition
2170
of the primary key */
2172
ulint prtype = col->prtype;
2175
ut_ad(index_def->ind_type & DICT_CLUSTERED);
2177
if (prtype & DATA_NOT_NULL) {
2182
/* All columns that are included
2183
in the PRIMARY KEY must be NOT NULL. */
2185
for (i = 0; i < index_def->n_fields; i++) {
2186
if (!strcmp(col_name, index_def->fields[i].field_name)) {
2187
return(prtype | DATA_NOT_NULL);
2194
/*********************************************************************//**
2195
Create a temporary table for creating a primary key, using the definition
2196
of an existing table.
2197
@return table, or NULL on error */
2200
row_merge_create_temporary_table(
2201
/*=============================*/
2202
const char* table_name, /*!< in: new table name */
2203
const merge_index_def_t*index_def, /*!< in: the index definition
2204
of the primary key */
2205
const dict_table_t* table, /*!< in: old table definition */
2206
trx_t* trx) /*!< in/out: transaction
2207
(sets error_state) */
2210
dict_table_t* new_table = NULL;
2211
ulint n_cols = dict_table_get_n_user_cols(table);
2213
mem_heap_t* heap = mem_heap_create(1000);
2218
ut_ad(mutex_own(&dict_sys->mutex));
2220
new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
2222
for (i = 0; i < n_cols; i++) {
2223
const dict_col_t* col;
2224
const char* col_name;
2226
col = dict_table_get_nth_col(table, i);
2227
col_name = dict_table_get_col_name(table, i);
2229
dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
2230
row_merge_col_prtype(col, col_name,
2235
error = row_create_table_for_mysql(new_table, trx);
2236
mem_heap_free(heap);
2238
if (error != DB_SUCCESS) {
2239
trx->error_state = error;
2246
/*********************************************************************//**
2247
Rename the temporary indexes in the dictionary to permanent ones. The
2248
data dictionary must have been locked exclusively by the caller,
2249
because the transaction will not be committed.
2250
@return DB_SUCCESS if all OK */
2253
row_merge_rename_indexes(
2254
/*=====================*/
2255
trx_t* trx, /*!< in/out: transaction */
2256
dict_table_t* table) /*!< in/out: table with new indexes */
2258
ulint err = DB_SUCCESS;
2259
pars_info_t* info = pars_info_create();
2261
/* We use the private SQL parser of Innobase to generate the
2262
query graphs needed in renaming indexes. */
2264
static const char rename_indexes[] =
2265
"PROCEDURE RENAME_INDEXES_PROC () IS\n"
2267
"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
2268
"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
2269
TEMP_INDEX_PREFIX_STR "';\n"
2274
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2276
trx->op_info = "renaming indexes";
2278
pars_info_add_dulint_literal(info, "tableid", table->id);
2280
err = que_eval_sql(info, rename_indexes, FALSE, trx);
2282
if (err == DB_SUCCESS) {
2283
dict_index_t* index = dict_table_get_first_index(table);
2285
if (*index->name == TEMP_INDEX_PREFIX) {
2288
index = dict_table_get_next_index(index);
2297
/*********************************************************************//**
2298
Rename the tables in the data dictionary. The data dictionary must
2299
have been locked exclusively by the caller, because the transaction
2300
will not be committed.
2301
@return error code or DB_SUCCESS */
2304
row_merge_rename_tables(
2305
/*====================*/
2306
dict_table_t* old_table, /*!< in/out: old table, renamed to
2308
dict_table_t* new_table, /*!< in/out: new table, renamed to
2310
const char* tmp_name, /*!< in: new name for old_table */
2311
trx_t* trx) /*!< in: transaction handle */
2313
ulint err = DB_ERROR;
2315
const char* old_name= old_table->name;
2317
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
2318
ut_ad(old_table != new_table);
2319
ut_ad(mutex_own(&dict_sys->mutex));
2321
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2323
trx->op_info = "renaming tables";
2325
/* We use the private SQL parser of Innobase to generate the query
2326
graphs needed in updating the dictionary data in system tables. */
2328
info = pars_info_create();
2330
pars_info_add_str_literal(info, "new_name", new_table->name);
2331
pars_info_add_str_literal(info, "old_name", old_name);
2332
pars_info_add_str_literal(info, "tmp_name", tmp_name);
2334
err = que_eval_sql(info,
2335
"PROCEDURE RENAME_TABLES () IS\n"
2337
"UPDATE SYS_TABLES SET NAME = :tmp_name\n"
2338
" WHERE NAME = :old_name;\n"
2339
"UPDATE SYS_TABLES SET NAME = :old_name\n"
2340
" WHERE NAME = :new_name;\n"
2341
"END;\n", FALSE, trx);
2343
if (err != DB_SUCCESS) {
2348
/* The following calls will also rename the .ibd data files if
2349
the tables are stored in a single-table tablespace */
2351
if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
2352
|| !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
2358
err = dict_load_foreigns(old_name, TRUE);
2360
if (err != DB_SUCCESS) {
2362
trx->error_state = DB_SUCCESS;
2363
trx_general_rollback_for_mysql(trx, NULL);
2364
trx->error_state = DB_SUCCESS;
2372
/*********************************************************************//**
2373
Create and execute a query graph for creating an index.
2374
@return DB_SUCCESS or error code */
2377
row_merge_create_index_graph(
2378
/*=========================*/
2379
trx_t* trx, /*!< in: trx */
2380
dict_table_t* table, /*!< in: table */
2381
dict_index_t* index) /*!< in: index */
2383
ind_node_t* node; /*!< Index creation node */
2384
mem_heap_t* heap; /*!< Memory heap */
2385
que_thr_t* thr; /*!< Query thread */
2392
heap = mem_heap_create(512);
2394
index->table = table;
2395
node = ind_create_graph_create(index, heap);
2396
thr = pars_complete_graph_for_exec(node, trx, heap);
2398
ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
2400
que_run_threads(thr);
2402
err = trx->error_state;
2404
que_graph_free((que_t*) que_node_get_parent(thr));
2409
/*********************************************************************//**
2410
Create the index and load in to the dictionary.
2411
@return index, or NULL on error */
2414
row_merge_create_index(
2415
/*===================*/
2416
trx_t* trx, /*!< in/out: trx (sets error_state) */
2417
dict_table_t* table, /*!< in: the index is on this table */
2418
const merge_index_def_t*index_def)
2419
/*!< in: the index definition */
2421
dict_index_t* index;
2423
ulint n_fields = index_def->n_fields;
2426
/* Create the index prototype, using the passed in def, this is not
2427
a persistent operation. We pass 0 as the space id, and determine at
2428
a lower level the space id where to store the table. */
2430
index = dict_mem_index_create(table->name, index_def->name,
2431
0, index_def->ind_type, n_fields);
2435
for (i = 0; i < n_fields; i++) {
2436
merge_index_field_t* ifield = &index_def->fields[i];
2438
dict_mem_index_add_field(index, ifield->field_name,
2439
ifield->prefix_len);
2442
/* Add the index to SYS_INDEXES, using the index prototype. */
2443
err = row_merge_create_index_graph(trx, table, index);
2445
if (err == DB_SUCCESS) {
2447
index = row_merge_dict_table_get_index(
2452
/* Note the id of the transaction that created this
2453
index, we use it to restrict readers from accessing
2454
this index, to ensure read consistency. */
2455
index->trx_id = (ib_uint64_t)
2456
ut_conv_dulint_to_longlong(trx->id);
2464
/*********************************************************************//**
2465
Check if a transaction can use an index. */
2468
row_merge_is_index_usable(
2469
/*======================*/
2470
const trx_t* trx, /*!< in: transaction */
2471
const dict_index_t* index) /*!< in: index to check */
2473
return(!trx->read_view || read_view_sees_trx_id(
2475
ut_dulint_create((ulint) (index->trx_id >> 32),
2476
(ulint) index->trx_id & 0xFFFFFFFF)));
2479
/*********************************************************************//**
2481
@return DB_SUCCESS or error code */
2484
row_merge_drop_table(
2485
/*=================*/
2486
trx_t* trx, /*!< in: transaction */
2487
dict_table_t* table) /*!< in: table to drop */
2489
/* There must be no open transactions on the table. */
2490
ut_a(table->n_mysql_handles_opened == 0);
2492
return(row_drop_table_for_mysql(table->name, trx, FALSE));
2495
/*********************************************************************//**
2496
Build indexes on a table by reading a clustered index,
2497
creating a temporary file containing index entries, merge sorting
2498
these index entries and inserting sorted index entries to indexes.
2499
@return DB_SUCCESS or error code */
2502
row_merge_build_indexes(
2503
/*====================*/
2504
trx_t* trx, /*!< in: transaction */
2505
dict_table_t* old_table, /*!< in: table where rows are
2507
dict_table_t* new_table, /*!< in: table where indexes are
2508
created; identical to old_table
2509
unless creating a PRIMARY KEY */
2510
dict_index_t** indexes, /*!< in: indexes to be created */
2511
ulint n_indexes, /*!< in: size of indexes[] */
2512
TABLE* table) /*!< in/out: MySQL table, for
2513
reporting erroneous key value
2516
merge_file_t* merge_files;
2517
row_merge_block_t* block;
2529
trx_start_if_not_started(trx);
2531
/* Allocate memory for merge file data structure and initialize
2534
merge_files = mem_alloc(n_indexes * sizeof *merge_files);
2535
block_size = 3 * sizeof *block;
2536
block = os_mem_alloc_large(&block_size);
2538
for (i = 0; i < n_indexes; i++) {
2540
row_merge_file_create(&merge_files[i]);
2543
tmpfd = innobase_mysql_tmpfile();
2545
/* Reset the MySQL row buffer that is used when reporting
2547
innobase_rec_reset(table);
2549
/* Read clustered index of the table and create files for
2550
secondary index entries for merge sort */
2552
error = row_merge_read_clustered_index(
2553
trx, table, old_table, new_table, indexes,
2554
merge_files, n_indexes, block);
2556
if (error != DB_SUCCESS) {
2561
/* Now we have files containing index entries ready for
2562
sorting and inserting. */
2564
for (i = 0; i < n_indexes; i++) {
2565
error = row_merge_sort(trx, indexes[i], &merge_files[i],
2566
block, &tmpfd, table);
2568
if (error == DB_SUCCESS) {
2569
error = row_merge_insert_index_tuples(
2570
trx, indexes[i], new_table,
2571
dict_table_zip_size(old_table),
2572
merge_files[i].fd, block);
2575
/* Close the temporary file to free up space. */
2576
row_merge_file_destroy(&merge_files[i]);
2578
if (error != DB_SUCCESS) {
2579
trx->error_key_num = i;
2587
for (i = 0; i < n_indexes; i++) {
2588
row_merge_file_destroy(&merge_files[i]);
2591
mem_free(merge_files);
2592
os_mem_free_large(block, block_size);