1
/******************************************************
2
New index creation routines using a merge sort
4
(c) 2005,2007 Innobase Oy
6
Created 12/4/2005 Jan Lindstrom
7
Completed by Sunny Bains and Marko Makela
8
*******************************************************/
10
#include "row0merge.h"
16
#include "dict0dict.h"
18
#include "dict0boot.h"
19
#include "dict0crea.h"
20
#include "dict0load.h"
22
#include "mach0data.h"
27
#include "trx0purge.h"
31
#include "read0read.h"
33
#include "lock0lock.h"
34
#include "data0data.h"
35
#include "data0type.h"
37
#include "pars0pars.h"
41
#include "handler0alter.h"
44
/* Set these in order to enable debug printout. */
45
static ibool row_merge_print_cmp;
46
static ibool row_merge_print_read;
47
static ibool row_merge_print_write;
48
#endif /* UNIV_DEBUG */
50
/* Block size for I/O operations in merge sort. The minimum is
51
UNIV_PAGE_SIZE, or page_get_free_space_of_empty() rounded to a power of 2.
53
When not creating a PRIMARY KEY that contains column prefixes, this
54
can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
55
ut_ad(data_size < sizeof(row_merge_block_t)). */
57
typedef byte row_merge_block_t[1048576];
59
/* Secondary buffer for I/O operations of merge records. This buffer
60
is used for writing or reading a record that spans two row_merge_block_t.
61
Thus, it must be able to hold one merge record, whose maximum size is
62
the same as the minimum size of row_merge_block_t. */
64
typedef byte mrec_buf_t[UNIV_PAGE_SIZE];
66
/* Merge record in row_merge_block_t. The format is the same as a
67
record in ROW_FORMAT=COMPACT with the exception that the
68
REC_N_NEW_EXTRA_BYTES are omitted. */
71
/* Buffer for sorting in main memory. */
72
struct row_merge_buf_struct {
73
mem_heap_t* heap; /* memory heap where allocated */
74
dict_index_t* index; /* the index the tuples belong to */
75
ulint total_size; /* total amount of data bytes */
76
ulint n_tuples; /* number of data tuples */
77
ulint max_tuples; /* maximum number of data tuples */
78
const dfield_t**tuples; /* array of pointers to
79
arrays of fields that form
81
const dfield_t**tmp_tuples; /* temporary copy of tuples,
85
typedef struct row_merge_buf_struct row_merge_buf_t;
87
/* Information about temporary files used in merge sort are stored
90
struct merge_file_struct {
91
int fd; /* File descriptor */
92
ulint offset; /* File offset */
95
typedef struct merge_file_struct merge_file_t;
98
/**********************************************************
99
Display a merge tuple. */
102
row_merge_tuple_print(
103
/*==================*/
104
FILE* f, /* in: output stream */
105
const dfield_t* entry, /* in: tuple to print */
106
ulint n_fields)/* in: number of fields in the tuple */
110
for (j = 0; j < n_fields; j++) {
111
const dfield_t* field = &entry[j];
113
if (dfield_is_null(field)) {
114
fputs("\n NULL;", f);
116
ulint field_len = dfield_get_len(field);
117
ulint len = ut_min(field_len, 20);
118
if (dfield_is_ext(field)) {
123
ut_print_buf(f, dfield_get_data(field), len);
124
if (len != field_len) {
125
fprintf(f, " (total %lu bytes)", field_len);
131
#endif /* UNIV_DEBUG */
133
/**********************************************************
134
Allocate a sort buffer. */
137
row_merge_buf_create_low(
138
/*=====================*/
139
/* out,own: sort buffer */
140
mem_heap_t* heap, /* in: heap where allocated */
141
dict_index_t* index, /* in: secondary index */
142
ulint max_tuples, /* in: maximum number of data tuples */
143
ulint buf_size) /* in: size of the buffer, in bytes */
145
row_merge_buf_t* buf;
147
ut_ad(max_tuples > 0);
148
ut_ad(max_tuples <= sizeof(row_merge_block_t));
149
ut_ad(max_tuples < buf_size);
151
buf = mem_heap_zalloc(heap, buf_size);
154
buf->max_tuples = max_tuples;
155
buf->tuples = mem_heap_alloc(heap,
156
2 * max_tuples * sizeof *buf->tuples);
157
buf->tmp_tuples = buf->tuples + max_tuples;
162
/**********************************************************
163
Allocate a sort buffer. */
166
row_merge_buf_create(
167
/*=================*/
168
/* out,own: sort buffer */
169
dict_index_t* index) /* in: secondary index */
171
row_merge_buf_t* buf;
176
max_tuples = sizeof(row_merge_block_t)
177
/ ut_max(1, dict_index_get_min_size(index));
179
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
181
heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
183
buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
188
/**********************************************************
189
Empty a sort buffer. */
194
/* out: sort buffer */
195
row_merge_buf_t* buf) /* in,own: sort buffer */
198
ulint max_tuples = buf->max_tuples;
199
mem_heap_t* heap = buf->heap;
200
dict_index_t* index = buf->index;
202
buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
204
mem_heap_empty(heap);
206
return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
209
/**********************************************************
210
Deallocate a sort buffer. */
215
row_merge_buf_t* buf) /* in,own: sort buffer, to be freed */
217
mem_heap_free(buf->heap);
220
/**********************************************************
221
Insert a data tuple into a sort buffer. */
226
/* out: TRUE if added,
227
FALSE if out of space */
228
row_merge_buf_t* buf, /* in/out: sort buffer */
229
const dtuple_t* row, /* in: row in clustered index */
230
const row_ext_t* ext) /* in: cache of externally stored
231
column prefixes, or NULL */
237
const dict_index_t* index;
241
if (buf->n_tuples >= buf->max_tuples) {
245
UNIV_PREFETCH_R(row->fields);
249
n_fields = dict_index_get_n_fields(index);
251
entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
252
buf->tuples[buf->n_tuples] = entry;
256
extra_size = UT_BITS_IN_BYTES(index->n_nullable);
258
for (i = 0; i < n_fields; i++, field++) {
259
const dict_field_t* ifield;
260
const dict_col_t* col;
262
const dfield_t* row_field;
265
ifield = dict_index_get_nth_field(index, i);
267
col_no = dict_col_get_no(col);
268
row_field = dtuple_get_nth_field(row, col_no);
269
dfield_copy(field, row_field);
270
len = dfield_get_len(field);
272
if (dfield_is_null(field)) {
273
ut_ad(!(col->prtype & DATA_NOT_NULL));
275
} else if (UNIV_LIKELY(!ext)) {
276
} else if (dict_index_is_clust(index)) {
277
/* Flag externally stored fields. */
278
const byte* buf = row_ext_lookup(ext, col_no,
280
if (UNIV_LIKELY_NULL(buf)) {
281
ut_a(buf != field_ref_zero);
282
if (i < dict_index_get_n_unique(index)) {
283
dfield_set_data(field, buf, len);
285
dfield_set_ext(field);
286
len = dfield_get_len(field);
290
const byte* buf = row_ext_lookup(ext, col_no,
292
if (UNIV_LIKELY_NULL(buf)) {
293
ut_a(buf != field_ref_zero);
294
dfield_set_data(field, buf, len);
298
/* If a column prefix index, take only the prefix */
300
if (ifield->prefix_len) {
301
len = dtype_get_at_most_n_mbchars(
303
col->mbminlen, col->mbmaxlen,
305
len, dfield_get_data(field));
306
dfield_set_len(field, len);
309
ut_ad(len <= col->len || col->mtype == DATA_BLOB);
311
if (ifield->fixed_len) {
312
ut_ad(len == ifield->fixed_len);
313
ut_ad(!dfield_is_ext(field));
314
} else if (dfield_is_ext(field)) {
317
|| (col->len < 256 && col->mtype != DATA_BLOB)) {
320
/* For variable-length columns, we look up the
321
maximum length from the column itself. If this
322
is a prefix index column shorter than 256 bytes,
323
this will waste one byte. */
334
size = rec_get_converted_size_comp(index,
336
entry, n_fields, &extra);
338
ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
339
ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
341
#endif /* UNIV_DEBUG */
343
/* Add to the total size of the record in row_merge_block_t
344
the encoded length of extra_size and the extra bytes (extra_size).
345
See row_merge_buf_write() for the variable-length encoding
347
data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
349
/* The following assertion may fail if row_merge_block_t is
350
declared very small and a PRIMARY KEY is being created with
351
many prefix columns. In that case, the record may exceed the
352
page_zip_rec_needs_ext() limit. However, no further columns
353
will be moved to external storage until the record is inserted
354
to the clustered index B-tree. */
355
ut_ad(data_size < sizeof(row_merge_block_t));
357
/* Reserve one byte for the end marker of row_merge_block_t. */
358
if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
362
buf->total_size += data_size;
367
/* Copy the data fields. */
370
dfield_dup(field++, buf->heap);
371
} while (--n_fields);
376
/* Structure for reporting duplicate records. */
377
struct row_merge_dup_struct {
378
const dict_index_t* index; /* index being sorted */
379
TABLE* table; /* MySQL table object */
380
ulint n_dup; /* number of duplicates */
383
typedef struct row_merge_dup_struct row_merge_dup_t;
385
/*****************************************************************
386
Report a duplicate key. */
389
row_merge_dup_report(
390
/*=================*/
391
row_merge_dup_t* dup, /* in/out: for reporting duplicates */
392
const dfield_t* entry) /* in: duplicate index entry */
395
const dtuple_t* tuple;
396
dtuple_t tuple_store;
398
const dict_index_t* index = dup->index;
399
ulint n_fields= dict_index_get_n_fields(index);
400
mem_heap_t* heap = NULL;
401
ulint offsets_[REC_OFFS_NORMAL_SIZE];
406
/* Only report the first duplicate record,
407
but count all duplicate records. */
411
rec_offs_init(offsets_);
413
/* Convert the tuple to a record and then to MySQL format. */
415
tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
416
n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
418
rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
419
offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
422
innobase_rec_to_mysql(dup->table, rec, index, offsets);
424
if (UNIV_LIKELY_NULL(heap)) {
429
/*****************************************************************
430
Compare two tuples. */
435
/* out: 1, 0, -1 if a is greater,
436
equal, less, respectively, than b */
437
ulint n_field,/* in: number of fields */
438
const dfield_t* a, /* in: first tuple to be compared */
439
const dfield_t* b, /* in: second tuple to be compared */
440
row_merge_dup_t* dup) /* in/out: for reporting duplicates */
443
const dfield_t* field = a;
446
cmp = cmp_dfield_dfield(a++, b++);
447
} while (!cmp && --n_field);
449
if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
450
row_merge_dup_report(dup, field);
456
/**************************************************************************
457
Merge sort the tuple buffer in main memory. */
460
row_merge_tuple_sort(
461
/*=================*/
462
ulint n_field,/* in: number of fields */
463
row_merge_dup_t* dup, /* in/out: for reporting duplicates */
464
const dfield_t** tuples, /* in/out: tuples */
465
const dfield_t** aux, /* in/out: work area */
466
ulint low, /* in: lower bound of the
467
sorting area, inclusive */
468
ulint high) /* in: upper bound of the
469
sorting area, exclusive */
471
#define row_merge_tuple_sort_ctx(a,b,c,d) \
472
row_merge_tuple_sort(n_field, dup, a, b, c, d)
473
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
475
UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
476
tuples, aux, low, high, row_merge_tuple_cmp_ctx);
479
/**********************************************************
485
row_merge_buf_t* buf, /* in/out: sort buffer */
486
row_merge_dup_t* dup) /* in/out: for reporting duplicates */
488
row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
489
buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
492
/**********************************************************
493
Write a buffer to a block. */
498
const row_merge_buf_t* buf, /* in: sorted buffer */
500
const merge_file_t* of, /* in: output file */
501
#endif /* UNIV_DEBUG */
502
row_merge_block_t* block) /* out: buffer for writing to file */
504
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
505
#endif /* !UNIV_DEBUG */
507
const dict_index_t* index = buf->index;
508
ulint n_fields= dict_index_get_n_fields(index);
509
byte* b = &(*block)[0];
513
for (i = 0; i < buf->n_tuples; i++) {
516
const dfield_t* entry = buf->tuples[i];
518
size = rec_get_converted_size_comp(index,
522
ut_ad(size > extra_size);
523
ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
524
extra_size -= REC_N_NEW_EXTRA_BYTES;
525
size -= REC_N_NEW_EXTRA_BYTES;
527
/* Encode extra_size + 1 */
528
if (extra_size + 1 < 0x80) {
529
*b++ = (byte) (extra_size + 1);
531
ut_ad((extra_size + 1) < 0x8000);
532
*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
533
*b++ = (byte) (extra_size + 1);
536
ut_ad(b + size < block[1]);
538
rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
545
if (row_merge_print_write) {
546
fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
547
(void*) b, of->fd, (ulong) of->offset,
549
row_merge_tuple_print(stderr, entry, n_fields);
551
#endif /* UNIV_DEBUG */
554
/* Write an "end-of-chunk" marker. */
556
ut_a(b == block[0] + buf->total_size);
558
#ifdef UNIV_DEBUG_VALGRIND
559
/* The rest of the block is uninitialized. Initialize it
560
to avoid bogus warnings. */
561
memset(b, 0xff, block[1] - b);
562
#endif /* UNIV_DEBUG_VALGRIND */
564
if (row_merge_print_write) {
565
fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
566
(void*) b, of->fd, (ulong) of->offset);
568
#endif /* UNIV_DEBUG */
571
/**********************************************************
572
Create a memory heap and allocate space for row_merge_rec_offsets(). */
575
row_merge_heap_create(
576
/*==================*/
577
/* out: memory heap */
578
const dict_index_t* index, /* in: record descriptor */
579
ulint** offsets1, /* out: offsets */
580
ulint** offsets2) /* out: offsets */
582
ulint i = 1 + REC_OFFS_HEADER_SIZE
583
+ dict_index_get_n_fields(index);
584
mem_heap_t* heap = mem_heap_create(2 * i * sizeof *offsets1);
586
*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
587
*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);
589
(*offsets1)[0] = (*offsets2)[0] = i;
590
(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);
595
/**************************************************************************
596
Search an index object by name and column names. If several indexes match,
597
return the index with the max id. */
600
row_merge_dict_table_get_index(
601
/*===========================*/
602
/* out: matching index,
604
dict_table_t* table, /* in: table */
605
const merge_index_def_t*index_def) /* in: index definition */
609
const char** column_names;
611
column_names = mem_alloc(index_def->n_fields * sizeof *column_names);
613
for (i = 0; i < index_def->n_fields; ++i) {
614
column_names[i] = index_def->fields[i].field_name;
617
index = dict_table_get_index_by_max_id(
618
table, index_def->name, column_names, index_def->n_fields);
620
mem_free((void*) column_names);
625
/************************************************************************
626
Read a merge block from the file system. */
631
/* out: TRUE if request was
632
successful, FALSE if fail */
633
int fd, /* in: file descriptor */
634
ulint offset, /* in: offset where to read */
635
row_merge_block_t* buf) /* out: data */
637
ib_uint64_t ofs = ((ib_uint64_t) offset) * sizeof *buf;
640
success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
641
(ulint) (ofs & 0xFFFFFFFF),
644
if (UNIV_UNLIKELY(!success)) {
645
ut_print_timestamp(stderr);
647
" InnoDB: failed to read merge block at %"PRIu64"\n", ofs);
650
return(UNIV_LIKELY(success));
653
/************************************************************************
654
Write a merge block to the file system. */
659
/* out: TRUE if request was
660
successful, FALSE if fail */
661
int fd, /* in: file descriptor */
662
ulint offset, /* in: offset where to write */
663
const void* buf) /* in: data */
665
ib_uint64_t ofs = ((ib_uint64_t) offset)
666
* sizeof(row_merge_block_t);
668
return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
669
(ulint) (ofs & 0xFFFFFFFF),
671
sizeof(row_merge_block_t))));
674
/************************************************************************
675
Read a merge record. */
680
/* out: pointer to next record,
683
row_merge_block_t* block, /* in/out: file buffer */
684
mrec_buf_t* buf, /* in/out: secondary buffer */
685
const byte* b, /* in: pointer to record */
686
const dict_index_t* index, /* in: index of the record */
687
int fd, /* in: file descriptor */
688
ulint* foffs, /* in/out: file offset */
689
const mrec_t** mrec, /* out: pointer to merge record,
690
or NULL on end of list
691
(non-NULL on I/O error) */
692
ulint* offsets)/* out: offsets of mrec */
700
ut_ad(b >= block[0]);
707
ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
708
+ dict_index_get_n_fields(index));
712
if (UNIV_UNLIKELY(!extra_size)) {
716
if (row_merge_print_read) {
717
fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
718
(const void*) b, (const void*) block,
721
#endif /* UNIV_DEBUG */
725
if (extra_size >= 0x80) {
726
/* Read another byte of extra_size. */
728
if (UNIV_UNLIKELY(b >= block[1])) {
729
if (!row_merge_read(fd, ++(*foffs), block)) {
731
/* Signal I/O error. */
736
/* Wrap around to the beginning of the buffer. */
740
extra_size = (extra_size & 0x7f) << 8;
744
/* Normalize extra_size. Above, value 0 signals "end of list". */
747
/* Read the extra bytes. */
749
if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
750
/* The record spans two blocks. Copy the entire record
751
to the auxiliary buffer and handle this as a special
754
avail_size = block[1] - b;
756
memcpy(*buf, b, avail_size);
758
if (!row_merge_read(fd, ++(*foffs), block)) {
763
/* Wrap around to the beginning of the buffer. */
766
/* Copy the record. */
767
memcpy(*buf + avail_size, b, extra_size - avail_size);
768
b += extra_size - avail_size;
770
*mrec = *buf + extra_size;
772
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
774
data_size = rec_offs_data_size(offsets);
776
/* These overflows should be impossible given that
777
records are much smaller than either buffer, and
778
the record starts near the beginning of each buffer. */
779
ut_a(extra_size + data_size < sizeof *buf);
780
ut_a(b + data_size < block[1]);
782
/* Copy the data bytes. */
783
memcpy(*buf + extra_size, b, data_size);
789
*mrec = b + extra_size;
791
rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
793
data_size = rec_offs_data_size(offsets);
794
ut_ad(extra_size + data_size < sizeof *buf);
796
b += extra_size + data_size;
798
if (UNIV_LIKELY(b < block[1])) {
799
/* The record fits entirely in the block.
800
This is the normal case. */
804
/* The record spans two blocks. Copy it to buf. */
806
b -= extra_size + data_size;
807
avail_size = block[1] - b;
808
memcpy(*buf, b, avail_size);
809
*mrec = *buf + extra_size;
810
rec_offs_make_valid(*mrec, index, offsets);
812
if (!row_merge_read(fd, ++(*foffs), block)) {
817
/* Wrap around to the beginning of the buffer. */
820
/* Copy the rest of the record. */
821
memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
822
b += extra_size + data_size - avail_size;
826
if (row_merge_print_read) {
827
fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
828
(const void*) b, (const void*) block,
830
rec_print_comp(stderr, *mrec, offsets);
833
#endif /* UNIV_DEBUG */
838
/************************************************************************
839
Write a merge record. */
842
row_merge_write_rec_low(
843
/*====================*/
844
byte* b, /* out: buffer */
845
ulint e, /* in: encoded extra_size */
847
ulint size, /* in: total size to write */
848
int fd, /* in: file descriptor */
849
ulint foffs, /* in: file offset */
850
#endif /* UNIV_DEBUG */
851
const mrec_t* mrec, /* in: record to write */
852
const ulint* offsets)/* in: offsets of mrec */
854
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets) \
855
row_merge_write_rec_low(b, e, mrec, offsets)
856
#endif /* !UNIV_DEBUG */
859
const byte* const end = b + size;
860
ut_ad(e == rec_offs_extra_size(offsets) + 1);
862
if (row_merge_print_write) {
863
fprintf(stderr, "row_merge_write %p,%d,%lu ",
864
(void*) b, fd, (ulong) foffs);
865
rec_print_comp(stderr, mrec, offsets);
868
#endif /* UNIV_DEBUG */
873
*b++ = (byte) (0x80 | (e >> 8));
877
memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
878
ut_ad(b + rec_offs_size(offsets) == end);
881
/************************************************************************
882
Write a merge record. */
887
/* out: pointer to end of block,
889
row_merge_block_t* block, /* in/out: file buffer */
890
mrec_buf_t* buf, /* in/out: secondary buffer */
891
byte* b, /* in: pointer to end of block */
892
int fd, /* in: file descriptor */
893
ulint* foffs, /* in/out: file offset */
894
const mrec_t* mrec, /* in: record to write */
895
const ulint* offsets)/* in: offsets of mrec */
903
ut_ad(b >= block[0]);
907
ut_ad(mrec < block[0] || mrec > block[1]);
908
ut_ad(mrec < buf[0] || mrec > buf[1]);
910
/* Normalize extra_size. Value 0 signals "end of list". */
911
extra_size = rec_offs_extra_size(offsets) + 1;
913
size = extra_size + (extra_size >= 0x80)
914
+ rec_offs_data_size(offsets);
916
if (UNIV_UNLIKELY(b + size >= block[1])) {
917
/* The record spans two blocks.
918
Copy it to the temporary buffer first. */
919
avail_size = block[1] - b;
921
row_merge_write_rec_low(buf[0],
922
extra_size, size, fd, *foffs,
925
/* Copy the head of the temporary buffer, write
926
the completed block, and copy the tail of the
927
record to the head of the new block. */
928
memcpy(b, buf[0], avail_size);
930
if (!row_merge_write(fd, (*foffs)++, block)) {
934
UNIV_MEM_INVALID(block[0], sizeof block[0]);
938
memcpy(b, buf[0] + avail_size, size - avail_size);
939
b += size - avail_size;
941
row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
949
/************************************************************************
950
Write an end-of-list marker. */
955
/* out: pointer to end of block,
957
row_merge_block_t* block, /* in/out: file buffer */
958
byte* b, /* in: pointer to end of block */
959
int fd, /* in: file descriptor */
960
ulint* foffs) /* in/out: file offset */
963
ut_ad(b >= block[0]);
967
if (row_merge_print_write) {
968
fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
969
(void*) b, (void*) block, fd, (ulong) *foffs);
971
#endif /* UNIV_DEBUG */
974
UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
975
UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
976
#ifdef UNIV_DEBUG_VALGRIND
977
/* The rest of the block is uninitialized. Initialize it
978
to avoid bogus warnings. */
979
memset(b, 0xff, block[1] - b);
980
#endif /* UNIV_DEBUG_VALGRIND */
982
if (!row_merge_write(fd, (*foffs)++, block)) {
986
UNIV_MEM_INVALID(block[0], sizeof block[0]);
990
/*****************************************************************
991
Compare two merge records. */
997
mrec1 is greater, equal, less,
998
respectively, than mrec2 */
999
const mrec_t* mrec1, /* in: first merge
1000
record to be compared */
1001
const mrec_t* mrec2, /* in: second merge
1002
record to be compared */
1003
const ulint* offsets1, /* in: first record offsets */
1004
const ulint* offsets2, /* in: second record offsets */
1005
const dict_index_t* index) /* in: index */
1009
cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);
1012
if (row_merge_print_cmp) {
1013
fputs("row_merge_cmp1 ", stderr);
1014
rec_print_comp(stderr, mrec1, offsets1);
1015
fputs("\nrow_merge_cmp2 ", stderr);
1016
rec_print_comp(stderr, mrec2, offsets2);
1017
fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
1019
#endif /* UNIV_DEBUG */
1024
/************************************************************************
1025
Reads the clustered index of the table and creates temporary files
1026
containing the index entries for the indexes to be built. */
1029
row_merge_read_clustered_index(
1030
/*===========================*/
1031
/* out: DB_SUCCESS or error */
1032
trx_t* trx, /* in: transaction */
1033
TABLE* table, /* in/out: MySQL table object,
1034
for reporting erroneous records */
1035
const dict_table_t* old_table,/* in: table where rows are
1037
const dict_table_t* new_table,/* in: table where indexes are
1038
created; identical to old_table
1039
unless creating a PRIMARY KEY */
1040
dict_index_t** index, /* in: indexes to be created */
1041
merge_file_t* files, /* in: temporary files */
1042
ulint n_index,/* in: number of indexes to create */
1043
row_merge_block_t* block) /* in/out: file buffer */
1045
dict_index_t* clust_index; /* Clustered index */
1046
mem_heap_t* row_heap; /* Heap memory to create
1047
clustered index records */
1048
row_merge_buf_t** merge_buf; /* Temporary list for records*/
1049
btr_pcur_t pcur; /* Persistent cursor on the
1051
mtr_t mtr; /* Mini transaction */
1052
ulint err = DB_SUCCESS;/* Return code */
1054
ulint n_nonnull = 0; /* number of columns
1055
changed to NOT NULL */
1056
ulint* nonnull = NULL; /* NOT NULL columns */
1058
trx->op_info = "reading clustered index";
1066
/* Create and initialize memory for record buffers */
1068
merge_buf = mem_alloc(n_index * sizeof *merge_buf);
1070
for (i = 0; i < n_index; i++) {
1071
merge_buf[i] = row_merge_buf_create(index[i]);
1076
/* Find the clustered index and create a persistent cursor
1079
clust_index = dict_table_get_first_index(old_table);
1081
btr_pcur_open_at_index_side(
1082
TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
1084
if (UNIV_UNLIKELY(old_table != new_table)) {
1085
ulint n_cols = dict_table_get_n_cols(old_table);
1087
/* A primary key will be created. Identify the
1088
columns that were flagged NOT NULL in the new table,
1089
so that we can quickly check that the records in the
1090
(old) clustered index do not violate the added NOT
1091
NULL constraints. */
1093
ut_a(n_cols == dict_table_get_n_cols(new_table));
1095
nonnull = mem_alloc(n_cols * sizeof *nonnull);
1097
for (i = 0; i < n_cols; i++) {
1098
if (dict_table_get_nth_col(old_table, i)->prtype
1104
if (dict_table_get_nth_col(new_table, i)->prtype
1107
nonnull[n_nonnull++] = i;
1117
row_heap = mem_heap_create(sizeof(mrec_buf_t));
1119
/* Scan the clustered index. */
1123
dtuple_t* row = NULL;
1125
ibool has_next = TRUE;
1127
btr_pcur_move_to_next_on_page(&pcur);
1129
/* When switching pages, commit the mini-transaction
1130
in order to release the latch on the old page. */
1132
if (btr_pcur_is_after_last_on_page(&pcur)) {
1133
btr_pcur_store_position(&pcur, &mtr);
1136
btr_pcur_restore_position(BTR_SEARCH_LEAF,
1138
has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
1141
if (UNIV_LIKELY(has_next)) {
1142
rec = btr_pcur_get_rec(&pcur);
1143
offsets = rec_get_offsets(rec, clust_index, NULL,
1144
ULINT_UNDEFINED, &row_heap);
1146
/* Skip delete marked records. */
1147
if (rec_get_deleted_flag(
1148
rec, dict_table_is_comp(old_table))) {
1152
srv_n_rows_inserted++;
1154
/* Build a row based on the clustered index. */
1156
row = row_build(ROW_COPY_POINTERS, clust_index,
1158
new_table, &ext, row_heap);
1160
if (UNIV_LIKELY_NULL(nonnull)) {
1161
for (i = 0; i < n_nonnull; i++) {
1163
= &row->fields[nonnull[i]];
1165
= dfield_get_type(field);
1167
ut_a(!(field_type->prtype
1170
if (dfield_is_null(field)) {
1171
err = DB_PRIMARY_KEY_IS_NULL;
1176
field_type->prtype |= DATA_NOT_NULL;
1181
/* Build all entries for all the indexes to be created
1182
in a single scan of the clustered index. */
1184
for (i = 0; i < n_index; i++) {
1185
row_merge_buf_t* buf = merge_buf[i];
1186
merge_file_t* file = &files[i];
1187
const dict_index_t* index = buf->index;
1190
(row && row_merge_buf_add(buf, row, ext))) {
1194
/* The buffer must be sufficiently large
1195
to hold at least one record. */
1196
ut_ad(buf->n_tuples || !has_next);
1198
/* We have enough data tuples to form a block.
1199
Sort them and write to disk. */
1201
if (buf->n_tuples) {
1202
if (dict_index_is_unique(index)) {
1203
row_merge_dup_t dup;
1204
dup.index = buf->index;
1208
row_merge_buf_sort(buf, &dup);
1211
err = DB_DUPLICATE_KEY;
1213
trx->error_key_num = i;
1217
row_merge_buf_sort(buf, NULL);
1221
row_merge_buf_write(buf, file, block);
1223
if (!row_merge_write(file->fd, file->offset++,
1225
err = DB_OUT_OF_FILE_SPACE;
1229
UNIV_MEM_INVALID(block[0], sizeof block[0]);
1230
merge_buf[i] = row_merge_buf_empty(buf);
1232
/* Try writing the record again, now that
1233
the buffer has been written out and emptied. */
1236
(row && !row_merge_buf_add(buf, row, ext))) {
1237
/* An empty buffer should have enough
1238
room for at least one record. */
1243
mem_heap_empty(row_heap);
1245
if (UNIV_UNLIKELY(!has_next)) {
1251
btr_pcur_close(&pcur);
1253
mem_heap_free(row_heap);
1255
if (UNIV_LIKELY_NULL(nonnull)) {
1259
for (i = 0; i < n_index; i++) {
1260
row_merge_buf_free(merge_buf[i]);
1263
mem_free(merge_buf);
1270
/*****************************************************************
1271
Merge two blocks of linked lists on disk and write a bigger block. */
1276
/* out: DB_SUCCESS or error code */
1277
const dict_index_t* index, /* in: index being created */
1278
merge_file_t* file, /* in/out: file containing
1280
row_merge_block_t* block, /* in/out: 3 buffers */
1281
ulint* foffs0, /* in/out: offset of first
1282
source list in the file */
1283
ulint* foffs1, /* in/out: offset of second
1284
source list in the file */
1285
merge_file_t* of, /* in/out: output file */
1286
TABLE* table) /* in/out: MySQL table, for
1287
reporting erroneous key value
1290
mem_heap_t* heap; /* memory heap for offsets0, offsets1 */
1292
mrec_buf_t buf[3]; /* buffer for handling split mrec in block[] */
1293
const byte* b0; /* pointer to block[0] */
1294
const byte* b1; /* pointer to block[1] */
1295
byte* b2; /* pointer to block[2] */
1296
const mrec_t* mrec0; /* merge rec, points to block[0] or buf[0] */
1297
const mrec_t* mrec1; /* merge rec, points to block[1] or buf[1] */
1298
ulint* offsets0;/* offsets of mrec0 */
1299
ulint* offsets1;/* offsets of mrec1 */
1301
heap = row_merge_heap_create(index, &offsets0, &offsets1);
1303
/* Write a record and read the next record. Split the output
1304
file in two halves, which can be merged on the following pass. */
1305
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END) \
1307
b2 = row_merge_write_rec(&block[2], &buf[2], b2, \
1308
of->fd, &of->offset, \
1309
mrec##N, offsets##N); \
1310
if (UNIV_UNLIKELY(!b2)) { \
1313
b##N = row_merge_read_rec(&block[N], &buf[N], \
1315
file->fd, foffs##N, \
1316
&mrec##N, offsets##N); \
1317
if (UNIV_UNLIKELY(!b##N)) { \
1325
if (!row_merge_read(file->fd, *foffs0, &block[0])
1326
|| !row_merge_read(file->fd, *foffs1, &block[1])) {
1328
mem_heap_free(heap);
1329
return(DB_CORRUPTION);
1336
b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
1337
foffs0, &mrec0, offsets0);
1338
b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
1339
foffs1, &mrec1, offsets1);
1340
if (UNIV_UNLIKELY(!b0 && mrec0)
1341
|| UNIV_UNLIKELY(!b1 && mrec1)) {
1346
while (mrec0 && mrec1) {
1347
switch (row_merge_cmp(mrec0, mrec1,
1348
offsets0, offsets1, index)) {
1351
(dict_index_is_unique(index))) {
1352
innobase_rec_to_mysql(table, mrec0,
1354
mem_heap_free(heap);
1355
return(DB_DUPLICATE_KEY);
1359
ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
1362
ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
1372
/* append all mrec0 to output */
1374
ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
1379
/* append all mrec1 to output */
1381
ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
1386
mem_heap_free(heap);
1387
b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
1388
return(b2 ? DB_SUCCESS : DB_CORRUPTION);
1391
/*****************************************************************
1392
Merge disk files. */
1397
/* out: DB_SUCCESS or error code */
1398
const dict_index_t* index, /* in: index being created */
1399
merge_file_t* file, /* in/out: file containing
1401
ulint half, /* in: half the file */
1402
row_merge_block_t* block, /* in/out: 3 buffers */
1403
int* tmpfd, /* in/out: temporary file handle */
1404
TABLE* table) /* in/out: MySQL table, for
1405
reporting erroneous key value
1408
ulint foffs0; /* first input offset */
1409
ulint foffs1; /* second input offset */
1410
ulint error; /* error code */
1411
merge_file_t of; /* output file */
1413
UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
1419
/* Merge blocks to the output file. */
1423
for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
1424
error = row_merge_blocks(index, file, block,
1425
&foffs0, &foffs1, &of, table);
1427
if (error != DB_SUCCESS) {
1432
/* Copy the last block, if there is one. */
1433
while (foffs0 < half) {
1434
if (!row_merge_read(file->fd, foffs0++, block)
1435
|| !row_merge_write(of.fd, of.offset++, block)) {
1436
return(DB_CORRUPTION);
1439
while (foffs1 < file->offset) {
1440
if (!row_merge_read(file->fd, foffs1++, block)
1441
|| !row_merge_write(of.fd, of.offset++, block)) {
1442
return(DB_CORRUPTION);
1446
/* Swap file descriptors for the next pass. */
1450
UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
1455
/*****************************************************************
1456
Merge disk files. */
1461
/* out: DB_SUCCESS or error code */
1462
const dict_index_t* index, /* in: index being created */
1463
merge_file_t* file, /* in/out: file containing
1465
row_merge_block_t* block, /* in/out: 3 buffers */
1466
int* tmpfd, /* in/out: temporary file handle */
1467
TABLE* table) /* in/out: MySQL table, for
1468
reporting erroneous key value
1471
ulint blksz; /* block size */
1473
for (blksz = 1; blksz < file->offset; blksz *= 2) {
1477
ut_ad(ut_is_2pow(blksz));
1478
half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
1479
error = row_merge(index, file, half, block, tmpfd, table);
1481
if (error != DB_SUCCESS) {
1489
/*****************************************************************
1490
Copy externally stored columns to the data tuple. */
1493
row_merge_copy_blobs(
1494
/*=================*/
1495
const mrec_t* mrec, /* in: merge record */
1496
const ulint* offsets,/* in: offsets of mrec */
1497
ulint zip_size,/* in: compressed page size in bytes, or 0 */
1498
dtuple_t* tuple, /* in/out: data tuple */
1499
mem_heap_t* heap) /* in/out: memory heap */
1502
ulint n_fields = dtuple_get_n_fields(tuple);
1504
for (i = 0; i < n_fields; i++) {
1507
dfield_t* field = dtuple_get_nth_field(tuple, i);
1509
if (!dfield_is_ext(field)) {
1513
ut_ad(!dfield_is_null(field));
1515
/* The table is locked during index creation.
1516
Therefore, externally stored columns cannot possibly
1517
be freed between the time the BLOB pointers are read
1518
(row_merge_read_clustered_index()) and dereferenced
1520
data = btr_rec_copy_externally_stored_field(
1521
mrec, offsets, zip_size, i, &len, heap);
1523
dfield_set_data(field, data, len);
1527
/************************************************************************
1528
Read sorted file containing index data tuples and insert these data
1529
tuples to the index */
1532
row_merge_insert_index_tuples(
1533
/*==========================*/
1534
/* out: DB_SUCCESS or error number */
1535
trx_t* trx, /* in: transaction */
1536
dict_index_t* index, /* in: index */
1537
dict_table_t* table, /* in: new table */
1538
ulint zip_size,/* in: compressed page size of
1539
the old table, or 0 if uncompressed */
1540
int fd, /* in: file descriptor */
1541
row_merge_block_t* block) /* in/out: file buffer */
1547
mem_heap_t* tuple_heap;
1548
mem_heap_t* graph_heap;
1549
ulint error = DB_SUCCESS;
1557
/* We use the insert query graph as the dummy graph
1558
needed in the row module call */
1560
trx->op_info = "inserting index entries";
1562
graph_heap = mem_heap_create(500);
1563
node = ins_node_create(INS_DIRECT, table, graph_heap);
1565
thr = pars_complete_graph_for_exec(node, trx, graph_heap);
1567
que_thr_move_to_run_state_for_mysql(thr, trx);
1569
tuple_heap = mem_heap_create(1000);
1572
ulint i = 1 + REC_OFFS_HEADER_SIZE
1573
+ dict_index_get_n_fields(index);
1574
offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
1576
offsets[1] = dict_index_get_n_fields(index);
1581
if (!row_merge_read(fd, foffs, block)) {
1582
error = DB_CORRUPTION;
1589
b = row_merge_read_rec(block, &buf, b, index,
1590
fd, &foffs, &mrec, offsets);
1591
if (UNIV_UNLIKELY(!b)) {
1592
/* End of list, or I/O error */
1594
error = DB_CORRUPTION;
1599
dtuple = row_rec_to_index_entry_low(
1600
mrec, index, offsets, &n_ext, tuple_heap);
1602
if (UNIV_UNLIKELY(n_ext)) {
1603
row_merge_copy_blobs(mrec, offsets, zip_size,
1604
dtuple, tuple_heap);
1608
node->table = table;
1609
node->trx_id = trx->id;
1611
ut_ad(dtuple_validate(dtuple));
1614
thr->run_node = thr;
1615
thr->prev_node = thr->common.parent;
1617
error = row_ins_index_entry(index, dtuple,
1620
if (UNIV_LIKELY(error == DB_SUCCESS)) {
1625
thr->lock_state = QUE_THR_LOCK_ROW;
1626
trx->error_state = error;
1627
que_thr_stop_for_mysql(thr);
1628
thr->lock_state = QUE_THR_LOCK_NOLOCK;
1629
} while (row_mysql_handle_errors(&error, trx,
1634
mem_heap_empty(tuple_heap);
1638
que_thr_stop_for_mysql_no_error(thr, trx);
1640
que_graph_free(thr->graph);
1644
mem_heap_free(tuple_heap);
1649
/*************************************************************************
1650
Sets an exclusive lock on a table, for the duration of creating indexes. */
1653
row_merge_lock_table(
1654
/*=================*/
1655
/* out: error code or DB_SUCCESS */
1656
trx_t* trx, /* in/out: transaction */
1657
dict_table_t* table, /* in: table to lock */
1658
enum lock_mode mode) /* in: LOCK_X or LOCK_S */
1666
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
1667
ut_ad(mode == LOCK_X || mode == LOCK_S);
1669
heap = mem_heap_create(512);
1671
trx->op_info = "setting table lock for creating or dropping index";
1673
node = sel_node_create(heap);
1674
thr = pars_complete_graph_for_exec(node, trx, heap);
1675
thr->graph->state = QUE_FORK_ACTIVE;
1677
/* We use the select query graph as the dummy graph needed
1678
in the lock module call */
1680
thr = que_fork_get_first_thr(que_node_get_parent(thr));
1681
que_thr_move_to_run_state_for_mysql(thr, trx);
1684
thr->run_node = thr;
1685
thr->prev_node = thr->common.parent;
1687
err = lock_table(0, table, mode, thr);
1689
trx->error_state = err;
1691
if (UNIV_LIKELY(err == DB_SUCCESS)) {
1692
que_thr_stop_for_mysql_no_error(thr, trx);
1694
que_thr_stop_for_mysql(thr);
1696
if (err != DB_QUE_THR_SUSPENDED) {
1697
ibool was_lock_wait;
1699
was_lock_wait = row_mysql_handle_errors(
1700
&err, trx, thr, NULL);
1702
if (was_lock_wait) {
1709
parent = que_node_get_parent(thr);
1710
run_thr = que_fork_start_command(parent);
1712
ut_a(run_thr == thr);
1714
/* There was a lock wait but the thread was not
1715
in a ready to run or running state. */
1716
trx->error_state = DB_LOCK_WAIT;
1722
que_graph_free(thr->graph);
1728
/*************************************************************************
1729
Drop an index from the InnoDB system tables. The data dictionary must
1730
have been locked exclusively by the caller, because the transaction
1731
will not be committed. */
1734
row_merge_drop_index(
1735
/*=================*/
1736
dict_index_t* index, /* in: index to be removed */
1737
dict_table_t* table, /* in: table */
1738
trx_t* trx) /* in: transaction handle */
1741
pars_info_t* info = pars_info_create();
1743
/* We use the private SQL parser of Innobase to generate the
1744
query graphs needed in deleting the dictionary data from system
1745
tables in Innobase. Deleting a row from SYS_INDEXES table also
1746
frees the file segments of the B-tree associated with the index. */
1748
static const char str1[] =
1749
"PROCEDURE DROP_INDEX_PROC () IS\n"
1751
"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
1752
"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
1753
" AND TABLE_ID = :tableid;\n"
1756
ut_ad(index && table && trx);
1758
pars_info_add_dulint_literal(info, "indexid", index->id);
1759
pars_info_add_dulint_literal(info, "tableid", table->id);
1761
trx_start_if_not_started(trx);
1762
trx->op_info = "dropping index";
1764
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
1766
err = que_eval_sql(info, str1, FALSE, trx);
1768
ut_a(err == DB_SUCCESS);
1770
/* Replace this index with another equivalent index for all
1771
foreign key constraints on this table where this index is used */
1773
dict_table_replace_index_in_foreign_list(table, index);
1774
dict_index_remove_from_cache(table, index);
1779
/*************************************************************************
1780
Drop those indexes which were created before an error occurred when
1781
building an index. The data dictionary must have been locked
1782
exclusively by the caller, because the transaction will not be
1786
row_merge_drop_indexes(
1787
/*===================*/
1788
trx_t* trx, /* in: transaction */
1789
dict_table_t* table, /* in: table containing the indexes */
1790
dict_index_t** index, /* in: indexes to drop */
1791
ulint num_created) /* in: number of elements in index[] */
1795
for (key_num = 0; key_num < num_created; key_num++) {
1796
row_merge_drop_index(index[key_num], table, trx);
1800
/*************************************************************************
1801
Drop all partially created indexes during crash recovery. */
1804
row_merge_drop_temp_indexes(void)
1805
/*=============================*/
1810
/* We use the private SQL parser of Innobase to generate the
1811
query graphs needed in deleting the dictionary data from system
1812
tables in Innobase. Deleting a row from SYS_INDEXES table also
1813
frees the file segments of the B-tree associated with the index. */
1814
#if TEMP_INDEX_PREFIX != '\377'
1815
# error "TEMP_INDEX_PREFIX != '\377'"
1817
static const char drop_temp_indexes[] =
1818
"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
1820
"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
1821
"WHERE SUBSTR(NAME,0,1)='\377' FOR UPDATE;\n"
1825
"\t\tFETCH c INTO indexid;\n"
1826
"\t\tIF (SQL % NOTFOUND) THEN\n"
1829
"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
1830
"\t\tDELETE FROM SYS_INDEXES WHERE CURRENT OF c;\n"
1836
trx = trx_allocate_for_background();
1837
trx->op_info = "dropping partially created indexes";
1838
row_mysql_lock_data_dictionary(trx);
1840
err = que_eval_sql(NULL, drop_temp_indexes, FALSE, trx);
1841
ut_a(err == DB_SUCCESS);
1843
row_mysql_unlock_data_dictionary(trx);
1844
trx_free_for_background(trx);
1847
/*************************************************************************
1848
Create a merge file. */
1851
row_merge_file_create(
1852
/*==================*/
1853
merge_file_t* merge_file) /* out: merge file structure */
1855
merge_file->fd = innobase_mysql_tmpfile();
1856
merge_file->offset = 0;
1859
/*************************************************************************
1860
Destroy a merge file. */
1863
row_merge_file_destroy(
1864
/*===================*/
1865
merge_file_t* merge_file) /* out: merge file structure */
1867
if (merge_file->fd != -1) {
1868
close(merge_file->fd);
1869
merge_file->fd = -1;
1873
/*************************************************************************
1874
Determine the precise type of a column that is added to a tem
1875
if a column must be constrained NOT NULL. */
1878
row_merge_col_prtype(
1879
/*=================*/
1880
/* out: col->prtype, possibly
1881
ORed with DATA_NOT_NULL */
1882
const dict_col_t* col, /* in: column */
1883
const char* col_name, /* in: name of the column */
1884
const merge_index_def_t*index_def) /* in: the index definition
1885
of the primary key */
1887
ulint prtype = col->prtype;
1890
ut_ad(index_def->ind_type & DICT_CLUSTERED);
1892
if (prtype & DATA_NOT_NULL) {
1897
/* All columns that are included
1898
in the PRIMARY KEY must be NOT NULL. */
1900
for (i = 0; i < index_def->n_fields; i++) {
1901
if (!strcmp(col_name, index_def->fields[i].field_name)) {
1902
return(prtype | DATA_NOT_NULL);
1909
/*************************************************************************
1910
Create a temporary table for creating a primary key, using the definition
1911
of an existing table. */
1914
row_merge_create_temporary_table(
1915
/*=============================*/
1918
const char* table_name, /* in: new table name */
1919
const merge_index_def_t*index_def, /* in: the index definition
1920
of the primary key */
1921
const dict_table_t* table, /* in: old table definition */
1922
trx_t* trx) /* in/out: transaction
1923
(sets error_state) */
1926
dict_table_t* new_table = NULL;
1927
ulint n_cols = dict_table_get_n_user_cols(table);
1929
mem_heap_t* heap = mem_heap_create(1000);
1934
ut_ad(mutex_own(&dict_sys->mutex));
1936
new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
1938
for (i = 0; i < n_cols; i++) {
1939
const dict_col_t* col;
1940
const char* col_name;
1942
col = dict_table_get_nth_col(table, i);
1943
col_name = dict_table_get_col_name(table, i);
1945
dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
1946
row_merge_col_prtype(col, col_name,
1951
error = row_create_table_for_mysql(new_table, trx);
1952
mem_heap_free(heap);
1954
if (error != DB_SUCCESS) {
1955
trx->error_state = error;
1956
dict_mem_table_free(new_table);
1963
/*************************************************************************
1964
Rename the temporary indexes in the dictionary to permanent ones. The
1965
data dictionary must have been locked exclusively by the caller,
1966
because the transaction will not be committed. */
1969
row_merge_rename_indexes(
1970
/*=====================*/
1971
/* out: DB_SUCCESS if all OK */
1972
trx_t* trx, /* in/out: transaction */
1973
dict_table_t* table) /* in/out: table with new indexes */
1975
ulint err = DB_SUCCESS;
1976
pars_info_t* info = pars_info_create();
1978
/* We use the private SQL parser of Innobase to generate the
1979
query graphs needed in renaming indexes. */
1981
#if TEMP_INDEX_PREFIX != '\377'
1982
# error "TEMP_INDEX_PREFIX != '\377'"
1985
static const char rename_indexes[] =
1986
"PROCEDURE RENAME_INDEXES_PROC () IS\n"
1988
"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
1989
"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='\377';\n"
1994
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
1996
trx->op_info = "renaming indexes";
1998
pars_info_add_dulint_literal(info, "tableid", table->id);
2000
err = que_eval_sql(info, rename_indexes, FALSE, trx);
2002
if (err == DB_SUCCESS) {
2003
dict_index_t* index = dict_table_get_first_index(table);
2005
if (*index->name == TEMP_INDEX_PREFIX) {
2008
index = dict_table_get_next_index(index);
2017
/*************************************************************************
2018
Rename the tables in the data dictionary. The data dictionary must
2019
have been locked exclusively by the caller, because the transaction
2020
will not be committed. */
2023
row_merge_rename_tables(
2024
/*====================*/
2025
/* out: error code or DB_SUCCESS */
2026
dict_table_t* old_table, /* in/out: old table, renamed to
2028
dict_table_t* new_table, /* in/out: new table, renamed to
2030
const char* tmp_name, /* in: new name for old_table */
2031
trx_t* trx) /* in: transaction handle */
2033
ulint err = DB_ERROR;
2035
const char* old_name= old_table->name;
2037
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
2038
ut_ad(old_table != new_table);
2039
ut_ad(mutex_own(&dict_sys->mutex));
2041
ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
2043
trx->op_info = "renaming tables";
2045
/* We use the private SQL parser of Innobase to generate the query
2046
graphs needed in updating the dictionary data in system tables. */
2048
info = pars_info_create();
2050
pars_info_add_str_literal(info, "new_name", new_table->name);
2051
pars_info_add_str_literal(info, "old_name", old_name);
2052
pars_info_add_str_literal(info, "tmp_name", tmp_name);
2054
err = que_eval_sql(info,
2055
"PROCEDURE RENAME_TABLES () IS\n"
2057
"UPDATE SYS_TABLES SET NAME = :tmp_name\n"
2058
" WHERE NAME = :old_name;\n"
2059
"UPDATE SYS_TABLES SET NAME = :old_name\n"
2060
" WHERE NAME = :new_name;\n"
2061
"END;\n", FALSE, trx);
2063
if (err != DB_SUCCESS) {
2068
/* The following calls will also rename the .ibd data files if
2069
the tables are stored in a single-table tablespace */
2071
if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
2072
|| !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
2078
err = dict_load_foreigns(old_name, TRUE);
2080
if (err != DB_SUCCESS) {
2082
trx->error_state = DB_SUCCESS;
2083
trx_general_rollback_for_mysql(trx, FALSE, NULL);
2084
trx->error_state = DB_SUCCESS;
2092
/*************************************************************************
2093
Create and execute a query graph for creating an index. */
2096
row_merge_create_index_graph(
2097
/*=========================*/
2098
/* out: DB_SUCCESS or error code */
2099
trx_t* trx, /* in: trx */
2100
dict_table_t* table, /* in: table */
2101
dict_index_t* index) /* in: index */
2103
ind_node_t* node; /* Index creation node */
2104
mem_heap_t* heap; /* Memory heap */
2105
que_thr_t* thr; /* Query thread */
2112
heap = mem_heap_create(512);
2114
index->table = table;
2115
node = ind_create_graph_create(index, heap);
2116
thr = pars_complete_graph_for_exec(node, trx, heap);
2118
ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
2120
que_run_threads(thr);
2122
err = trx->error_state;
2124
que_graph_free((que_t*) que_node_get_parent(thr));
2129
/*************************************************************************
2130
Create the index and load in to the dictionary. */
2133
row_merge_create_index(
2134
/*===================*/
2135
/* out: index, or NULL on error */
2136
trx_t* trx, /* in/out: trx (sets error_state) */
2137
dict_table_t* table, /* in: the index is on this table */
2138
const merge_index_def_t* /* in: the index definition */
2141
dict_index_t* index;
2143
ulint n_fields = index_def->n_fields;
2146
/* Create the index prototype, using the passed in def, this is not
2147
a persistent operation. We pass 0 as the space id, and determine at
2148
a lower level the space id where to store the table. */
2150
index = dict_mem_index_create(table->name, index_def->name,
2151
0, index_def->ind_type, n_fields);
2155
for (i = 0; i < n_fields; i++) {
2156
merge_index_field_t* ifield = &index_def->fields[i];
2158
dict_mem_index_add_field(index, ifield->field_name,
2159
ifield->prefix_len);
2162
/* Add the index to SYS_INDEXES, using the index prototype. */
2163
err = row_merge_create_index_graph(trx, table, index);
2165
if (err == DB_SUCCESS) {
2167
index = row_merge_dict_table_get_index(
2172
#ifdef ROW_MERGE_IS_INDEX_USABLE
2173
/* Note the id of the transaction that created this
2174
index, we use it to restrict readers from accessing
2175
this index, to ensure read consistency. */
2176
index->trx_id = trx->id;
2177
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2185
#ifdef ROW_MERGE_IS_INDEX_USABLE
2186
/*************************************************************************
2187
Check if a transaction can use an index. */
2190
row_merge_is_index_usable(
2191
/*======================*/
2192
const trx_t* trx, /* in: transaction */
2193
const dict_index_t* index) /* in: index to check */
2195
if (!trx->read_view) {
2199
return(ut_dulint_cmp(index->trx_id, trx->read_view->low_limit_id) < 0);
2201
#endif /* ROW_MERGE_IS_INDEX_USABLE */
2203
/*************************************************************************
2204
Drop the old table. */
2207
row_merge_drop_table(
2208
/*=================*/
2209
/* out: DB_SUCCESS or error code */
2210
trx_t* trx, /* in: transaction */
2211
dict_table_t* table) /* in: table to drop */
2213
/* There must be no open transactions on the table. */
2214
ut_a(table->n_mysql_handles_opened == 0);
2216
return(row_drop_table_for_mysql(table->name, trx, FALSE));
2219
/*************************************************************************
2220
Build indexes on a table by reading a clustered index,
2221
creating a temporary file containing index entries, merge sorting
2222
these index entries and inserting sorted index entries to indexes. */
2225
row_merge_build_indexes(
2226
/*====================*/
2227
/* out: DB_SUCCESS or error code */
2228
trx_t* trx, /* in: transaction */
2229
dict_table_t* old_table, /* in: table where rows are
2231
dict_table_t* new_table, /* in: table where indexes are
2232
created; identical to old_table
2233
unless creating a PRIMARY KEY */
2234
dict_index_t** indexes, /* in: indexes to be created */
2235
ulint n_indexes, /* in: size of indexes[] */
2236
TABLE* table) /* in/out: MySQL table, for
2237
reporting erroneous key value
2240
merge_file_t* merge_files;
2241
row_merge_block_t* block;
2253
trx_start_if_not_started(trx);
2255
/* Allocate memory for merge file data structure and initialize
2258
merge_files = mem_alloc(n_indexes * sizeof *merge_files);
2259
block_size = 3 * sizeof *block;
2260
block = os_mem_alloc_large(&block_size);
2262
for (i = 0; i < n_indexes; i++) {
2264
row_merge_file_create(&merge_files[i]);
2267
tmpfd = innobase_mysql_tmpfile();
2269
/* Reset the MySQL row buffer that is used when reporting
2271
innobase_rec_reset(table);
2273
/* Read clustered index of the table and create files for
2274
secondary index entries for merge sort */
2276
error = row_merge_read_clustered_index(
2277
trx, table, old_table, new_table, indexes,
2278
merge_files, n_indexes, block);
2280
if (error != DB_SUCCESS) {
2285
/* Now we have files containing index entries ready for
2286
sorting and inserting. */
2288
for (i = 0; i < n_indexes; i++) {
2289
error = row_merge_sort(indexes[i], &merge_files[i],
2290
block, &tmpfd, table);
2292
if (error == DB_SUCCESS) {
2293
error = row_merge_insert_index_tuples(
2294
trx, indexes[i], new_table,
2295
dict_table_zip_size(old_table),
2296
merge_files[i].fd, block);
2299
/* Close the temporary file to free up space. */
2300
row_merge_file_destroy(&merge_files[i]);
2302
if (error != DB_SUCCESS) {
2303
trx->error_key_num = i;
2311
for (i = 0; i < n_indexes; i++) {
2312
row_merge_file_destroy(&merge_files[i]);
2315
mem_free(merge_files);
2316
os_mem_free_large(block, block_size);