/*****************************************************************************

Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
New index creation routines using a merge sort

Created 12/4/2005 Jan Lindstrom
Completed by Sunny Bains and Marko Makela
*******************************************************/
#include "row0merge.h"
#include "dict0dict.h"
#include "dict0boot.h"
#include "dict0crea.h"
#include "dict0load.h"
#include "mach0data.h"
#include "trx0purge.h"
#include "read0read.h"
#include "lock0lock.h"
#include "data0data.h"
#include "data0type.h"
#include "pars0pars.h"
#include "handler0alter.h"
#ifdef UNIV_DEBUG
/** Set these in order to enable debug printout. */
static ibool	row_merge_print_cmp;
static ibool	row_merge_print_read;
static ibool	row_merge_print_write;
#endif /* UNIV_DEBUG */
69
/** @brief Block size for I/O operations in merge sort.
71
The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
72
rounded to a power of 2.
74
When not creating a PRIMARY KEY that contains column prefixes, this
75
can be set as small as UNIV_PAGE_SIZE / 2. See the comment above
76
ut_ad(data_size < sizeof(row_merge_block_t)). */
77
typedef byte row_merge_block_t[1048576];
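/* Editorial note: 1048576 bytes is 1 MiB (2^20), so the three-block
array allocated in row_merge_build_indexes() below occupies 3 MiB.
As an illustration, with the default 16 KiB UNIV_PAGE_SIZE one block
holds 64 pages' worth of merge records plus the one-byte end marker. */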
/** @brief Secondary buffer for I/O operations of merge records.

This buffer is used for writing or reading a record that spans two
row_merge_block_t.  Thus, it must be able to hold one merge record,
whose maximum size is the same as the minimum size of
row_merge_block_t. */
typedef byte	mrec_buf_t[UNIV_PAGE_SIZE];
/** @brief Merge record in row_merge_block_t.

The format is the same as a record in ROW_FORMAT=COMPACT with the
exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
typedef rec_t	mrec_t;
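/* Editorial illustration (compiled out): the framing of one merge
record, mirroring row_merge_buf_write() and row_merge_write_rec_low()
below.  A record is preceded by its extra_size + 1, encoded in one
byte when the value is below 0x80 and in two bytes (high bit set on
the first) otherwise; the function name and parameters here are
editorial, not part of InnoDB. */
#if 0
static byte*
row_merge_frame_rec_sketch(
	byte*		b,	/* out: write position in the block */
	ulint		extra_size, /* in: size of the record's extra bytes */
	const byte*	rec,	/* in: start of the extra bytes */
	ulint		size)	/* in: extra + data size of the record */
{
	ulint	e = extra_size + 1;	/* 0 is reserved as end-of-list */

	if (e < 0x80) {
		*b++ = (byte) e;
	} else {
		*b++ = (byte) (0x80 | (e >> 8));
		*b++ = (byte) e;
	}

	memcpy(b, rec, size);
	return(b + size);
}
#endif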
/** Buffer for sorting in main memory. */
struct row_merge_buf_struct {
	mem_heap_t*	heap;		/*!< memory heap where allocated */
	dict_index_t*	index;		/*!< the index the tuples belong to */
	ulint		total_size;	/*!< total amount of data bytes */
	ulint		n_tuples;	/*!< number of data tuples */
	ulint		max_tuples;	/*!< maximum number of data tuples */
	const dfield_t**tuples;		/*!< array of pointers to
					arrays of fields that form
					the data tuples */
	const dfield_t**tmp_tuples;	/*!< temporary copy of tuples,
					for sorting */
};

/** Buffer for sorting in main memory. */
typedef struct row_merge_buf_struct row_merge_buf_t;
/** Information about temporary files used in merge sort */
struct merge_file_struct {
	int	fd;	/*!< file descriptor */
	ulint	offset;	/*!< file offset */
};

/** Information about temporary files used in merge sort */
typedef struct merge_file_struct merge_file_t;
#ifdef UNIV_DEBUG
/******************************************************//**
Display a merge tuple. */
static
void
row_merge_tuple_print(
/*==================*/
	FILE*		f,	/*!< in: output stream */
	const dfield_t*	entry,	/*!< in: tuple to print */
	ulint		n_fields)/*!< in: number of fields in the tuple */
{
	ulint	j;

	for (j = 0; j < n_fields; j++) {
		const dfield_t*	field = &entry[j];

		if (dfield_is_null(field)) {
			fputs("\n NULL;", f);
		} else {
			ulint	field_len	= dfield_get_len(field);
			ulint	len		= ut_min(field_len, 20);
			if (dfield_is_ext(field)) {
				fputs("\nE", f);
			} else {
				fputs("\n ", f);
			}
			ut_print_buf(f, dfield_get_data(field), len);
			if (len != field_len) {
				fprintf(f, " (total %lu bytes)", field_len);
			}
		}
	}
	putc('\n', f);
}
#endif /* UNIV_DEBUG */
/******************************************************//**
Allocate a sort buffer.
@return own: sort buffer */
static
row_merge_buf_t*
row_merge_buf_create_low(
/*=====================*/
	mem_heap_t*	heap,	/*!< in: heap where allocated */
	dict_index_t*	index,	/*!< in: secondary index */
	ulint		max_tuples, /*!< in: maximum number of data tuples */
	ulint		buf_size) /*!< in: size of the buffer, in bytes */
{
	row_merge_buf_t*	buf;

	ut_ad(max_tuples > 0);
	ut_ad(max_tuples <= sizeof(row_merge_block_t));
	ut_ad(max_tuples < buf_size);

	buf = mem_heap_zalloc(heap, buf_size);
	buf->heap = heap;
	buf->index = index;
	buf->max_tuples = max_tuples;
	buf->tuples = mem_heap_alloc(heap,
				     2 * max_tuples * sizeof *buf->tuples);
	buf->tmp_tuples = buf->tuples + max_tuples;

	return(buf);
}
/******************************************************//**
Allocate a sort buffer.
@return own: sort buffer */
static
row_merge_buf_t*
row_merge_buf_create(
/*=================*/
	dict_index_t*	index)	/*!< in: secondary index */
{
	row_merge_buf_t*	buf;
	ulint			max_tuples;
	ulint			buf_size;
	mem_heap_t*		heap;

	max_tuples = sizeof(row_merge_block_t)
		/ ut_max(1, dict_index_get_min_size(index));

	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;

	heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));

	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);

	return(buf);
}
/******************************************************//**
Empty a sort buffer.
@return sort buffer */
static
row_merge_buf_t*
row_merge_buf_empty(
/*================*/
	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
{
	ulint		buf_size;
	ulint		max_tuples	= buf->max_tuples;
	mem_heap_t*	heap		= buf->heap;
	dict_index_t*	index		= buf->index;

	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;

	mem_heap_empty(heap);

	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
}
/******************************************************//**
Deallocate a sort buffer. */
static
void
row_merge_buf_free(
/*===============*/
	row_merge_buf_t*	buf)	/*!< in,own: sort buffer, to be freed */
{
	mem_heap_free(buf->heap);
}
/******************************************************//**
Insert a data tuple into a sort buffer.
@return TRUE if added, FALSE if out of space */
static
ibool
row_merge_buf_add(
/*==============*/
	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
	const dtuple_t*		row,	/*!< in: row in clustered index */
	const row_ext_t*	ext)	/*!< in: cache of externally stored
					column prefixes, or NULL */
{
	ulint			i;
	ulint			n_fields;
	ulint			data_size;
	ulint			extra_size;
	const dict_index_t*	index;
	dfield_t*		entry;
	dfield_t*		field;

	if (buf->n_tuples >= buf->max_tuples) {
		return(FALSE);
	}

	UNIV_PREFETCH_R(row->fields);

	index = buf->index;

	n_fields = dict_index_get_n_fields(index);

	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
	buf->tuples[buf->n_tuples] = entry;
	field = entry;

	data_size = 0;
	extra_size = UT_BITS_IN_BYTES(index->n_nullable);

	for (i = 0; i < n_fields; i++, field++) {
		const dict_field_t*	ifield;
		const dict_col_t*	col;
		ulint			col_no;
		const dfield_t*		row_field;
		ulint			len;

		ifield = dict_index_get_nth_field(index, i);
		col = ifield->col;
		col_no = dict_col_get_no(col);
		row_field = dtuple_get_nth_field(row, col_no);
		dfield_copy(field, row_field);
		len = dfield_get_len(field);

		if (dfield_is_null(field)) {
			ut_ad(!(col->prtype & DATA_NOT_NULL));
			continue;
		} else if (UNIV_LIKELY(!ext)) {
		} else if (dict_index_is_clust(index)) {
			/* Flag externally stored fields. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				if (i < dict_index_get_n_unique(index)) {
					dfield_set_data(field, buf, len);
				} else {
					dfield_set_ext(field);
					len = dfield_get_len(field);
				}
			}
		} else {
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				ut_a(buf != field_ref_zero);
				dfield_set_data(field, buf, len);
			}
		}

		/* If a column prefix index, take only the prefix */

		if (ifield->prefix_len) {
			len = dtype_get_at_most_n_mbchars(
				col->prtype,
				col->mbminlen, col->mbmaxlen,
				ifield->prefix_len,
				len, dfield_get_data(field));
			dfield_set_len(field, len);
		}

		ut_ad(len <= col->len || col->mtype == DATA_BLOB);

		if (ifield->fixed_len) {
			ut_ad(len == ifield->fixed_len);
			ut_ad(!dfield_is_ext(field));
		} else if (dfield_is_ext(field)) {
			extra_size += 2;
		} else if (len < 128
			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
			extra_size++;
		} else {
			/* For variable-length columns, we look up the
			maximum length from the column itself.  If this
			is a prefix index column shorter than 256 bytes,
			this will waste one byte. */
			extra_size += 2;
		}
		data_size += len;
	}

#ifdef UNIV_DEBUG
	{
		ulint	size;
		ulint	extra;

		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields, &extra);

		ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
		ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
	}
#endif /* UNIV_DEBUG */

	/* Add to the total size of the record in row_merge_block_t
	the encoded length of extra_size and the extra bytes (extra_size).
	See row_merge_buf_write() for the variable-length encoding
	of extra_size. */
	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);

	/* The following assertion may fail if row_merge_block_t is
	declared very small and a PRIMARY KEY is being created with
	many prefix columns.  In that case, the record may exceed the
	page_zip_rec_needs_ext() limit.  However, no further columns
	will be moved to external storage until the record is inserted
	to the clustered index B-tree. */
	ut_ad(data_size < sizeof(row_merge_block_t));

	/* Reserve one byte for the end marker of row_merge_block_t. */
	if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
		return(FALSE);
	}

	buf->total_size += data_size;
	buf->n_tuples++;

	field = entry;

	/* Copy the data fields. */

	do {
		dfield_dup(field++, buf->heap);
	} while (--n_fields);

	return(TRUE);
}
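/* Editorial example of the data_size accounting above: a tuple with
extra_size == 5 and 100 data bytes reserves 100 + 5 + 1 = 106 bytes
(one length byte), while extra_size == 200 gives 200 + 1 = 201 >= 0x80,
so two length bytes are reserved: 100 + 200 + 2 = 302 bytes. */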
/** Structure for reporting duplicate records. */
struct row_merge_dup_struct {
	const dict_index_t*	index;	/*!< index being sorted */
	TABLE*			table;	/*!< MySQL table object */
	ulint			n_dup;	/*!< number of duplicates */
};

/** Structure for reporting duplicate records. */
typedef struct row_merge_dup_struct row_merge_dup_t;
/*************************************************************//**
Report a duplicate key. */
static
void
row_merge_dup_report(
/*=================*/
	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
	const dfield_t*		entry)	/*!< in: duplicate index entry */
{
	mrec_buf_t		buf;
	const dtuple_t*		tuple;
	dtuple_t		tuple_store;
	const rec_t*		rec;
	const dict_index_t*	index	= dup->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	mem_heap_t*		heap	= NULL;
	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*			offsets;
	ulint			n_ext;

	if (dup->n_dup++) {
		/* Only report the first duplicate record,
		but count all duplicate records. */
		return;
	}

	rec_offs_init(offsets_);

	/* Convert the tuple to a record and then to MySQL format. */

	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;

	rec = rec_convert_dtuple_to_rec(buf, index, tuple, n_ext);
	offsets = rec_get_offsets(rec, index, offsets_, ULINT_UNDEFINED,
				  &heap);

	innobase_rec_to_mysql(dup->table, rec, index, offsets);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
/*************************************************************//**
Compare two tuples.
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
static
int
row_merge_tuple_cmp(
/*================*/
	ulint			n_field,/*!< in: number of fields */
	const dfield_t*		a,	/*!< in: first tuple to be compared */
	const dfield_t*		b,	/*!< in: second tuple to be compared */
	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
{
	int		cmp;
	const dfield_t*	field	= a;

	/* Compare the fields of the tuples until a difference is
	found or we run out of fields to compare.  If !cmp at the
	end, the tuples are equal. */
	do {
		cmp = cmp_dfield_dfield(a++, b++);
	} while (!cmp && --n_field);

	if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
		/* Report a duplicate value error if the tuples are
		logically equal.  NULL columns are logically unequal,
		although they are equal in the sorting order.  Find
		out if any of the fields are NULL. */
		for (b = field; b != a; b++) {
			if (dfield_is_null(b)) {

				goto func_exit;
			}
		}

		row_merge_dup_report(dup, field);
	}

func_exit:
	return(cmp);
}
/** Wrapper for row_merge_tuple_sort() to inject some more context to
UT_SORT_FUNCTION_BODY().
@param a	array of tuples that are being sorted
@param b	aux (work area), same size as tuples[]
@param c	lower bound of the sorting area, inclusive
@param d	upper bound of the sorting area, exclusive */
#define row_merge_tuple_sort_ctx(a,b,c,d) \
	row_merge_tuple_sort(n_field, dup, a, b, c, d)
/** Wrapper for row_merge_tuple_cmp() to inject some more context to
UT_SORT_FUNCTION_BODY().
@param a	first tuple to be compared
@param b	second tuple to be compared
@return 1, 0, -1 if a is greater, equal, less, respectively, than b */
#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
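/* Editorial note: the effect of these wrappers is that inside
row_merge_tuple_sort() the expansion

	row_merge_tuple_cmp_ctx(tuples[i], tuples[j])
		-> row_merge_tuple_cmp(n_field, tuples[i], tuples[j], dup)

lets the comparator pick up n_field and dup from the enclosing
function without UT_SORT_FUNCTION_BODY() having to thread extra
arguments through the sort. */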
/**********************************************************************//**
Merge sort the tuple buffer in main memory. */
static
void
row_merge_tuple_sort(
/*=================*/
	ulint			n_field,/*!< in: number of fields */
	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
	const dfield_t**	tuples,	/*!< in/out: tuples */
	const dfield_t**	aux,	/*!< in/out: work area */
	ulint			low,	/*!< in: lower bound of the
					sorting area, inclusive */
	ulint			high)	/*!< in: upper bound of the
					sorting area, exclusive */
{
	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
}
/******************************************************//**
Sort a sort buffer. */
static
void
row_merge_buf_sort(
/*===============*/
	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
{
	row_merge_tuple_sort(dict_index_get_n_unique(buf->index), dup,
			     buf->tuples, buf->tmp_tuples, 0, buf->n_tuples);
}
/******************************************************//**
Write a buffer to a block. */
static
void
row_merge_buf_write(
/*================*/
	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
#ifdef UNIV_DEBUG
	const merge_file_t*	of,	/*!< in: output file */
#endif /* UNIV_DEBUG */
	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
#ifndef UNIV_DEBUG
# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
#endif /* !UNIV_DEBUG */
{
	const dict_index_t*	index	= buf->index;
	ulint			n_fields= dict_index_get_n_fields(index);
	byte*			b	= &(*block)[0];
	ulint			i;

	for (i = 0; i < buf->n_tuples; i++) {
		ulint		size;
		ulint		extra_size;
		const dfield_t*	entry	= buf->tuples[i];

		size = rec_get_converted_size_comp(index,
						   REC_STATUS_ORDINARY,
						   entry, n_fields,
						   &extra_size);
		ut_ad(size > extra_size);
		ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
		extra_size -= REC_N_NEW_EXTRA_BYTES;
		size -= REC_N_NEW_EXTRA_BYTES;

		/* Encode extra_size + 1 */
		if (extra_size + 1 < 0x80) {
			*b++ = (byte) (extra_size + 1);
		} else {
			ut_ad((extra_size + 1) < 0x8000);
			*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
			*b++ = (byte) (extra_size + 1);
		}

		ut_ad(b + size < block[1]);

		rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
					       REC_STATUS_ORDINARY,
					       entry, n_fields);

		b += size;

#ifdef UNIV_DEBUG
		if (row_merge_print_write) {
			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
				(void*) b, of->fd, (ulong) of->offset,
				(ulong) i);
			row_merge_tuple_print(stderr, entry, n_fields);
		}
#endif /* UNIV_DEBUG */
	}

	/* Write an "end-of-chunk" marker. */
	ut_a(b < block[1]);
	ut_a(b == block[0] + buf->total_size);
	*b++ = 0;
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
			(void*) b, of->fd, (ulong) of->offset);
	}
#endif /* UNIV_DEBUG */
}
/******************************************************//**
Create a memory heap and allocate space for row_merge_rec_offsets().
@return memory heap */
static
mem_heap_t*
row_merge_heap_create(
/*==================*/
	const dict_index_t*	index,		/*!< in: record descriptor */
	ulint**			offsets1,	/*!< out: offsets */
	ulint**			offsets2)	/*!< out: offsets */
{
	ulint		i	= 1 + REC_OFFS_HEADER_SIZE
		+ dict_index_get_n_fields(index);
	mem_heap_t*	heap	= mem_heap_create(2 * i * sizeof *offsets1);

	*offsets1 = mem_heap_alloc(heap, i * sizeof *offsets1);
	*offsets2 = mem_heap_alloc(heap, i * sizeof *offsets2);

	(*offsets1)[0] = (*offsets2)[0] = i;
	(*offsets1)[1] = (*offsets2)[1] = dict_index_get_n_fields(index);

	return(heap);
}
/**********************************************************************//**
Search an index object by name and column names.  If several indexes match,
return the index with the max id.
@return matching index, NULL if not found */
static
dict_index_t*
row_merge_dict_table_get_index(
/*===========================*/
	dict_table_t*		table,		/*!< in: table */
	const merge_index_def_t*index_def)	/*!< in: index definition */
{
	ulint		i;
	dict_index_t*	index;
	const char**	column_names;

	column_names = mem_alloc(index_def->n_fields * sizeof *column_names);

	for (i = 0; i < index_def->n_fields; ++i) {
		column_names[i] = index_def->fields[i].field_name;
	}

	index = dict_table_get_index_by_max_id(
		table, index_def->name, column_names, index_def->n_fields);

	mem_free((void*) column_names);

	return(index);
}
/********************************************************************//**
Read a merge block from the file system.
@return TRUE if request was successful, FALSE if fail */
static
ibool
row_merge_read(
/*===========*/
	int			fd,	/*!< in: file descriptor */
	ulint			offset,	/*!< in: offset where to read */
	row_merge_block_t*	buf)	/*!< out: data */
{
	ib_uint64_t	ofs = ((ib_uint64_t) offset) * sizeof *buf;
	ibool		success;

	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
						 (ulint) (ofs & 0xFFFFFFFF),
						 (ulint) (ofs >> 32),
						 sizeof *buf);
	if (UNIV_UNLIKELY(!success)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: failed to read"
			" merge block at %"PRIu64"\n", ofs);
	}

	return(UNIV_LIKELY(success));
}
/********************************************************************//**
Write a merge block to the file system.
@return TRUE if request was successful, FALSE if fail */
static
ibool
row_merge_write(
/*============*/
	int		fd,	/*!< in: file descriptor */
	ulint		offset,	/*!< in: offset where to write */
	const void*	buf)	/*!< in: data */
{
	ib_uint64_t	ofs = ((ib_uint64_t) offset)
		* sizeof(row_merge_block_t);

	return(UNIV_LIKELY(os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
					 (ulint) (ofs & 0xFFFFFFFF),
					 (ulint) (ofs >> 32),
					 sizeof(row_merge_block_t))));
}
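/* Editorial example: offsets in row_merge_read() and row_merge_write()
count whole blocks, not bytes.  With sizeof(row_merge_block_t) being
1 MiB, row_merge_read(fd, 3, buf) reads the byte range [3 MiB, 4 MiB)
of the file; the 64-bit byte offset is split into 32-bit halves for
the os_file_* calls. */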
/********************************************************************//**
Read a merge record.
@return pointer to next record, or NULL on I/O error or end of list */
static
const byte*
row_merge_read_rec(
/*===============*/
	row_merge_block_t*	block,	/*!< in/out: file buffer */
	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
	const byte*		b,	/*!< in: pointer to record */
	const dict_index_t*	index,	/*!< in: index of the record */
	int			fd,	/*!< in: file descriptor */
	ulint*			foffs,	/*!< in/out: file offset */
	const mrec_t**		mrec,	/*!< out: pointer to merge record,
					or NULL on end of list
					(non-NULL on I/O error) */
	ulint*			offsets)/*!< out: offsets of mrec */
{
	ulint	extra_size;
	ulint	data_size;
	ulint	avail_size;

	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);

	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
	      + dict_index_get_n_fields(index));

	extra_size = *b++;

	if (UNIV_UNLIKELY(!extra_size)) {
		/* End of list */
		*mrec = NULL;
#ifdef UNIV_DEBUG
		if (row_merge_print_read) {
			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
				(const void*) b, (const void*) block,
				fd, (ulong) *foffs);
		}
#endif /* UNIV_DEBUG */
		return(NULL);
	}

	if (extra_size >= 0x80) {
		/* Read another byte of extra_size. */

		if (UNIV_UNLIKELY(b >= block[1])) {
			if (!row_merge_read(fd, ++(*foffs), block)) {
err_exit:
				/* Signal I/O error. */
				*mrec = b;
				return(NULL);
			}

			/* Wrap around to the beginning of the buffer. */
			b = block[0];
		}

		extra_size = (extra_size & 0x7f) << 8;
		extra_size |= *b++;
	}

	/* Normalize extra_size.  Above, value 0 signals "end of list". */
	extra_size--;

	/* Read the extra bytes. */

	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
		/* The record spans two blocks.  Copy the entire record
		to the auxiliary buffer and handle this as a special
		case. */

		avail_size = block[1] - b;

		memcpy(*buf, b, avail_size);

		if (!row_merge_read(fd, ++(*foffs), block)) {

			goto err_exit;
		}

		/* Wrap around to the beginning of the buffer. */
		b = block[0];

		/* Copy the record. */
		memcpy(*buf + avail_size, b, extra_size - avail_size);
		b += extra_size - avail_size;

		*mrec = *buf + extra_size;

		rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

		data_size = rec_offs_data_size(offsets);

		/* These overflows should be impossible given that
		records are much smaller than either buffer, and
		the record starts near the beginning of each buffer. */
		ut_a(extra_size + data_size < sizeof *buf);
		ut_a(b + data_size < block[1]);

		/* Copy the data bytes. */
		memcpy(*buf + extra_size, b, data_size);
		b += data_size;

		goto func_exit;
	}

	*mrec = b + extra_size;

	rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);

	data_size = rec_offs_data_size(offsets);
	ut_ad(extra_size + data_size < sizeof *buf);

	b += extra_size + data_size;

	if (UNIV_LIKELY(b < block[1])) {
		/* The record fits entirely in the block.
		This is the normal case. */
		goto func_exit;
	}

	/* The record spans two blocks.  Copy it to buf. */

	b -= extra_size + data_size;
	avail_size = block[1] - b;
	memcpy(*buf, b, avail_size);
	*mrec = *buf + extra_size;
#ifdef UNIV_DEBUG
	/* We cannot invoke rec_offs_make_valid() here, because there
	are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
	Similarly, rec_offs_validate() would fail, because it invokes
	rec_get_status(). */
	offsets[2] = (ulint) *mrec;
	offsets[3] = (ulint) index;
#endif /* UNIV_DEBUG */

	if (!row_merge_read(fd, ++(*foffs), block)) {

		goto err_exit;
	}

	/* Wrap around to the beginning of the buffer. */
	b = block[0];

	/* Copy the rest of the record. */
	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
	b += extra_size + data_size - avail_size;

func_exit:
#ifdef UNIV_DEBUG
	if (row_merge_print_read) {
		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
			(const void*) b, (const void*) block,
			fd, (ulong) *foffs);
		rec_print_comp(stderr, *mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	return(b);
}
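/* Editorial illustration (compiled out): the two-block reassembly
performed above, reduced to its core.  The names are editorial; the
real code also re-reads the next block and fixes up b. */
#if 0
static const byte*
row_merge_reassemble_sketch(
	mrec_buf_t*	buf,	    /* in/out: secondary buffer */
	const byte*	tail,	    /* in: record tail in the old block */
	ulint		avail_size, /* in: bytes left in the old block */
	const byte*	next_block, /* in: start of the newly read block */
	ulint		total_size) /* in: extra + data size of the record */
{
	/* Head of the record from the old block... */
	memcpy(*buf, tail, avail_size);
	/* ...and the remainder from the new block. */
	memcpy(*buf + avail_size, next_block, total_size - avail_size);
	return(*buf);
}
#endif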
/********************************************************************//**
Write a merge record. */
static
void
row_merge_write_rec_low(
/*====================*/
	byte*		b,	/*!< out: buffer */
	ulint		e,	/*!< in: encoded extra_size */
#ifdef UNIV_DEBUG
	ulint		size,	/*!< in: total size to write */
	int		fd,	/*!< in: file descriptor */
	ulint		foffs,	/*!< in: file offset */
#endif /* UNIV_DEBUG */
	const mrec_t*	mrec,	/*!< in: record to write */
	const ulint*	offsets)/*!< in: offsets of mrec */
#ifndef UNIV_DEBUG
# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
	row_merge_write_rec_low(b, e, mrec, offsets)
#endif /* !UNIV_DEBUG */
{
#ifdef UNIV_DEBUG
	const byte* const end = b + size;
	ut_ad(e == rec_offs_extra_size(offsets) + 1);

	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%d,%lu ",
			(void*) b, fd, (ulong) foffs);
		rec_print_comp(stderr, mrec, offsets);
		putc('\n', stderr);
	}
#endif /* UNIV_DEBUG */

	if (e < 0x80) {
		*b++ = (byte) e;
	} else {
		*b++ = (byte) (0x80 | (e >> 8));
		*b++ = (byte) e;
	}

	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
	ut_ad(b + rec_offs_size(offsets) == end);
}
/********************************************************************//**
Write a merge record.
@return pointer to end of block, or NULL on error */
static
byte*
row_merge_write_rec(
/*================*/
	row_merge_block_t*	block,	/*!< in/out: file buffer */
	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
	byte*			b,	/*!< in: pointer to end of block */
	int			fd,	/*!< in: file descriptor */
	ulint*			foffs,	/*!< in/out: file offset */
	const mrec_t*		mrec,	/*!< in: record to write */
	const ulint*		offsets)/*!< in: offsets of mrec */
{
	ulint	extra_size;
	ulint	size;
	ulint	avail_size;

	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
	ut_ad(mrec < block[0] || mrec > block[1]);
	ut_ad(mrec < buf[0] || mrec > buf[1]);

	/* Normalize extra_size.  Value 0 signals "end of list". */
	extra_size = rec_offs_extra_size(offsets) + 1;

	size = extra_size + (extra_size >= 0x80)
		+ rec_offs_data_size(offsets);

	if (UNIV_UNLIKELY(b + size >= block[1])) {
		/* The record spans two blocks.
		Copy it to the temporary buffer first. */
		avail_size = block[1] - b;

		row_merge_write_rec_low(buf[0],
					extra_size, size, fd, *foffs,
					mrec, offsets);

		/* Copy the head of the temporary buffer, write
		the completed block, and copy the tail of the
		record to the head of the new block. */
		memcpy(b, buf[0], avail_size);

		if (!row_merge_write(fd, (*foffs)++, block)) {

			return(NULL);
		}

		UNIV_MEM_INVALID(block[0], sizeof block[0]);

		/* Copy the rest. */
		b = block[0];
		memcpy(b, buf[0] + avail_size, size - avail_size);
		b += size - avail_size;
	} else {
		row_merge_write_rec_low(b, extra_size, size, fd, *foffs,
					mrec, offsets);
		b += size;
	}

	return(b);
}
/********************************************************************//**
Write an end-of-list marker.
@return pointer to end of block, or NULL on error */
static
byte*
row_merge_write_eof(
/*================*/
	row_merge_block_t*	block,	/*!< in/out: file buffer */
	byte*			b,	/*!< in: pointer to end of block */
	int			fd,	/*!< in: file descriptor */
	ulint*			foffs)	/*!< in/out: file offset */
{
	ut_ad(b >= block[0]);
	ut_ad(b < block[1]);
#ifdef UNIV_DEBUG
	if (row_merge_print_write) {
		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
			(void*) b, (void*) block, fd, (ulong) *foffs);
	}
#endif /* UNIV_DEBUG */

	*b++ = 0;
	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
#ifdef UNIV_DEBUG_VALGRIND
	/* The rest of the block is uninitialized.  Initialize it
	to avoid bogus warnings. */
	memset(b, 0xff, block[1] - b);
#endif /* UNIV_DEBUG_VALGRIND */

	if (!row_merge_write(fd, (*foffs)++, block)) {

		return(NULL);
	}

	UNIV_MEM_INVALID(block[0], sizeof block[0]);
	return(block[0]);
}
/*************************************************************//**
Compare two merge records.
@return 1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
static
int
row_merge_cmp(
/*==========*/
	const mrec_t*		mrec1,		/*!< in: first merge
						record to be compared */
	const mrec_t*		mrec2,		/*!< in: second merge
						record to be compared */
	const ulint*		offsets1,	/*!< in: first record offsets */
	const ulint*		offsets2,	/*!< in: second record offsets */
	const dict_index_t*	index)		/*!< in: index */
{
	int	cmp;

	cmp = cmp_rec_rec_simple(mrec1, mrec2, offsets1, offsets2, index);

#ifdef UNIV_DEBUG
	if (row_merge_print_cmp) {
		fputs("row_merge_cmp1 ", stderr);
		rec_print_comp(stderr, mrec1, offsets1);
		fputs("\nrow_merge_cmp2 ", stderr);
		rec_print_comp(stderr, mrec2, offsets2);
		fprintf(stderr, "\nrow_merge_cmp=%d\n", cmp);
	}
#endif /* UNIV_DEBUG */

	return(cmp);
}
/********************************************************************//**
Reads clustered index of the table and create temporary files
containing the index entries for the indexes to be built.
@return DB_SUCCESS or error */
static
ulint
row_merge_read_clustered_index(
/*===========================*/
	trx_t*			trx,	/*!< in: transaction */
	TABLE*			table,	/*!< in/out: MySQL table object,
					for reporting erroneous records */
	const dict_table_t*	old_table,/*!< in: table where rows are
					read from */
	const dict_table_t*	new_table,/*!< in: table where indexes are
					created; identical to old_table
					unless creating a PRIMARY KEY */
	dict_index_t**		index,	/*!< in: indexes to be created */
	merge_file_t*		files,	/*!< in: temporary files */
	ulint			n_index,/*!< in: number of indexes to create */
	row_merge_block_t*	block)	/*!< in/out: file buffer */
{
	dict_index_t*		clust_index;	/* Clustered index */
	mem_heap_t*		row_heap;	/* Heap memory to create
						clustered index records */
	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
	btr_pcur_t		pcur;		/* Persistent cursor on the
						clustered index */
	mtr_t			mtr;		/* Mini transaction */
	ulint			err = DB_SUCCESS;/* Return code */
	ulint			i;
	ulint			n_nonnull = 0;	/* number of columns
						changed to NOT NULL */
	ulint*			nonnull = NULL;	/* NOT NULL columns */

	trx->op_info = "reading clustered index";

	/* Create and initialize memory for record buffers */

	merge_buf = mem_alloc(n_index * sizeof *merge_buf);

	for (i = 0; i < n_index; i++) {
		merge_buf[i] = row_merge_buf_create(index[i]);
	}

	mtr_start(&mtr);

	/* Find the clustered index and create a persistent cursor
	based on that. */

	clust_index = dict_table_get_first_index(old_table);

	btr_pcur_open_at_index_side(
		TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);

	if (UNIV_UNLIKELY(old_table != new_table)) {
		ulint	n_cols = dict_table_get_n_cols(old_table);

		/* A primary key will be created.  Identify the
		columns that were flagged NOT NULL in the new table,
		so that we can quickly check that the records in the
		(old) clustered index do not violate the added NOT
		NULL constraints. */

		ut_a(n_cols == dict_table_get_n_cols(new_table));

		nonnull = mem_alloc(n_cols * sizeof *nonnull);

		for (i = 0; i < n_cols; i++) {
			if (dict_table_get_nth_col(old_table, i)->prtype
			    & DATA_NOT_NULL) {

				continue;
			}

			if (dict_table_get_nth_col(new_table, i)->prtype
			    & DATA_NOT_NULL) {

				nonnull[n_nonnull++] = i;
			}
		}

		if (!n_nonnull) {
			mem_free(nonnull);
			nonnull = NULL;
		}
	}

	row_heap = mem_heap_create(sizeof(mrec_buf_t));

	/* Scan the clustered index. */

	for (;;) {
		const rec_t*	rec;
		ulint*		offsets;
		dtuple_t*	row		= NULL;
		row_ext_t*	ext;
		ibool		has_next	= TRUE;

		btr_pcur_move_to_next_on_page(&pcur);

		/* When switching pages, commit the mini-transaction
		in order to release the latch on the old page. */

		if (btr_pcur_is_after_last_on_page(&pcur)) {
			btr_pcur_store_position(&pcur, &mtr);
			mtr_commit(&mtr);
			mtr_start(&mtr);
			btr_pcur_restore_position(BTR_SEARCH_LEAF,
						  &pcur, &mtr);
			has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
		}

		if (UNIV_LIKELY(has_next)) {
			rec = btr_pcur_get_rec(&pcur);
			offsets = rec_get_offsets(rec, clust_index, NULL,
						  ULINT_UNDEFINED, &row_heap);

			/* Skip delete marked records. */
			if (rec_get_deleted_flag(
				    rec, dict_table_is_comp(old_table))) {
				continue;
			}

			srv_n_rows_inserted++;

			/* Build a row based on the clustered index. */

			row = row_build(ROW_COPY_POINTERS, clust_index,
					rec, offsets,
					new_table, &ext, row_heap);

			if (UNIV_LIKELY_NULL(nonnull)) {
				for (i = 0; i < n_nonnull; i++) {
					dfield_t*	field
						= &row->fields[nonnull[i]];
					dtype_t*	field_type
						= dfield_get_type(field);

					ut_a(!(field_type->prtype
					       & DATA_NOT_NULL));

					if (dfield_is_null(field)) {
						err = DB_PRIMARY_KEY_IS_NULL;
						i = 0;
						goto err_exit;
					}

					field_type->prtype |= DATA_NOT_NULL;
				}
			}
		}

		/* Build all entries for all the indexes to be created
		in a single scan of the clustered index. */

		for (i = 0; i < n_index; i++) {
			row_merge_buf_t*	buf	= merge_buf[i];
			merge_file_t*		file	= &files[i];
			const dict_index_t*	index	= buf->index;

			if (UNIV_LIKELY
			    (row && row_merge_buf_add(buf, row, ext))) {
				continue;
			}

			/* The buffer must be sufficiently large
			to hold at least one record. */
			ut_ad(buf->n_tuples || !has_next);

			/* We have enough data tuples to form a block.
			Sort them and write to disk. */

			if (buf->n_tuples) {
				if (dict_index_is_unique(index)) {
					row_merge_dup_t	dup;
					dup.index = buf->index;
					dup.table = table;
					dup.n_dup = 0;

					row_merge_buf_sort(buf, &dup);

					if (dup.n_dup) {
						err = DB_DUPLICATE_KEY;
err_exit:
						trx->error_key_num = i;
						goto func_exit;
					}
				} else {
					row_merge_buf_sort(buf, NULL);
				}
			}

			row_merge_buf_write(buf, file, block);

			if (!row_merge_write(file->fd, file->offset++,
					     block)) {
				err = DB_OUT_OF_FILE_SPACE;
				goto err_exit;
			}

			UNIV_MEM_INVALID(block[0], sizeof block[0]);
			merge_buf[i] = row_merge_buf_empty(buf);

			/* Try writing the record again, now that
			the buffer has been written out and emptied. */

			if (UNIV_UNLIKELY
			    (row && !row_merge_buf_add(buf, row, ext))) {
				/* An empty buffer should have enough
				room for at least one record. */
				ut_error;
			}
		}

		mem_heap_empty(row_heap);

		if (UNIV_UNLIKELY(!has_next)) {
			goto func_exit;
		}
	}

func_exit:
	btr_pcur_close(&pcur);
	mtr_commit(&mtr);
	mem_heap_free(row_heap);

	if (UNIV_LIKELY_NULL(nonnull)) {
		mem_free(nonnull);
	}

	for (i = 0; i < n_index; i++) {
		row_merge_buf_free(merge_buf[i]);
	}

	mem_free(merge_buf);

	trx->op_info = "";

	return(err);
}
/** Write a record via buffer 2 and read the next record to buffer N.
@param N	number of the buffer (0 or 1)
@param AT_END	statement to execute at end of input */
#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
	do {								\
		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
					 of->fd, &of->offset,		\
					 mrec##N, offsets##N);		\
		if (UNIV_UNLIKELY(!b2)) {				\
			goto corrupt;					\
		}							\
		b##N = row_merge_read_rec(&block[N], &buf[N],		\
					  b##N, index,			\
					  file->fd, foffs##N,		\
					  &mrec##N, offsets##N);	\
		if (UNIV_UNLIKELY(!b##N)) {				\
			if (mrec##N) {					\
				goto corrupt;				\
			}						\
			AT_END;						\
		}							\
	} while (0)
/*************************************************************//**
Merge two blocks of linked lists on disk and write a bigger block.
@return DB_SUCCESS or error code */
static
ulint
row_merge_blocks(
/*=============*/
	const dict_index_t*	index,	/*!< in: index being created */
	merge_file_t*		file,	/*!< in/out: file containing
					index entries */
	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
	ulint*			foffs0,	/*!< in/out: offset of first
					source list in the file */
	ulint*			foffs1,	/*!< in/out: offset of second
					source list in the file */
	merge_file_t*		of,	/*!< in/out: output file */
	TABLE*			table)	/*!< in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */

	mrec_buf_t	buf[3];	/*!< buffer for handling split mrec in block[] */
	const byte*	b0;	/*!< pointer to block[0] */
	const byte*	b1;	/*!< pointer to block[1] */
	byte*		b2;	/*!< pointer to block[2] */
	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] or buf[0] */
	const mrec_t*	mrec1;	/*!< merge rec, points to block[1] or buf[1] */
	ulint*		offsets0;/* offsets of mrec0 */
	ulint*		offsets1;/* offsets of mrec1 */

	heap = row_merge_heap_create(index, &offsets0, &offsets1);

	/* Write a record and read the next record.  Split the output
	file in two halves, which can be merged on the following pass. */

	if (!row_merge_read(file->fd, *foffs0, &block[0])
	    || !row_merge_read(file->fd, *foffs1, &block[1])) {
corrupt:
		mem_heap_free(heap);
		return(DB_CORRUPTION);
	}

	b0 = block[0];
	b1 = block[1];
	b2 = block[2];

	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
				foffs0, &mrec0, offsets0);
	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
				foffs1, &mrec1, offsets1);
	if (UNIV_UNLIKELY(!b0 && mrec0)
	    || UNIV_UNLIKELY(!b1 && mrec1)) {

		goto corrupt;
	}

	while (mrec0 && mrec1) {
		switch (row_merge_cmp(mrec0, mrec1,
				      offsets0, offsets1, index)) {
		case 0:
			if (UNIV_UNLIKELY
			    (dict_index_is_unique(index))) {
				innobase_rec_to_mysql(table, mrec0,
						      index, offsets0);
				mem_heap_free(heap);
				return(DB_DUPLICATE_KEY);
			}
			/* fall through */
		case -1:
			ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
			break;
		case 1:
			ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
			break;
		default:
			ut_error;
		}
	}

merged:
	if (mrec0) {
		/* append all mrec0 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
		}
	}
done0:
	if (mrec1) {
		/* append all mrec1 to output */
		for (;;) {
			ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
		}
	}
done1:
	mem_heap_free(heap);
	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
}
/*************************************************************//**
Merge two halves of a merge file into an output file.
@return DB_SUCCESS or error code */
static
ulint
row_merge(
/*======*/
	const dict_index_t*	index,	/*!< in: index being created */
	merge_file_t*		file,	/*!< in/out: file containing
					index entries */
	ulint			half,	/*!< in: half the file */
	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
	int*			tmpfd,	/*!< in/out: temporary file handle */
	TABLE*			table)	/*!< in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	ulint		foffs0;	/*!< first input offset */
	ulint		foffs1;	/*!< second input offset */
	ulint		error;	/*!< error code */
	merge_file_t	of;	/*!< output file */

	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
	ut_ad(half > 0);

	of.fd = *tmpfd;
	of.offset = 0;

	/* Merge blocks to the output file. */
	foffs0 = 0;
	foffs1 = half;

	for (; foffs0 < half && foffs1 < file->offset; foffs0++, foffs1++) {
		error = row_merge_blocks(index, file, block,
					 &foffs0, &foffs1, &of, table);

		if (error != DB_SUCCESS) {
			return(error);
		}
	}

	/* Copy the last block, if there is one. */
	while (foffs0 < half) {
		if (!row_merge_read(file->fd, foffs0++, block)
		    || !row_merge_write(of.fd, of.offset++, block)) {
			return(DB_CORRUPTION);
		}
	}
	while (foffs1 < file->offset) {
		if (!row_merge_read(file->fd, foffs1++, block)
		    || !row_merge_write(of.fd, of.offset++, block)) {
			return(DB_CORRUPTION);
		}
	}

	/* Swap file descriptors for the next pass. */
	*tmpfd = file->fd;
	*file = of;

	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);

	return(DB_SUCCESS);
}
/*************************************************************//**
Merge disk files.
@return DB_SUCCESS or error code */
static
ulint
row_merge_sort(
/*===========*/
	const dict_index_t*	index,	/*!< in: index being created */
	merge_file_t*		file,	/*!< in/out: file containing
					index entries */
	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
	int*			tmpfd,	/*!< in/out: temporary file handle */
	TABLE*			table)	/*!< in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	ulint	blksz;	/*!< block size */

	for (blksz = 1; blksz < file->offset; blksz *= 2) {
		ulint	half;
		ulint	error;

		ut_ad(ut_is_2pow(blksz));
		half = ut_2pow_round((file->offset + (blksz - 1)) / 2, blksz);
		error = row_merge(index, file, half, block, tmpfd, table);

		if (error != DB_SUCCESS) {
			return(error);
		}
	}

	return(DB_SUCCESS);
}
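/* Editorial example: with file->offset == 5 blocks the loop runs with
blksz = 1, 2, 4.  Assuming ut_2pow_round(n, m) rounds n down to a
multiple of the power of two m, the split points are

	blksz = 1: half = ut_2pow_round(2, 1) = 2
	blksz = 2: half = ut_2pow_round(3, 2) = 2
	blksz = 4: half = ut_2pow_round(4, 4) = 4

so each pass merges sorted runs of blksz blocks from the two halves
until the file is a single sorted list. */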
/*************************************************************//**
Copy externally stored columns to the data tuple. */
static
void
row_merge_copy_blobs(
/*=================*/
	const mrec_t*	mrec,	/*!< in: merge record */
	const ulint*	offsets,/*!< in: offsets of mrec */
	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
	dtuple_t*	tuple,	/*!< in/out: data tuple */
	mem_heap_t*	heap)	/*!< in/out: memory heap */
{
	ulint	i;
	ulint	n_fields = dtuple_get_n_fields(tuple);

	for (i = 0; i < n_fields; i++) {
		ulint		len;
		const void*	data;
		dfield_t*	field = dtuple_get_nth_field(tuple, i);

		if (!dfield_is_ext(field)) {
			continue;
		}

		ut_ad(!dfield_is_null(field));

		/* The table is locked during index creation.
		Therefore, externally stored columns cannot possibly
		be freed between the time the BLOB pointers are read
		(row_merge_read_clustered_index()) and dereferenced
		(below). */
		data = btr_rec_copy_externally_stored_field(
			mrec, offsets, zip_size, i, &len, heap);

		dfield_set_data(field, data, len);
	}
}
/********************************************************************//**
Read sorted file containing index data tuples and insert these data
tuples to the index.
@return DB_SUCCESS or error number */
static
ulint
row_merge_insert_index_tuples(
/*==========================*/
	trx_t*			trx,	/*!< in: transaction */
	dict_index_t*		index,	/*!< in: index */
	dict_table_t*		table,	/*!< in: new table */
	ulint			zip_size,/*!< in: compressed page size of
					 the old table, or 0 if uncompressed */
	int			fd,	/*!< in: file descriptor */
	row_merge_block_t*	block)	/*!< in/out: file buffer */
{
	const byte*		b;
	que_thr_t*		thr;
	ins_node_t*		node;
	mem_heap_t*		tuple_heap;
	mem_heap_t*		graph_heap;
	ulint			error = DB_SUCCESS;
	ulint			foffs = 0;
	ulint*			offsets;
	mrec_buf_t		buf;

	/* We use the insert query graph as the dummy graph
	needed in the row module call */

	trx->op_info = "inserting index entries";

	graph_heap = mem_heap_create(500);
	node = ins_node_create(INS_DIRECT, table, graph_heap);

	thr = pars_complete_graph_for_exec(node, trx, graph_heap);

	que_thr_move_to_run_state_for_mysql(thr, trx);

	tuple_heap = mem_heap_create(1000);

	{
		ulint i	= 1 + REC_OFFS_HEADER_SIZE
			+ dict_index_get_n_fields(index);
		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
		offsets[0] = i;
		offsets[1] = dict_index_get_n_fields(index);
	}

	b = *block;

	if (!row_merge_read(fd, foffs, block)) {
		error = DB_CORRUPTION;
	} else {
		for (;;) {
			const mrec_t*	mrec;
			dtuple_t*	dtuple;
			ulint		n_ext;

			b = row_merge_read_rec(block, &buf, b, index,
					       fd, &foffs, &mrec, offsets);
			if (UNIV_UNLIKELY(!b)) {
				/* End of list, or I/O error */
				if (mrec) {
					error = DB_CORRUPTION;
				}
				break;
			}

			dtuple = row_rec_to_index_entry_low(
				mrec, index, offsets, &n_ext, tuple_heap);

			if (UNIV_UNLIKELY(n_ext)) {
				row_merge_copy_blobs(mrec, offsets, zip_size,
						     dtuple, tuple_heap);
			}

			node->row = dtuple;
			node->table = table;
			node->trx_id = trx->id;

			ut_ad(dtuple_validate(dtuple));

			do {
				thr->run_node = thr;
				thr->prev_node = thr->common.parent;

				error = row_ins_index_entry(index, dtuple,
							    0, FALSE, thr);

				if (UNIV_LIKELY(error == DB_SUCCESS)) {

					goto next_rec;
				}

				thr->lock_state = QUE_THR_LOCK_ROW;
				trx->error_state = error;
				que_thr_stop_for_mysql(thr);
				thr->lock_state = QUE_THR_LOCK_NOLOCK;
			} while (row_mysql_handle_errors(&error, trx,
							 thr, NULL));

			goto err_exit;
next_rec:
			mem_heap_empty(tuple_heap);
		}
	}

	que_thr_stop_for_mysql_no_error(thr, trx);
err_exit:
	que_graph_free(thr->graph);

	trx->op_info = "";

	mem_heap_free(tuple_heap);

	return(error);
}
/*********************************************************************//**
Sets an exclusive lock on a table, for the duration of creating indexes.
@return error code or DB_SUCCESS */
UNIV_INTERN
ulint
row_merge_lock_table(
/*=================*/
	trx_t*		trx,		/*!< in/out: transaction */
	dict_table_t*	table,		/*!< in: table to lock */
	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
{
	mem_heap_t*	heap;
	que_thr_t*	thr;
	ulint		err;
	sel_node_t*	node;

	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
	ut_ad(mode == LOCK_X || mode == LOCK_S);

	heap = mem_heap_create(512);

	trx->op_info = "setting table lock for creating or dropping index";

	node = sel_node_create(heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);
	thr->graph->state = QUE_FORK_ACTIVE;

	/* We use the select query graph as the dummy graph needed
	in the lock module call */

	thr = que_fork_get_first_thr(que_node_get_parent(thr));
	que_thr_move_to_run_state_for_mysql(thr, trx);

run_again:
	thr->run_node = thr;
	thr->prev_node = thr->common.parent;

	err = lock_table(0, table, mode, thr);

	trx->error_state = err;

	if (UNIV_LIKELY(err == DB_SUCCESS)) {
		que_thr_stop_for_mysql_no_error(thr, trx);
	} else {
		que_thr_stop_for_mysql(thr);

		if (err != DB_QUE_THR_SUSPENDED) {
			ibool	was_lock_wait;

			was_lock_wait = row_mysql_handle_errors(
				&err, trx, thr, NULL);

			if (was_lock_wait) {
				goto run_again;
			}
		} else {
			que_thr_t*	run_thr;
			que_node_t*	parent;

			parent = que_node_get_parent(thr);
			run_thr = que_fork_start_command(parent);

			ut_a(run_thr == thr);

			/* There was a lock wait but the thread was not
			in a ready to run or running state. */
			trx->error_state = DB_LOCK_WAIT;

			goto run_again;
		}
	}

	que_graph_free(thr->graph);
	trx->op_info = "";

	return(err);
}
/*********************************************************************//**
Drop an index from the InnoDB system tables.  The data dictionary must
have been locked exclusively by the caller, because the transaction
will not be committed. */
UNIV_INTERN
void
row_merge_drop_index(
/*=================*/
	dict_index_t*	index,	/*!< in: index to be removed */
	dict_table_t*	table,	/*!< in: table */
	trx_t*		trx)	/*!< in: transaction handle */
{
	ulint		err;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase.  Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */

	static const char str1[] =
		"PROCEDURE DROP_INDEX_PROC () IS\n"
		"BEGIN\n"
		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
		"DELETE FROM SYS_INDEXES WHERE ID = :indexid\n"
		"		AND TABLE_ID = :tableid;\n"
		"END;\n";

	ut_ad(index && table && trx);

	pars_info_add_dulint_literal(info, "indexid", index->id);
	pars_info_add_dulint_literal(info, "tableid", table->id);

	trx_start_if_not_started(trx);
	trx->op_info = "dropping index";

	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	err = que_eval_sql(info, str1, FALSE, trx);

	ut_a(err == DB_SUCCESS);

	/* Replace this index with another equivalent index for all
	foreign key constraints on this table where this index is used */

	dict_table_replace_index_in_foreign_list(table, index);
	dict_index_remove_from_cache(table, index);

	trx->op_info = "";
}
/*********************************************************************//**
Drop those indexes which were created before an error occurred when
building an index.  The data dictionary must have been locked
exclusively by the caller, because the transaction will not be
committed. */
UNIV_INTERN
void
row_merge_drop_indexes(
/*===================*/
	trx_t*		trx,		/*!< in: transaction */
	dict_table_t*	table,		/*!< in: table containing the indexes */
	dict_index_t**	index,		/*!< in: indexes to drop */
	ulint		num_created)	/*!< in: number of elements in index[] */
{
	ulint	key_num;

	for (key_num = 0; key_num < num_created; key_num++) {
		row_merge_drop_index(index[key_num], table, trx);
	}
}
/*********************************************************************//**
Drop all partially created indexes during crash recovery. */
UNIV_INTERN
void
row_merge_drop_temp_indexes(void)
/*=============================*/
{
	trx_t*		trx;
	pars_info_t*	info;
	ulint		err;

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in deleting the dictionary data from system
	tables in Innobase.  Deleting a row from SYS_INDEXES table also
	frees the file segments of the B-tree associated with the index. */
	static const char drop_temp_indexes[] =
		"PROCEDURE DROP_TEMP_INDEXES_PROC () IS\n"
		"indexid CHAR;\n"
		"DECLARE CURSOR c IS SELECT ID FROM SYS_INDEXES\n"
		"WHERE SUBSTR(NAME,0,1)='" TEMP_INDEX_PREFIX_STR "';\n"
		"BEGIN\n"
		"\tOPEN c;\n"
		"\tWHILE 1=1 LOOP\n"
		"\t\tFETCH c INTO indexid;\n"
		"\t\tIF (SQL % NOTFOUND) THEN\n"
		"\t\t\tEXIT;\n"
		"\t\tEND IF;\n"
		"\t\tDELETE FROM SYS_FIELDS WHERE INDEX_ID = indexid;\n"
		"\t\tDELETE FROM SYS_INDEXES WHERE ID = indexid;\n"
		"\tEND LOOP;\n"
		"\tCLOSE c;\n"
		"END;\n";

	trx = trx_allocate_for_background();
	trx->op_info = "dropping partially created indexes";
	row_mysql_lock_data_dictionary(trx);

	/* Incomplete transactions may be holding some locks on the
	data dictionary tables.  However, they should never have been
	able to lock the records corresponding to the partially
	created indexes that we are attempting to delete, because the
	table was locked when the indexes were being created.  We will
	drop the partially created indexes before the rollback of
	incomplete transactions is initiated.  Thus, this should not
	interfere with the incomplete transactions. */
	trx->isolation_level = TRX_ISO_READ_UNCOMMITTED;
	info = pars_info_create();
	err = que_eval_sql(info, drop_temp_indexes, FALSE, trx);
	ut_a(err == DB_SUCCESS);

	row_mysql_unlock_data_dictionary(trx);
	trx_free_for_background(trx);
}
/*********************************************************************//**
Create a merge file. */
static
void
row_merge_file_create(
/*==================*/
	merge_file_t*	merge_file)	/*!< out: merge file structure */
{
	merge_file->fd = innobase_mysql_tmpfile();
	merge_file->offset = 0;
}

/*********************************************************************//**
Destroy a merge file. */
static
void
row_merge_file_destroy(
/*===================*/
	merge_file_t*	merge_file)	/*!< out: merge file structure */
{
	if (merge_file->fd != -1) {
		close(merge_file->fd);
		merge_file->fd = -1;
	}
}
/*********************************************************************//**
Determine the precise type of a column that is added to a temporary
table, i.e., whether the column must be constrained NOT NULL.
@return col->prtype, possibly ORed with DATA_NOT_NULL */
static
ulint
row_merge_col_prtype(
/*=================*/
	const dict_col_t*	col,		/*!< in: column */
	const char*		col_name,	/*!< in: name of the column */
	const merge_index_def_t*index_def)	/*!< in: the index definition
						of the primary key */
{
	ulint	prtype = col->prtype;
	ulint	i;

	ut_ad(index_def->ind_type & DICT_CLUSTERED);

	if (prtype & DATA_NOT_NULL) {

		return(prtype);
	}

	/* All columns that are included
	in the PRIMARY KEY must be NOT NULL. */

	for (i = 0; i < index_def->n_fields; i++) {
		if (!strcmp(col_name, index_def->fields[i].field_name)) {

			return(prtype | DATA_NOT_NULL);
		}
	}

	return(prtype);
}
/*********************************************************************//**
Create a temporary table for creating a primary key, using the definition
of an existing table.
@return table, or NULL on error */
UNIV_INTERN
dict_table_t*
row_merge_create_temporary_table(
/*=============================*/
	const char*		table_name,	/*!< in: new table name */
	const merge_index_def_t*index_def,	/*!< in: the index definition
						of the primary key */
	const dict_table_t*	table,		/*!< in: old table definition */
	trx_t*			trx)		/*!< in/out: transaction
						(sets error_state) */
{
	ulint		i;
	dict_table_t*	new_table = NULL;
	ulint		n_cols = dict_table_get_n_user_cols(table);
	ulint		error;
	mem_heap_t*	heap = mem_heap_create(1000);

	ut_ad(table_name);
	ut_ad(index_def);
	ut_ad(mutex_own(&dict_sys->mutex));

	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);

	for (i = 0; i < n_cols; i++) {
		const dict_col_t*	col;
		const char*		col_name;

		col = dict_table_get_nth_col(table, i);
		col_name = dict_table_get_col_name(table, i);

		dict_mem_table_add_col(new_table, heap, col_name, col->mtype,
				       row_merge_col_prtype(col, col_name,
							    index_def),
				       col->len);
	}

	error = row_create_table_for_mysql(new_table, trx);
	mem_heap_free(heap);

	if (error != DB_SUCCESS) {
		trx->error_state = error;
		new_table = NULL;
	}

	return(new_table);
}
/*********************************************************************//**
Rename the temporary indexes in the dictionary to permanent ones.  The
data dictionary must have been locked exclusively by the caller,
because the transaction will not be committed.
@return DB_SUCCESS if all OK */
UNIV_INTERN
ulint
row_merge_rename_indexes(
/*=====================*/
	trx_t*		trx,		/*!< in/out: transaction */
	dict_table_t*	table)		/*!< in/out: table with new indexes */
{
	ulint		err = DB_SUCCESS;
	pars_info_t*	info = pars_info_create();

	/* We use the private SQL parser of Innobase to generate the
	query graphs needed in renaming indexes. */

	static const char rename_indexes[] =
		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
		"BEGIN\n"
		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
		"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
		TEMP_INDEX_PREFIX_STR "';\n"
		"END;\n";

	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	trx->op_info = "renaming indexes";

	pars_info_add_dulint_literal(info, "tableid", table->id);

	err = que_eval_sql(info, rename_indexes, FALSE, trx);

	if (err == DB_SUCCESS) {
		dict_index_t*	index = dict_table_get_first_index(table);
		do {
			if (*index->name == TEMP_INDEX_PREFIX) {
				index->name++;
			}
			index = dict_table_get_next_index(index);
		} while (index);
	}

	trx->op_info = "";

	return(err);
}
/*********************************************************************//**
Rename the tables in the data dictionary.  The data dictionary must
have been locked exclusively by the caller, because the transaction
will not be committed.
@return error code or DB_SUCCESS */
UNIV_INTERN
ulint
row_merge_rename_tables(
/*====================*/
	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
					tmp_name */
	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
					old_table->name */
	const char*	tmp_name,	/*!< in: new name for old_table */
	trx_t*		trx)		/*!< in: transaction handle */
{
	ulint		err	= DB_ERROR;
	pars_info_t*	info;
	const char*	old_name= old_table->name;

	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
	ut_ad(old_table != new_table);
	ut_ad(mutex_own(&dict_sys->mutex));

	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);

	trx->op_info = "renaming tables";

	/* We use the private SQL parser of Innobase to generate the query
	graphs needed in updating the dictionary data in system tables. */

	info = pars_info_create();

	pars_info_add_str_literal(info, "new_name", new_table->name);
	pars_info_add_str_literal(info, "old_name", old_name);
	pars_info_add_str_literal(info, "tmp_name", tmp_name);

	err = que_eval_sql(info,
			   "PROCEDURE RENAME_TABLES () IS\n"
			   "BEGIN\n"
			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
			   " WHERE NAME = :old_name;\n"
			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
			   " WHERE NAME = :new_name;\n"
			   "END;\n", FALSE, trx);

	if (err != DB_SUCCESS) {

		goto err_exit;
	}

	/* The following calls will also rename the .ibd data files if
	the tables are stored in a single-table tablespace */

	if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
	    || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {

		err = DB_ERROR;
		goto err_exit;
	}

	err = dict_load_foreigns(old_name, TRUE);

	if (err != DB_SUCCESS) {
err_exit:
		trx->error_state = DB_SUCCESS;
		trx_general_rollback_for_mysql(trx, FALSE, NULL);
		trx->error_state = DB_SUCCESS;
	}

	trx->op_info = "";

	return(err);
}
/*********************************************************************//**
Create and execute a query graph for creating an index.
@return DB_SUCCESS or error code */
static
ulint
row_merge_create_index_graph(
/*=========================*/
	trx_t*		trx,		/*!< in: trx */
	dict_table_t*	table,		/*!< in: table */
	dict_index_t*	index)		/*!< in: index */
{
	ind_node_t*	node;		/*!< Index creation node */
	mem_heap_t*	heap;		/*!< Memory heap */
	que_thr_t*	thr;		/*!< Query thread */
	ulint		err;

	heap = mem_heap_create(512);

	index->table = table;
	node = ind_create_graph_create(index, heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);

	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));

	que_run_threads(thr);

	err = trx->error_state;

	que_graph_free((que_t*) que_node_get_parent(thr));

	return(err);
}
/*********************************************************************//**
Create the index and load in to the dictionary.
@return index, or NULL on error */
UNIV_INTERN
dict_index_t*
row_merge_create_index(
/*===================*/
	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
	dict_table_t*		table,	/*!< in: the index is on this table */
	const merge_index_def_t*index_def)
					/*!< in: the index definition */
{
	dict_index_t*	index;
	ulint		err;
	ulint		n_fields = index_def->n_fields;
	ulint		i;

	/* Create the index prototype, using the passed in def, this is not
	a persistent operation. We pass 0 as the space id, and determine at
	a lower level the space id where to store the table. */

	index = dict_mem_index_create(table->name, index_def->name,
				      0, index_def->ind_type, n_fields);

	ut_a(index);

	for (i = 0; i < n_fields; i++) {
		merge_index_field_t*	ifield = &index_def->fields[i];

		dict_mem_index_add_field(index, ifield->field_name,
					 ifield->prefix_len);
	}

	/* Add the index to SYS_INDEXES, using the index prototype. */
	err = row_merge_create_index_graph(trx, table, index);

	if (err == DB_SUCCESS) {

		index = row_merge_dict_table_get_index(
			table, index_def);

		ut_a(index);

		/* Note the id of the transaction that created this
		index, we use it to restrict readers from accessing
		this index, to ensure read consistency. */
		index->trx_id = (ib_uint64_t)
			ut_conv_dulint_to_longlong(trx->id);
	} else {
		index = NULL;
	}

	return(index);
}
/*********************************************************************//**
Check if a transaction can use an index.
@return TRUE if the index can be used by the transaction, else FALSE */
UNIV_INTERN
ibool
row_merge_is_index_usable(
/*======================*/
	const trx_t*		trx,	/*!< in: transaction */
	const dict_index_t*	index)	/*!< in: index to check */
{
	return(!trx->read_view || read_view_sees_trx_id(
		       trx->read_view,
		       ut_dulint_create((ulint) (index->trx_id >> 32),
					(ulint) index->trx_id & 0xFFFFFFFF)));
}
/*********************************************************************//**
Drop the old table.
@return DB_SUCCESS or error code */
UNIV_INTERN
ulint
row_merge_drop_table(
/*=================*/
	trx_t*		trx,		/*!< in: transaction */
	dict_table_t*	table)		/*!< in: table to drop */
{
	/* There must be no open transactions on the table. */
	ut_a(table->n_mysql_handles_opened == 0);

	return(row_drop_table_for_mysql(table->name, trx, FALSE));
}
/*********************************************************************//**
Build indexes on a table by reading a clustered index,
creating a temporary file containing index entries, merge sorting
these index entries and inserting sorted index entries to indexes.
@return DB_SUCCESS or error code */
UNIV_INTERN
ulint
row_merge_build_indexes(
/*====================*/
	trx_t*		trx,		/*!< in: transaction */
	dict_table_t*	old_table,	/*!< in: table where rows are
					read from */
	dict_table_t*	new_table,	/*!< in: table where indexes are
					created; identical to old_table
					unless creating a PRIMARY KEY */
	dict_index_t**	indexes,	/*!< in: indexes to be created */
	ulint		n_indexes,	/*!< in: size of indexes[] */
	TABLE*		table)		/*!< in/out: MySQL table, for
					reporting erroneous key value
					if applicable */
{
	merge_file_t*		merge_files;
	row_merge_block_t*	block;
	ulint			block_size;
	ulint			i;
	ulint			error;
	int			tmpfd;

	trx_start_if_not_started(trx);

	/* Allocate memory for merge file data structure and initialize
	fields */

	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
	block_size = 3 * sizeof *block;
	block = os_mem_alloc_large(&block_size);

	for (i = 0; i < n_indexes; i++) {

		row_merge_file_create(&merge_files[i]);
	}

	tmpfd = innobase_mysql_tmpfile();

	/* Reset the MySQL row buffer that is used when reporting
	duplicate keys. */
	innobase_rec_reset(table);

	/* Read clustered index of the table and create files for
	secondary index entries for merge sort */

	error = row_merge_read_clustered_index(
		trx, table, old_table, new_table, indexes,
		merge_files, n_indexes, block);

	if (error != DB_SUCCESS) {

		goto func_exit;
	}

	/* Now we have files containing index entries ready for
	sorting and inserting. */

	for (i = 0; i < n_indexes; i++) {
		error = row_merge_sort(indexes[i], &merge_files[i],
				       block, &tmpfd, table);

		if (error == DB_SUCCESS) {
			error = row_merge_insert_index_tuples(
				trx, indexes[i], new_table,
				dict_table_zip_size(old_table),
				merge_files[i].fd, block);
		}

		/* Close the temporary file to free up space. */
		row_merge_file_destroy(&merge_files[i]);

		if (error != DB_SUCCESS) {
			trx->error_key_num = i;
			goto func_exit;
		}
	}

func_exit:
	close(tmpfd);

	for (i = 0; i < n_indexes; i++) {
		row_merge_file_destroy(&merge_files[i]);
	}

	mem_free(merge_files);
	os_mem_free_large(block, block_size);

	return(error);
}
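/* Editorial sketch of the overall call sequence, roughly as driven by
the MySQL ALTER TABLE fast index creation path (see handler0alter.cc);
the local names here are editorial, not the exact caller code:

	row_merge_lock_table(trx, old_table, LOCK_X);
	index[i] = row_merge_create_index(trx, new_table, &index_def[i]);
	error = row_merge_build_indexes(trx, old_table, new_table,
					index, n_indexes, table);
	if (error != DB_SUCCESS) {
		row_merge_drop_indexes(trx, new_table, index, n_indexes);
	}
*/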