1
/*****************************************************************************
3
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15
St, Fifth Floor, Boston, MA 02110-1301 USA
17
*****************************************************************************/
19
/**************************************************//**
23
Created 3/26/1996 Heikki Tuuri
24
*******************************************************/
32
#ifndef UNIV_HOTBACKUP
40
#include "trx0purge.h"
44
#include "read0read.h"
46
/** The file format tag structure with id and name. */
47
struct file_format_struct {
48
ulint id; /*!< id of the file format */
49
const char* name; /*!< text representation of the
51
mutex_t mutex; /*!< covers changes to the above
55
/** The file format tag */
56
typedef struct file_format_struct file_format_t;
58
/** The transaction system */
59
UNIV_INTERN trx_sys_t* trx_sys = NULL;
60
/** The doublewrite buffer */
61
UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL;
63
/** The following is set to TRUE when we are upgrading from pre-4.1
64
format data files to the multiple tablespaces format data files */
65
UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE;
66
/** Set to TRUE when the doublewrite buffer is being created */
67
UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE;
69
/** The following is TRUE when we are using the database in the
70
post-4.1 format, i.e., we have successfully upgraded, or have created
71
a new database installation */
72
UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE;
74
/** In a MySQL replication slave, in crash recovery we store the master log
75
file name and position here. */
77
/** Master binlog file name */
78
UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
79
/** Master binlog file position. We have successfully got the updates
80
up to this position. -1 means that no crash recovery was needed, or
81
there was no master log position info inside InnoDB.*/
82
UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1;
85
/** If this MySQL server uses binary logging, after InnoDB has been inited
86
and if it has done a crash recovery, we store the binlog file name and position
89
/** Binlog file name */
90
UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
91
/** Binlog file position, or -1 if unknown */
92
UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1;
94
#endif /* !UNIV_HOTBACKUP */
96
/** List of animal names representing file format. */
97
static const char* file_format_name_map[] = {
126
/** The number of elements in the file format name array. */
127
static const ulint FILE_FORMAT_NAME_N
128
= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
130
#ifdef UNIV_PFS_MUTEX
131
/* Key to register the mutex with performance schema */
132
UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key;
133
UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key;
134
#endif /* UNIV_PFS_MUTEX */
136
#ifndef UNIV_HOTBACKUP
137
/** This is used to track the maximum file format id known to InnoDB. It's
138
updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
139
or create a table. */
140
static file_format_t file_format_max;
142
/****************************************************************//**
Determines if a page number is located inside the doublewrite buffer.
The buffer is described by two blocks; each block is tested as a range of
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE page numbers starting at
trx_doublewrite->block1 and trx_doublewrite->block2 respectively.
@return TRUE if the location is inside the two blocks of the
doublewrite buffer */
148
trx_doublewrite_page_inside(
149
/*========================*/
150
ulint page_no) /*!< in: page number */
152
/* The global descriptor is tested for NULL before either block
start is read from it. */
if (trx_doublewrite == NULL) {
157
/* First block: block1 <= page_no < block1 + block size. */
if (page_no >= trx_doublewrite->block1
158
&& page_no < trx_doublewrite->block1
159
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
163
/* Second block: the same half-open range test against block2. */
if (page_no >= trx_doublewrite->block2
164
&& page_no < trx_doublewrite->block2
165
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
172
/****************************************************************//**
Creates or initializes the doublewrite buffer at a database start.
Allocates the global trx_doublewrite descriptor, creates its mutex, reads
the start page numbers of the two blocks from the header passed in, and
allocates the page-aligned write buffer and the block pointer array. */
176
trx_doublewrite_init(
177
/*=================*/
178
byte* doublewrite) /*!< in: pointer to the doublewrite buf
header on trx sys page */
181
/* The descriptor is stored in the file-level global trx_doublewrite. */
trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
183
/* Since we now start to use the doublewrite buffer, no need to call
fsync() after every write to a data file */
186
os_do_not_call_flush_at_each_write = TRUE;
187
#endif /* UNIV_DO_FLUSH */
189
mutex_create(trx_doublewrite_mutex_key,
190
&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
192
/* The buffer starts out with no used slots. */
trx_doublewrite->first_free = 0;
194
/* The start page numbers of the two blocks are stored as 4-byte
fields in the header. */
trx_doublewrite->block1 = mach_read_from_4(
195
doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
196
trx_doublewrite->block2 = mach_read_from_4(
197
doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
198
/* Room for both blocks plus one extra page; the extra page leaves
space for the UNIV_PAGE_SIZE alignment applied just below. The
unaligned pointer is kept in its own field. */
trx_doublewrite->write_buf_unaligned = ut_malloc(
199
(1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
201
trx_doublewrite->write_buf = ut_align(
202
trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
203
/* One pointer-sized slot per page of the two blocks. */
trx_doublewrite->buf_block_arr = mem_alloc(
204
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
207
/****************************************************************//**
208
Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
209
multiple tablespace format. */
212
trx_sys_mark_upgraded_to_multiple_tablespaces(void)
213
/*===============================================*/
219
/* We upgraded to 4.1.x and reset the space id fields in the
220
doublewrite buffer. Let us mark to the trx_sys header that the upgrade
225
block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
227
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
229
doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
231
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
232
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
236
/* Flush the modified pages to disk and make a checkpoint */
237
log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
239
trx_sys_multiple_tablespace_format = TRUE;
242
/****************************************************************//**
243
Creates the doublewrite buffer for a new InnoDB installation. The header of the
244
doublewrite buffer is placed on the trx system header page. */
247
trx_sys_create_doublewrite_buf(void)
248
/*================================*/
252
#ifdef UNIV_SYNC_DEBUG
253
buf_block_t* new_block;
254
#endif /* UNIV_SYNC_DEBUG */
262
if (trx_doublewrite) {
270
trx_doublewrite_buf_is_being_created = TRUE;
272
block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
274
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
276
doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
278
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
279
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
280
/* The doublewrite buffer has already been created:
281
just read in some numbers */
283
trx_doublewrite_init(doublewrite);
286
trx_doublewrite_buf_is_being_created = FALSE;
289
"InnoDB: Doublewrite buffer not found:"
292
if (buf_pool_get_curr_size()
293
< ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
294
+ FSP_EXTENT_SIZE / 2 + 100)
297
"InnoDB: Cannot create doublewrite buffer:"
299
"InnoDB: increase your buffer pool size.\n"
300
"InnoDB: Cannot continue operation.\n");
305
block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
307
+ TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
309
/* fseg_create acquires a second latch on the page,
310
therefore we must declare it: */
312
buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
314
if (block2 == NULL) {
316
"InnoDB: Cannot create doublewrite buffer:"
318
"InnoDB: increase your tablespace size.\n"
319
"InnoDB: Cannot continue operation.\n");
321
/* We exit without committing the mtr to prevent
322
its modifications to the database getting to disk */
327
fseg_header = buf_block_get_frame(block)
328
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
331
for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
332
+ FSP_EXTENT_SIZE / 2; i++) {
333
page_no = fseg_alloc_free_page(fseg_header,
336
if (page_no == FIL_NULL) {
338
"InnoDB: Cannot create doublewrite"
339
" buffer: you must\n"
340
"InnoDB: increase your"
341
" tablespace size.\n"
342
"InnoDB: Cannot continue operation.\n"
348
/* We read the allocated pages to the buffer pool;
349
when they are written to disk in a flush, the space
350
id and page number fields are also written to the
351
pages. When we at database startup read pages
352
from the doublewrite buffer, we know that if the
353
space id and page number in them are the same as
354
the page position in the tablespace, then the page
355
has not been written to in doublewrite. */
357
#ifdef UNIV_SYNC_DEBUG
359
#endif /* UNIV_SYNC_DEBUG */
360
buf_page_get(TRX_SYS_SPACE, 0, page_no,
362
buf_block_dbg_add_level(new_block,
363
SYNC_NO_ORDER_CHECK);
365
if (i == FSP_EXTENT_SIZE / 2) {
366
ut_a(page_no == FSP_EXTENT_SIZE);
367
mlog_write_ulint(doublewrite
368
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
369
page_no, MLOG_4BYTES, &mtr);
370
mlog_write_ulint(doublewrite
371
+ TRX_SYS_DOUBLEWRITE_REPEAT
372
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
373
page_no, MLOG_4BYTES, &mtr);
374
} else if (i == FSP_EXTENT_SIZE / 2
375
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
376
ut_a(page_no == 2 * FSP_EXTENT_SIZE);
377
mlog_write_ulint(doublewrite
378
+ TRX_SYS_DOUBLEWRITE_BLOCK2,
379
page_no, MLOG_4BYTES, &mtr);
380
mlog_write_ulint(doublewrite
381
+ TRX_SYS_DOUBLEWRITE_REPEAT
382
+ TRX_SYS_DOUBLEWRITE_BLOCK2,
383
page_no, MLOG_4BYTES, &mtr);
384
} else if (i > FSP_EXTENT_SIZE / 2) {
385
ut_a(page_no == prev_page_no + 1);
388
prev_page_no = page_no;
391
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
392
TRX_SYS_DOUBLEWRITE_MAGIC_N,
394
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
395
+ TRX_SYS_DOUBLEWRITE_REPEAT,
396
TRX_SYS_DOUBLEWRITE_MAGIC_N,
399
mlog_write_ulint(doublewrite
400
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
401
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
405
/* Flush the modified pages to disk and make a checkpoint */
406
log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
408
fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
410
trx_sys_multiple_tablespace_format = TRUE;
416
/****************************************************************//**
417
At a database startup initializes the doublewrite buffer memory structure if
418
we already have a doublewrite buffer created in the data files. If we are
419
upgrading to an InnoDB version which supports multiple tablespaces, then this
420
function performs the necessary update operations. If we are in a crash
421
recovery, this function uses a possible doublewrite buffer to restore
422
half-written pages in the data files. */
425
trx_sys_doublewrite_init_or_restore_pages(
426
/*======================================*/
427
ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */
431
byte* unaligned_read_buf;
434
ulint source_page_no;
441
/* We do the file i/o past the buffer pool */
443
unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
444
read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
446
/* Read the trx sys header to check if we are using the doublewrite
449
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
450
UNIV_PAGE_SIZE, read_buf, NULL);
451
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
453
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
454
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
455
/* The doublewrite buffer has been created */
457
trx_doublewrite_init(doublewrite);
459
block1 = trx_doublewrite->block1;
460
block2 = trx_doublewrite->block2;
462
buf = trx_doublewrite->write_buf;
467
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
468
!= TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
470
/* We are upgrading from a version < 4.1.x to a version where
471
multiple tablespaces are supported. We must reset the space id
472
field in the pages in the doublewrite buffer because starting
473
from this version the space id is stored to
474
FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
476
trx_doublewrite_must_reset_space_ids = TRUE;
479
"InnoDB: Resetting space id's in the"
480
" doublewrite buffer\n");
482
trx_sys_multiple_tablespace_format = TRUE;
485
/* Read the pages from the doublewrite buffer to memory */
487
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
488
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
490
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
491
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
492
buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
494
/* Check if any of these pages is half-written in data files, in the
499
for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
501
page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
503
if (trx_doublewrite_must_reset_space_ids) {
507
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
508
/* We do not need to calculate new checksums for the
509
pages because the field .._SPACE_ID does not affect
510
them. Write the page back to where we read it from. */
512
if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
513
source_page_no = block1 + i;
515
source_page_no = block2
516
+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
519
fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
520
UNIV_PAGE_SIZE, page, NULL);
521
/* printf("Resetting space id in page %lu\n",
524
space_id = mach_read_from_4(
525
page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
528
if (!restore_corrupt_pages) {
529
/* The database was shut down gracefully: no need to
532
} else if (!fil_tablespace_exists_in_mem(space_id)) {
533
/* Maybe we have dropped the single-table tablespace
534
and this page once belonged to it: do nothing */
536
} else if (!fil_check_adress_in_tablespace(space_id,
539
"InnoDB: Warning: a page in the"
540
" doublewrite buffer is not within space\n"
541
"InnoDB: bounds; space id %lu"
542
" page number %lu, page %lu in"
543
" doublewrite buf.\n",
544
(ulong) space_id, (ulong) page_no, (ulong) i);
546
} else if (space_id == TRX_SYS_SPACE
547
&& ((page_no >= block1
549
< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
550
|| (page_no >= block2
553
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
555
/* It is an unwritten doublewrite buffer page:
558
ulint zip_size = fil_space_get_zip_size(space_id);
560
/* Read in the actual page from the file */
561
fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
563
zip_size ? zip_size : UNIV_PAGE_SIZE,
566
/* Check if the page is corrupt */
569
(buf_page_is_corrupted(read_buf, zip_size))) {
572
"InnoDB: Warning: database page"
573
" corruption or a failed\n"
574
"InnoDB: file read of"
575
" space %lu page %lu.\n"
576
"InnoDB: Trying to recover it from"
577
" the doublewrite buffer.\n",
578
(ulong) space_id, (ulong) page_no);
580
if (buf_page_is_corrupted(page, zip_size)) {
582
"InnoDB: Dump of the page:\n");
583
buf_page_print(read_buf, zip_size);
586
" corresponding page"
587
" in doublewrite buffer:\n");
588
buf_page_print(page, zip_size);
591
"InnoDB: Also the page in the"
592
" doublewrite buffer"
594
"InnoDB: Cannot continue"
596
"InnoDB: You can try to"
597
" recover the database"
601
" innodb_force_recovery=6\n");
605
/* Write the good page from the
606
doublewrite buffer to the intended
609
fil_io(OS_FILE_WRITE, TRUE, space_id,
610
zip_size, page_no, 0,
611
zip_size ? zip_size : UNIV_PAGE_SIZE,
614
"InnoDB: Recovered the page from"
615
" the doublewrite buffer.\n");
619
page += UNIV_PAGE_SIZE;
622
fil_flush_file_spaces(FIL_TABLESPACE);
625
ut_free(unaligned_read_buf);
628
/****************************************************************//**
629
Checks that trx is in the trx list.
630
@return TRUE if in_trx is in the list */
635
trx_t* in_trx) /*!< in: trx */
639
ut_ad(mutex_own(&(kernel_mutex)));
641
trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
643
while (trx != NULL) {
650
trx = UT_LIST_GET_NEXT(trx_list, trx);
656
/*****************************************************************//**
Writes the value of max_trx_id to the file based trx system header.
The caller must hold kernel_mutex (checked with a debug assertion). */
660
trx_sys_flush_max_trx_id(void)
661
/*==========================*/
663
trx_sysf_t* sys_header;
666
ut_ad(mutex_own(&kernel_mutex));
670
sys_header = trx_sysf_get(&mtr);
672
/* Logged write of the in-memory counter trx_sys->max_trx_id into
the TRX_SYS_TRX_ID_STORE field of the header. */
mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE,
673
trx_sys->max_trx_id, &mtr);
677
/*****************************************************************//**
Updates the offset information about the end of the MySQL binlog entry
which corresponds to the transaction just being committed. In a MySQL
replication slave updates the latest master binlog position up to which
replication has proceeded.
The log info field at offset `field` of the trx sys header holds a magic
number, the file name, and the offset split into a high and a low 32-bit
word; each part is written through the mini-transaction log. */
684
trx_sys_update_mysql_binlog_offset(
685
/*===============================*/
686
const char* file_name,/*!< in: MySQL log file name */
687
ib_int64_t offset, /*!< in: position in that log file */
688
ulint field, /*!< in: offset of the MySQL log info field in
the trx sys header */
690
mtr_t* mtr) /*!< in: mtr */
692
trx_sysf_t* sys_header;
694
/* The name is stored together with its terminating NUL (see the
1 + ut_strlen() length below), hence the >= comparison. */
if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
696
/* We cannot fit the name to the 512 bytes we have reserved */
701
sys_header = trx_sysf_get(mtr);
703
/* The magic number is written only when the stored value differs
from TRX_SYS_MYSQL_LOG_MAGIC_N. */
if (mach_read_from_4(sys_header + field
704
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
705
!= TRX_SYS_MYSQL_LOG_MAGIC_N) {
707
mlog_write_ulint(sys_header + field
708
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
709
TRX_SYS_MYSQL_LOG_MAGIC_N,
713
/* The name is rewritten only when it differs from the stored one. */
if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
716
mlog_write_string(sys_header + field
717
+ TRX_SYS_MYSQL_LOG_NAME,
718
(byte*) file_name, 1 + ut_strlen(file_name),
722
/* The high word is written only if the stored high word or the
high 32 bits of the new offset are non-zero. */
if (mach_read_from_4(sys_header + field
723
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
724
|| (offset >> 32) > 0) {
726
mlog_write_ulint(sys_header + field
727
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
728
(ulint)(offset >> 32),
732
/* Low 32 bits of the offset. */
mlog_write_ulint(sys_header + field
733
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW,
734
(ulint)(offset & 0xFFFFFFFFUL),
738
/*****************************************************************//**
Reads the MySQL binlog offset info from the trx system header if the
magic number shows it valid, stores it in the globals
trx_sys_mysql_bin_log_pos and trx_sys_mysql_bin_log_name, and prints the
info to stderr. */
743
trx_sys_print_mysql_binlog_offset(void)
744
/*===================================*/
746
trx_sysf_t* sys_header;
748
ulint trx_sys_mysql_bin_log_pos_high;
749
ulint trx_sys_mysql_bin_log_pos_low;
753
sys_header = trx_sysf_get(&mtr);
755
/* Validity check: the magic number field of the binlog info must
equal TRX_SYS_MYSQL_LOG_MAGIC_N. */
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
756
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
757
!= TRX_SYS_MYSQL_LOG_MAGIC_N) {
764
/* The position is stored as two 4-byte words. */
trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
765
sys_header + TRX_SYS_MYSQL_LOG_INFO
766
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
767
trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
768
sys_header + TRX_SYS_MYSQL_LOG_INFO
769
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
771
/* Combine the two words into the 64-bit global position. */
trx_sys_mysql_bin_log_pos
772
= (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
773
+ (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
775
/* The whole fixed-size name field is copied into the global name
buffer, which has the same size. */
ut_memcpy(trx_sys_mysql_bin_log_name,
776
sys_header + TRX_SYS_MYSQL_LOG_INFO
777
+ TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
780
/* NOTE(review): the two ulint words are passed to %lu without the
(ulong) cast that trx_sys_print_mysql_master_log_pos() applies;
confirm ulint is unsigned long on every supported target. */
"InnoDB: Last MySQL binlog file position %lu %lu,"
782
trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
783
trx_sys_mysql_bin_log_name);
788
/*****************************************************************//**
Prints to stderr the MySQL master log offset info in the trx system header if
the magic number shows it valid. The file name and the position are also
copied to the globals trx_sys_mysql_master_log_name and
trx_sys_mysql_master_log_pos. */
793
trx_sys_print_mysql_master_log_pos(void)
794
/*====================================*/
796
trx_sysf_t* sys_header;
801
sys_header = trx_sysf_get(&mtr);
803
/* Validity check: the magic number field of the master log info
must equal TRX_SYS_MYSQL_LOG_MAGIC_N. */
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
804
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
805
!= TRX_SYS_MYSQL_LOG_MAGIC_N) {
813
"InnoDB: In a MySQL replication slave the last"
814
" master binlog file\n"
815
"InnoDB: position %lu %lu, file name %s\n",
816
(ulong) mach_read_from_4(sys_header
817
+ TRX_SYS_MYSQL_MASTER_LOG_INFO
818
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
819
(ulong) mach_read_from_4(sys_header
820
+ TRX_SYS_MYSQL_MASTER_LOG_INFO
821
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
822
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
823
+ TRX_SYS_MYSQL_LOG_NAME);
824
/* Copy the master log position info to global variables we can
use in ha_innobase.cc to initialize glob_mi to right values */
827
ut_memcpy(trx_sys_mysql_master_log_name,
828
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
829
+ TRX_SYS_MYSQL_LOG_NAME,
830
TRX_SYS_MYSQL_LOG_NAME_LEN);
832
/* The 64-bit position is rebuilt from the high and low 4-byte
words stored in the header. */
trx_sys_mysql_master_log_pos
833
= (((ib_int64_t) mach_read_from_4(
834
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
835
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
836
+ ((ib_int64_t) mach_read_from_4(
837
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
838
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW));
842
/****************************************************************//**
Looks for a free slot for a rollback segment in the trx system file copy.
The caller must hold kernel_mutex (checked with a debug assertion).
@return slot index or ULINT_UNDEFINED if not found */
847
trx_sysf_rseg_find_free(
848
/*====================*/
849
mtr_t* mtr) /*!< in: mtr */
851
trx_sysf_t* sys_header;
855
ut_ad(mutex_own(&(kernel_mutex)));
857
sys_header = trx_sysf_get(mtr);
859
/* Scan the TRX_SYS_N_RSEGS slots in index order; a slot whose page
number reads as FIL_NULL is tested for below. */
for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
861
page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
863
if (page_no == FIL_NULL) {
869
/* No free slot was found. */
return(ULINT_UNDEFINED);
872
/*****************************************************************//**
873
Creates the file page for the transaction system. This function is called only
874
at the database creation, before trx_sys_init. */
879
mtr_t* mtr) /*!< in: mtr */
881
trx_sysf_t* sys_header;
891
/* Note that below we first reserve the file space x-latch, and
892
then enter the kernel: we must do it in this order to conform
893
to the latching order rules. */
895
mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
896
mutex_enter(&kernel_mutex);
898
/* Create the trx sys file block in a new allocated file segment */
899
block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
901
buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
903
ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
905
page = buf_block_get_frame(block);
907
mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
910
/* Reset the doublewrite buffer magic number to zero so that we
911
know that the doublewrite buffer has not yet been created (this
912
suppresses a Valgrind warning) */
914
mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
915
+ TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
917
sys_header = trx_sysf_get(mtr);
919
/* Start counting transaction ids from number 1 up */
920
mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
922
/* Reset the rollback segment slots. Old versions of InnoDB
923
define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
924
that the whole array is initialized. */
925
ptr = TRX_SYS_RSEGS + sys_header;
926
len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
927
* TRX_SYS_RSEG_SLOT_SIZE;
928
memset(ptr, 0xff, len);
930
ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
932
/* Initialize all of the page. This part used to be uninitialized. */
933
memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
935
mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
936
+ page - sys_header, mtr);
938
/* Create the first rollback segment in the SYSTEM tablespace */
939
slot_no = trx_sysf_rseg_find_free(mtr);
940
page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
942
ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
943
ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
945
mutex_exit(&kernel_mutex);
948
/*****************************************************************//**
Creates and initializes the central memory structures for the transaction
system. This is called when the database is started.
Allocates the global trx_sys, initializes the rollback segment list, derives
max_trx_id from the value stored in the trx system header, initializes the
transaction lists, reports any transactions that must be rolled back or
cleaned up, and creates the purge system. The work is done while holding
kernel_mutex. */
953
trx_sys_init_at_db_start(void)
954
/*==========================*/
956
trx_sysf_t* sys_header;
957
ib_uint64_t rows_to_undo = 0;
958
const char* unit = "";
964
/* This function must not be called when trx_sys already exists. */
ut_ad(trx_sys == NULL);
966
mutex_enter(&kernel_mutex);
968
trx_sys = mem_alloc(sizeof(trx_sys_t));
970
sys_header = trx_sysf_get(&mtr);
972
trx_rseg_list_and_array_init(sys_header, &mtr);
974
trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
976
/* VERY important: after the database is started, max_trx_id value is
divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
trx_sys_get_new_trx_id will evaluate to TRUE when the function
is first time called, and the value for trx id will be written
to the disk-based header! Thus trx id values will not overlap when
the database is repeatedly started! */
983
/* Stored value aligned up to the write margin, plus two margins. */
trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
984
+ ut_uint64_align_up(mach_read_from_8(sys_header
985
+ TRX_SYS_TRX_ID_STORE),
986
TRX_SYS_TRX_ID_WRITE_MARGIN);
988
UT_LIST_INIT(trx_sys->mysql_trx_list);
989
trx_dummy_sess = sess_open();
990
trx_lists_init_at_db_start();
992
/* If trx_lists_init_at_db_start() left transactions in
trx_sys->trx_list, total up their undo work and report it. */
if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
993
trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
997
/* Transactions in TRX_PREPARED state are not counted. */
if (trx->conc_state != TRX_PREPARED) {
998
rows_to_undo += trx->undo_no;
1001
trx = UT_LIST_GET_NEXT(trx_list, trx);
1008
/* Counts above 10^9 are divided by 10^6 before being printed;
`unit` is printed directly after the number. */
if (rows_to_undo > 1000000000) {
1010
rows_to_undo = rows_to_undo / 1000000;
1014
"InnoDB: %lu transaction(s) which must be"
1015
" rolled back or cleaned up\n"
1016
"InnoDB: in total %lu%s row operations to undo\n",
1017
(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
1018
(ulong) rows_to_undo, unit);
1020
fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
1021
trx_sys->max_trx_id);
1024
UT_LIST_INIT(trx_sys->view_list);
1026
trx_purge_sys_create();
1028
mutex_exit(&kernel_mutex);
1033
/*****************************************************************//**
Creates and initializes the transaction system at the database creation.
The trx system file page is created first with trx_sysf_create(), then the
in-memory structures are set up with trx_sys_init_at_db_start(). */
1037
trx_sys_create(void)
1038
/*================*/
1044
trx_sysf_create(&mtr);
1048
trx_sys_init_at_db_start();
1051
/*****************************************************************//**
1052
Update the file format tag.
1053
@return always TRUE */
1056
trx_sys_file_format_max_write(
1057
/*==========================*/
1058
ulint format_id, /*!< in: file format id */
1059
const char** name) /*!< out: max file format name, can
1065
ib_uint64_t tag_value;
1069
block = buf_page_get(
1070
TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
1072
file_format_max.id = format_id;
1073
file_format_max.name = trx_sys_file_format_id_to_name(format_id);
1075
ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
1076
tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
1079
*name = file_format_max.name;
1082
mlog_write_ull(ptr, tag_value, &mtr);
1089
/*****************************************************************//**
Read the file format tag.
The tag is an 8-byte value at TRX_SYS_FILE_FORMAT_TAG on the trx system
page; the format id is obtained by subtracting
TRX_SYS_FILE_FORMAT_TAG_MAGIC_N from it.
@return the file format or ULINT_UNDEFINED if not set. */
1094
trx_sys_file_format_max_read(void)
1095
/*==============================*/
1099
const buf_block_t* block;
1100
ib_id_t file_format_id;
1102
/* Since this is called during the startup phase it's safe to
read the value without a covering mutex. */
1106
block = buf_page_get(
1107
TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
1109
ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
1110
file_format_id = mach_read_from_8(ptr);
1114
/* Remove the magic offset; the result is range-checked against
the name table next. NOTE(review): assumes ib_id_t is unsigned, so
that a stored value below the magic number wraps to a large value
and is rejected by the check below - confirm. */
file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
1116
if (file_format_id >= FILE_FORMAT_NAME_N) {
1118
/* Either it has never been tagged, or garbage in it. */
1119
return(ULINT_UNDEFINED);
1122
return((ulint) file_format_id);
1125
/*****************************************************************//**
Get the name representation of the file format from its id.
The id is used as an index into file_format_name_map; an id outside the
table fails the ut_a() assertion.
@return pointer to the name */
1130
trx_sys_file_format_id_to_name(
1131
/*===========================*/
1132
const ulint id) /*!< in: id of the file format */
1134
/* Bounds check before indexing the name table. */
ut_a(id < FILE_FORMAT_NAME_N);
1136
return(file_format_name_map[id]);
1139
/*****************************************************************//**
1140
Check for the max file format tag stored on disk. Note: If max_format_id
1141
is == DICT_TF_FORMAT_MAX + 1 then we only print a warning.
1142
@return DB_SUCCESS or error code */
1145
trx_sys_file_format_max_check(
1146
/*==========================*/
1147
ulint max_format_id) /*!< in: max format id to check */
1151
/* Check the file format in the tablespace. Do not try to
1152
recover if the file format is not supported by the engine
1153
unless forced by the user. */
1154
format_id = trx_sys_file_format_max_read();
1155
if (format_id == ULINT_UNDEFINED) {
1156
/* Format ID was not set. Set it to minimum possible
1158
format_id = DICT_TF_FORMAT_MIN;
1161
ut_print_timestamp(stderr);
1163
" InnoDB: highest supported file format is %s.\n",
1164
trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
1166
if (format_id > DICT_TF_FORMAT_MAX) {
1168
ut_a(format_id < FILE_FORMAT_NAME_N);
1170
ut_print_timestamp(stderr);
1172
" InnoDB: %s: the system tablespace is in a file "
1173
"format that this version doesn't support - %s\n",
1174
((max_format_id <= DICT_TF_FORMAT_MAX)
1175
? "Error" : "Warning"),
1176
trx_sys_file_format_id_to_name(format_id));
1178
if (max_format_id <= DICT_TF_FORMAT_MAX) {
1183
format_id = (format_id > max_format_id) ? format_id : max_format_id;
1185
/* We don't need a mutex here, as this function should only
1186
be called once at start up. */
1187
file_format_max.id = format_id;
1188
file_format_max.name = trx_sys_file_format_id_to_name(format_id);
1193
/*****************************************************************//**
Set the file format id unconditionally except if it's already the
same value. The comparison and the write are done while holding
file_format_max.mutex.
@return TRUE if value updated */
1199
trx_sys_file_format_max_set(
1200
/*========================*/
1201
ulint format_id, /*!< in: file format id */
1202
const char** name) /*!< out: max file format name or
NULL if not needed. */
1207
/* Ids above DICT_TF_FORMAT_MAX are a caller error. */
ut_a(format_id <= DICT_TF_FORMAT_MAX);
1209
mutex_enter(&file_format_max.mutex);
1211
/* Only update if not already same value. */
1212
if (format_id != file_format_max.id) {
1214
/* The write helper receives `name` unchanged. */
ret = trx_sys_file_format_max_write(format_id, name);
1217
mutex_exit(&file_format_max.mutex);
1222
/********************************************************************//**
Tags the system table space with minimum format id if it has not been
tagged yet, i.e. if trx_sys_file_format_max_read() returns ULINT_UNDEFINED.
WARNING: This function is only called during the startup and AFTER the
redo log application during recovery has finished. */
1229
trx_sys_file_format_tag_init(void)
1230
/*==============================*/
1234
format_id = trx_sys_file_format_max_read();
1236
/* If format_id is not set then set it to the minimum. */
1237
if (format_id == ULINT_UNDEFINED) {
1238
/* The name output is not needed here, so NULL is passed. */
trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL);
1242
/********************************************************************//**
Update the file format tag in the system tablespace only if the given
format id is greater than the known max id. The comparison and the write
are done while holding file_format_max.mutex.
@return TRUE if format_id was bigger than the known max id */
1248
trx_sys_file_format_max_upgrade(
1249
/*============================*/
1250
const char** name, /*!< out: max file format name */
1251
ulint format_id) /*!< in: file format identifier */
1256
/* file_format_max.name is assigned by trx_sys_file_format_init();
presumably this assertion catches use before that initialization. */
ut_a(file_format_max.name != NULL);
1257
ut_a(format_id <= DICT_TF_FORMAT_MAX);
1259
mutex_enter(&file_format_max.mutex);
1261
/* Strictly greater: an equal id leaves the tag untouched. */
if (format_id > file_format_max.id) {
1263
ret = trx_sys_file_format_max_write(format_id, name);
1266
mutex_exit(&file_format_max.mutex);
1271
/*****************************************************************//**
1272
Get the name representation of the file format from its id.
1273
@return pointer to the max format name */
1276
trx_sys_file_format_max_get(void)
1277
/*=============================*/
1279
/* Returns the name stored in file_format_max as-is; no id-to-name
lookup is performed here, and file_format_max.mutex is not taken in
the visible code. */
return(file_format_max.name);
1282
/*****************************************************************//**
1283
Initializes the tablespace tag system. */
1286
trx_sys_file_format_init(void)
1287
/*==========================*/
1289
/* Create the mutex that trx_sys_file_format_max_set() and
trx_sys_file_format_max_upgrade() take around their updates. */
mutex_create(file_format_max_mutex_key,
1290
&file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
1292
/* We don't need a mutex here, as this function should only
1293
be called once at start up. */
1294
/* Start from DICT_TF_FORMAT_MIN and derive the name from the id that
was just stored, so the two fields agree. */
file_format_max.id = DICT_TF_FORMAT_MIN;
1296
file_format_max.name = trx_sys_file_format_id_to_name(
1297
file_format_max.id);
1300
/*****************************************************************//**
1301
Closes the tablespace tag system. */
1304
trx_sys_file_format_close(void)
1305
/*===========================*/
1307
/* Does nothing at the moment */
/* NOTE(review): the mutex created in trx_sys_file_format_init() is not
released here in the visible code; confirm that it is freed
elsewhere. */
1310
/*********************************************************************
1311
Creates the rollback segments */
1314
trx_sys_create_rsegs(
1315
/*=================*/
1316
ulint n_rsegs) /*!< number of rollback segments to create */
1318
/* Drives the report near the end of the function; presumably counts
the segments created by this call (the increment is not visible in
this excerpt). */
ulint new_rsegs = 0;
1320
/* Do not create additional rollback segments if
1321
innodb_force_recovery has been set and the database
1322
was not shutdown cleanly. */
1323
/* As coded, creation proceeds only when NEITHER flag is set, i.e. it
is skipped if innodb_force_recovery is set or if recovery was
needed. */
if (!srv_force_recovery && !recv_needed_recovery) {
1326
for (i = 0; i < n_rsegs; ++i) {
1328
/* The return value of trx_rseg_create() is compared against NULL;
presumably NULL means that no segment could be created. */
if (trx_rseg_create() != NULL) {
1336
/* The message is printed only when new_rsegs is non-zero. */
if (new_rsegs > 0) {
1338
"InnoDB: %lu rollback segment(s) active.\n",
1343
#else /* !UNIV_HOTBACKUP */
1344
/*****************************************************************//**
1345
Prints to stderr the MySQL binlog info in the system header if the
1346
magic number shows it valid. */
1349
trx_sys_print_mysql_binlog_offset_from_page(
1350
/*========================================*/
1351
const byte* page) /*!< in: buffer containing the trx
1352
system header page, i.e., page number
1353
TRX_SYS_PAGE_NO in the tablespace */
1355
const trx_sysf_t* sys_header;
1357
/* The trx system header is located TRX_SYS bytes into the page. */
sys_header = page + TRX_SYS;
1359
/* The binlog info is used only if the 4-byte magic number field
inside the TRX_SYS_MYSQL_LOG_INFO area matches
TRX_SYS_MYSQL_LOG_MAGIC_N. */
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
1360
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
1361
== TRX_SYS_MYSQL_LOG_MAGIC_N) {
1364
"ibbackup: Last MySQL binlog file position %lu %lu,"
1366
/* The position is read as two separate 4-byte fields and printed
high part first, then low part; each value is cast to ulong to match
the %lu conversions. */
(ulong) mach_read_from_4(
1367
sys_header + TRX_SYS_MYSQL_LOG_INFO
1368
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
1369
(ulong) mach_read_from_4(
1370
sys_header + TRX_SYS_MYSQL_LOG_INFO
1371
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
1372
/* Pointer to the name field, passed as the final argument. */
sys_header + TRX_SYS_MYSQL_LOG_INFO
1373
+ TRX_SYS_MYSQL_LOG_NAME);
1378
/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
1379
(This code duplication should be fixed at some point!)
1382
#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
1383
/* The offset of the file format tag on the trx system header page */
1384
#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
1385
/* We use these random constants to reduce the probability of reading
1386
garbage (from previous versions) that maps to an actual format id. We
1387
use these as bit masks at the time of reading and writing from/to disk. */
1388
#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
1389
#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
1391
/* END OF COPIED DEFINITIONS */
1394
/*****************************************************************//**
1395
Reads the file format id from the first system table space file.
1396
Even if the call succeeds and returns TRUE, the returned format id
1397
may be ULINT_UNDEFINED signalling that the format id was not present
1399
@return TRUE if call succeeds */
1402
trx_sys_read_file_format_id(
1403
/*========================*/
1404
const char *pathname, /*!< in: pathname of the first system
1406
ulint *format_id) /*!< out: file format of the system table
1411
byte buf[UNIV_PAGE_SIZE * 2];
1412
page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
1414
ib_id_t file_format_id;
1416
*format_id = ULINT_UNDEFINED;
1418
file = os_file_create_simple_no_error_handling(
1419
innodb_file_data_key,
1426
/* The following call prints an error message */
1427
os_file_get_last_error(TRUE);
1429
ut_print_timestamp(stderr);
1432
" ibbackup: Error: trying to read system tablespace file format,\n"
1433
" ibbackup: but could not open the tablespace file %s!\n",
1439
/* Read the page on which file format is stored */
1441
/* 'page' is the UNIV_PAGE_SIZE-aligned pointer into 'buf'; 'buf' is
two pages long so that one aligned page fits inside it. The arguments
are presumably (file, buffer, offset low, offset high, length): the
page at TRX_SYS_PAGE_NO is read, UNIV_PAGE_SIZE bytes in total. */
success = os_file_read_no_error_handling(
1442
file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
1445
/* The following call prints an error message */
1446
os_file_get_last_error(TRUE);
1448
ut_print_timestamp(stderr);
1451
" ibbackup: Error: trying to read system table space file format,\n"
1452
" ibbackup: but failed to read the tablespace file %s!\n",
1455
/* NOTE(review): os_file_close() appears twice, presumably once on the
read-failure path and once on the normal path; the branch structure is
not visible in this excerpt, so confirm the two calls are mutually
exclusive. */
os_file_close(file);
1458
os_file_close(file);
1460
/* get the file format from the page */
1461
ptr = page + TRX_SYS_FILE_FORMAT_TAG;
1462
file_format_id = mach_read_from_8(ptr);
1463
/* Subtract the magic constant from the 8-byte value read above; the
remainder is what gets range-checked as a format id. */
file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
1465
/* Values at or above FILE_FORMAT_NAME_N are not accepted as format
ids. */
if (file_format_id >= FILE_FORMAT_NAME_N) {
1467
/* Either it has never been tagged, or garbage in it. */
1471
*format_id = (ulint) file_format_id;
1477
/*****************************************************************//**
1478
Reads the file format id from the given per-table data file.
1479
@return TRUE if call succeeds */
1482
trx_sys_read_pertable_file_format_id(
1483
/*=================================*/
1484
const char *pathname, /*!< in: pathname of a per-table
1486
ulint *format_id) /*!< out: file format of the per-table
1491
byte buf[UNIV_PAGE_SIZE * 2];
1492
page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
1496
*format_id = ULINT_UNDEFINED;
1498
file = os_file_create_simple_no_error_handling(
1499
innodb_file_data_key,
1506
/* The following call prints an error message */
1507
os_file_get_last_error(TRUE);
1509
ut_print_timestamp(stderr);
1512
" ibbackup: Error: trying to read per-table tablespace format,\n"
1513
" ibbackup: but could not open the tablespace file %s!\n",
1519
/* Read the first page of the per-table datafile */
1521
/* Offset arguments are 0, 0: the read starts at the beginning of the
file and covers UNIV_PAGE_SIZE bytes, into the aligned 'page'
buffer. */
success = os_file_read_no_error_handling(
1522
file, page, 0, 0, UNIV_PAGE_SIZE
1525
/* The following call prints an error message */
1526
os_file_get_last_error(TRUE);
1528
ut_print_timestamp(stderr);
1531
/* NOTE(review): the message below reads "trying to per-table ..."; a
verb (presumably "read", as in the other messages of this function)
appears to be missing. */
" ibbackup: Error: trying to per-table data file format,\n"
1532
" ibbackup: but failed to read the tablespace file %s!\n",
1535
/* NOTE(review): os_file_close() appears twice, presumably once on the
read-failure path and once on the normal path; the branch structure is
not visible in this excerpt. */
os_file_close(file);
1538
os_file_close(file);
1540
/* get the file format from the page */
1542
/* The tablespace flags are read as a 4-byte value from 'ptr'. */
flags = mach_read_from_4(ptr);
1544
/* file format is Antelope */
1547
/* Bit 0 of the flags must be set for this branch to be taken. */
} else if (flags & 1) {
1548
/* tablespace flags are ok */
1549
/* Dividing by 32 drops the five low-order bits and the modulo 128
keeps the next seven bits: the format id is bits 5..11 of the
flags. */
*format_id = (flags / 32) % 128;
1552
/* bad tablespace flags */
1558
/*****************************************************************//**
1559
Get the name representation of the file format from its id.
1560
@return pointer to the name */
1563
trx_sys_file_format_id_to_name(
1564
/*===========================*/
1565
const ulint id) /*!< in: id of the file format */
1567
/* Range check ahead of the table lookup below: ids at or beyond
FILE_FORMAT_NAME_N take this branch (its body is not visible in this
excerpt). */
if (!(id < FILE_FORMAT_NAME_N)) {
1572
/* In-range ids index file_format_name_map directly. */
return(file_format_name_map[id]);
1575
#endif /* !UNIV_HOTBACKUP */
1577
#ifndef UNIV_HOTBACKUP
1578
/*********************************************************************
1579
Shutdown/Close the transaction system. */
1588
/* Precondition: the transaction system must have been created. */
ut_ad(trx_sys != NULL);
1590
/* NOTE(review): in the read-view check below, the count passed to the
%lu conversion is UT_LIST_GET_LEN(...) - 1 without the (ulong) cast
used for %lu arguments elsewhere in this file; confirm the list length
type matches unsigned long on all targets. */
/* Check that all read views are closed except read view owned
1593
if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
1595
"InnoDB: Error: all read views were not closed"
1596
" before shutdown:\n"
1597
"InnoDB: %lu read views open \n",
1598
UT_LIST_GET_LEN(trx_sys->view_list) - 1);
1601
sess_close(trx_dummy_sess);
1602
trx_dummy_sess = NULL;
1604
trx_purge_sys_close();
1606
mutex_enter(&kernel_mutex);
1608
/* Free the double write data structures. */
1609
ut_a(trx_doublewrite != NULL);
1610
/* Each member pointer is reset to NULL right after it is released, and
the global trx_doublewrite pointer is cleared last.
write_buf_unaligned is released with ut_free() while the other
allocations use mem_free(); presumably each matches the allocator used
at creation (verify). */
ut_free(trx_doublewrite->write_buf_unaligned);
1611
trx_doublewrite->write_buf_unaligned = NULL;
1613
mem_free(trx_doublewrite->buf_block_arr);
1614
trx_doublewrite->buf_block_arr = NULL;
1616
mutex_free(&trx_doublewrite->mutex);
1617
mem_free(trx_doublewrite);
1618
trx_doublewrite = NULL;
1620
/* There can't be any active transactions. */
1621
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
1623
while (rseg != NULL) {
1624
trx_rseg_t* prev_rseg = rseg;
1626
/* Advance first: the successor is read while prev_rseg is still
linked, then prev_rseg is unlinked from the list and freed. */
rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
1627
UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
1629
trx_rseg_mem_free(prev_rseg);
1632
view = UT_LIST_GET_FIRST(trx_sys->view_list);
1634
while (view != NULL) {
1635
read_view_t* prev_view = view;
1637
/* Same advance-then-unlink order as in the rollback segment loop. */
view = UT_LIST_GET_NEXT(view_list, prev_view);
1639
/* Views are allocated from the trx_sys->global_read_view_heap.
1640
So, we simply remove the element here. */
1641
UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
1644
/* All four lists are asserted to be empty. In the visible code only
rseg_list and view_list are emptied by this function; trx_list and
mysql_trx_list are expected to be empty already. */
ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
1645
ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
1646
ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
1647
ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
1652
mutex_exit(&kernel_mutex);
1654
#endif /* !UNIV_HOTBACKUP */