1
/*****************************************************************************
3
Copyright (C) 1996, 2010, Innobase Oy. All Rights Reserved.
5
This program is free software; you can redistribute it and/or modify it under
6
the terms of the GNU General Public License as published by the Free Software
7
Foundation; version 2 of the License.
9
This program is distributed in the hope that it will be useful, but WITHOUT
10
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License along with
14
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
15
St, Fifth Floor, Boston, MA 02110-1301 USA
17
*****************************************************************************/
19
/**************************************************//**
23
Created 3/26/1996 Heikki Tuuri
24
*******************************************************/
32
#ifndef UNIV_HOTBACKUP
40
#include "trx0purge.h"
44
#include "read0read.h"
46
/** The file format tag structure with id and name. */
47
struct file_format_struct {
48
ulint id; /*!< id of the file format */
49
const char* name; /*!< text representation of the
51
mutex_t mutex; /*!< covers changes to the above
55
/** The file format tag */
56
typedef struct file_format_struct file_format_t;
58
/** The transaction system */
59
UNIV_INTERN trx_sys_t* trx_sys = NULL;
60
/** The doublewrite buffer */
61
UNIV_INTERN trx_doublewrite_t* trx_doublewrite = NULL;
63
/** The following is set to TRUE when we are upgrading from pre-4.1
64
format data files to the multiple tablespaces format data files */
65
UNIV_INTERN ibool trx_doublewrite_must_reset_space_ids = FALSE;
66
/** Set to TRUE when the doublewrite buffer is being created */
67
UNIV_INTERN ibool trx_doublewrite_buf_is_being_created = FALSE;
69
/** The following is TRUE when we are using the database in the
70
post-4.1 format, i.e., we have successfully upgraded, or have created
71
a new database installation */
72
UNIV_INTERN ibool trx_sys_multiple_tablespace_format = FALSE;
74
/** In a MySQL replication slave, in crash recovery we store the master log
75
file name and position here. */
77
/** Master binlog file name */
78
UNIV_INTERN char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
79
/** Master binlog file position. We have successfully got the updates
80
up to this position. -1 means that no crash recovery was needed, or
81
there was no master log position info inside InnoDB.*/
82
UNIV_INTERN ib_int64_t trx_sys_mysql_master_log_pos = -1;
85
/** If this MySQL server uses binary logging, after InnoDB has been inited
86
and if it has done a crash recovery, we store the binlog file name and position
89
/** Binlog file name */
90
UNIV_INTERN char trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
91
/** Binlog file position, or -1 if unknown */
92
UNIV_INTERN ib_int64_t trx_sys_mysql_bin_log_pos = -1;
94
UNIV_INTERN drizzled::atomic<uint64_t> trx_sys_commit_id;
97
#endif /* !UNIV_HOTBACKUP */
99
/** List of animal names representing file format. */
100
static const char* file_format_name_map[] = {
129
/** The number of elements in the file format name array. */
130
static const ulint FILE_FORMAT_NAME_N
131
= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
133
#ifdef UNIV_PFS_MUTEX
134
/* Key to register the mutex with performance schema */
135
UNIV_INTERN mysql_pfs_key_t trx_doublewrite_mutex_key;
136
UNIV_INTERN mysql_pfs_key_t file_format_max_mutex_key;
137
#endif /* UNIV_PFS_MUTEX */
139
#ifndef UNIV_HOTBACKUP
140
/** This is used to track the maximum file format id known to InnoDB. It's
141
updated via SET GLOBAL innodb_file_format_max = 'x' or when we open
142
or create a table. */
143
static file_format_t file_format_max;
145
/****************************************************************//**
146
Determines if a page number is located inside the doublewrite buffer.
147
@return TRUE if the location is inside the two blocks of the
148
doublewrite buffer */
151
trx_doublewrite_page_inside(
152
/*========================*/
153
ulint page_no) /*!< in: page number */
155
if (trx_doublewrite == NULL) {
160
if (page_no >= trx_doublewrite->block1
161
&& page_no < trx_doublewrite->block1
162
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
166
if (page_no >= trx_doublewrite->block2
167
&& page_no < trx_doublewrite->block2
168
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
175
/****************************************************************//**
176
Creates or initializes the doublewrite buffer at a database start. */
179
trx_doublewrite_init(
180
/*=================*/
181
byte* doublewrite) /*!< in: pointer to the doublewrite buf
182
header on trx sys page */
184
trx_doublewrite = static_cast<trx_doublewrite_t *>(mem_alloc(sizeof(trx_doublewrite_t)));
186
/* Since we now start to use the doublewrite buffer, no need to call
187
fsync() after every write to a data file */
189
os_do_not_call_flush_at_each_write = TRUE;
190
#endif /* UNIV_DO_FLUSH */
192
mutex_create(trx_doublewrite_mutex_key,
193
&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
195
trx_doublewrite->first_free = 0;
197
trx_doublewrite->block1 = mach_read_from_4(
198
doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
199
trx_doublewrite->block2 = mach_read_from_4(
200
doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
201
trx_doublewrite->write_buf_unaligned = static_cast<byte *>(ut_malloc(
202
(1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE));
204
trx_doublewrite->write_buf = static_cast<byte *>(ut_align(
205
trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE));
206
trx_doublewrite->buf_block_arr = static_cast<buf_page_t **>(mem_alloc(
207
2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*)));
210
/****************************************************************//**
211
Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
212
multiple tablespace format. */
215
trx_sys_mark_upgraded_to_multiple_tablespaces(void)
216
/*===============================================*/
222
/* We upgraded to 4.1.x and reset the space id fields in the
223
doublewrite buffer. Let us mark to the trx_sys header that the upgrade
228
block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
230
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
232
doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
234
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
235
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
239
/* Flush the modified pages to disk and make a checkpoint */
240
log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
242
trx_sys_multiple_tablespace_format = TRUE;
245
/****************************************************************//**
246
Creates the doublewrite buffer for a new InnoDB installation. The header of the
247
doublewrite buffer is placed on the trx system header page. */
250
trx_sys_create_doublewrite_buf(void)
251
/*================================*/
255
#ifdef UNIV_SYNC_DEBUG
256
buf_block_t* new_block;
257
#endif /* UNIV_SYNC_DEBUG */
265
if (trx_doublewrite) {
273
trx_doublewrite_buf_is_being_created = TRUE;
275
block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
277
buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
279
doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
281
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
282
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
283
/* The doublewrite buffer has already been created:
284
just read in some numbers */
286
trx_doublewrite_init(doublewrite);
289
trx_doublewrite_buf_is_being_created = FALSE;
292
"InnoDB: Doublewrite buffer not found:"
295
if (buf_pool_get_curr_size()
296
< ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
297
+ FSP_EXTENT_SIZE / 2 + 100)
300
"InnoDB: Cannot create doublewrite buffer:"
302
"InnoDB: increase your buffer pool size.\n"
303
"InnoDB: Cannot continue operation.\n");
308
block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
310
+ TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
312
/* fseg_create acquires a second latch on the page,
313
therefore we must declare it: */
315
buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
317
if (block2 == NULL) {
319
"InnoDB: Cannot create doublewrite buffer:"
321
"InnoDB: increase your tablespace size.\n"
322
"InnoDB: Cannot continue operation.\n");
324
/* We exit without committing the mtr to prevent
325
its modifications to the database getting to disk */
330
fseg_header = buf_block_get_frame(block)
331
+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
334
for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
335
+ FSP_EXTENT_SIZE / 2; i++) {
336
page_no = fseg_alloc_free_page(fseg_header,
339
if (page_no == FIL_NULL) {
341
"InnoDB: Cannot create doublewrite"
342
" buffer: you must\n"
343
"InnoDB: increase your"
344
" tablespace size.\n"
345
"InnoDB: Cannot continue operation.\n"
351
/* We read the allocated pages to the buffer pool;
352
when they are written to disk in a flush, the space
353
id and page number fields are also written to the
354
pages. When we at database startup read pages
355
from the doublewrite buffer, we know that if the
356
space id and page number in them are the same as
357
the page position in the tablespace, then the page
358
has not been written to in doublewrite. */
360
#ifdef UNIV_SYNC_DEBUG
362
#endif /* UNIV_SYNC_DEBUG */
363
buf_page_get(TRX_SYS_SPACE, 0, page_no,
365
buf_block_dbg_add_level(new_block,
366
SYNC_NO_ORDER_CHECK);
368
if (i == FSP_EXTENT_SIZE / 2) {
369
ut_a(page_no == FSP_EXTENT_SIZE);
370
mlog_write_ulint(doublewrite
371
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
372
page_no, MLOG_4BYTES, &mtr);
373
mlog_write_ulint(doublewrite
374
+ TRX_SYS_DOUBLEWRITE_REPEAT
375
+ TRX_SYS_DOUBLEWRITE_BLOCK1,
376
page_no, MLOG_4BYTES, &mtr);
377
} else if (i == FSP_EXTENT_SIZE / 2
378
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
379
ut_a(page_no == 2 * FSP_EXTENT_SIZE);
380
mlog_write_ulint(doublewrite
381
+ TRX_SYS_DOUBLEWRITE_BLOCK2,
382
page_no, MLOG_4BYTES, &mtr);
383
mlog_write_ulint(doublewrite
384
+ TRX_SYS_DOUBLEWRITE_REPEAT
385
+ TRX_SYS_DOUBLEWRITE_BLOCK2,
386
page_no, MLOG_4BYTES, &mtr);
387
} else if (i > FSP_EXTENT_SIZE / 2) {
388
ut_a(page_no == prev_page_no + 1);
391
prev_page_no = page_no;
394
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
395
TRX_SYS_DOUBLEWRITE_MAGIC_N,
397
mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
398
+ TRX_SYS_DOUBLEWRITE_REPEAT,
399
TRX_SYS_DOUBLEWRITE_MAGIC_N,
402
mlog_write_ulint(doublewrite
403
+ TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
404
TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
408
/* Flush the modified pages to disk and make a checkpoint */
409
log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
411
fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
413
trx_sys_multiple_tablespace_format = TRUE;
419
/****************************************************************//**
420
At a database startup initializes the doublewrite buffer memory structure if
421
we already have a doublewrite buffer created in the data files. If we are
422
upgrading to an InnoDB version which supports multiple tablespaces, then this
423
function performs the necessary update operations. If we are in a crash
424
recovery, this function uses a possible doublewrite buffer to restore
425
half-written pages in the data files. */
428
trx_sys_doublewrite_init_or_restore_pages(
429
/*======================================*/
430
ibool restore_corrupt_pages) /*!< in: TRUE=restore pages */
434
byte* unaligned_read_buf;
437
ulint source_page_no;
444
/* We do the file i/o past the buffer pool */
446
unaligned_read_buf = static_cast<byte *>(ut_malloc(2 * UNIV_PAGE_SIZE));
447
read_buf = static_cast<byte *>(ut_align(unaligned_read_buf, UNIV_PAGE_SIZE));
449
/* Read the trx sys header to check if we are using the doublewrite
452
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
453
UNIV_PAGE_SIZE, read_buf, NULL);
454
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
456
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
457
== TRX_SYS_DOUBLEWRITE_MAGIC_N) {
458
/* The doublewrite buffer has been created */
460
trx_doublewrite_init(doublewrite);
462
block1 = trx_doublewrite->block1;
463
block2 = trx_doublewrite->block2;
465
buf = trx_doublewrite->write_buf;
470
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
471
!= TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
473
/* We are upgrading from a version < 4.1.x to a version where
474
multiple tablespaces are supported. We must reset the space id
475
field in the pages in the doublewrite buffer because starting
476
from this version the space id is stored to
477
FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
479
trx_doublewrite_must_reset_space_ids = TRUE;
482
"InnoDB: Resetting space id's in the"
483
" doublewrite buffer\n");
485
trx_sys_multiple_tablespace_format = TRUE;
488
/* Read the pages from the doublewrite buffer to memory */
490
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block1, 0,
491
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
493
fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, 0, block2, 0,
494
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
495
buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
497
/* Check if any of these pages is half-written in data files, in the
502
for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
504
page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
506
if (trx_doublewrite_must_reset_space_ids) {
510
+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
511
/* We do not need to calculate new checksums for the
512
pages because the field .._SPACE_ID does not affect
513
them. Write the page back to where we read it from. */
515
if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
516
source_page_no = block1 + i;
518
source_page_no = block2
519
+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
522
fil_io(OS_FILE_WRITE, TRUE, 0, 0, source_page_no, 0,
523
UNIV_PAGE_SIZE, page, NULL);
524
/* printf("Resetting space id in page %lu\n",
527
space_id = mach_read_from_4(
528
page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
531
if (!restore_corrupt_pages) {
532
/* The database was shut down gracefully: no need to
535
} else if (!fil_tablespace_exists_in_mem(space_id)) {
536
/* Maybe we have dropped the single-table tablespace
537
and this page once belonged to it: do nothing */
539
} else if (!fil_check_adress_in_tablespace(space_id,
542
"InnoDB: Warning: a page in the"
543
" doublewrite buffer is not within space\n"
544
"InnoDB: bounds; space id %lu"
545
" page number %lu, page %lu in"
546
" doublewrite buf.\n",
547
(ulong) space_id, (ulong) page_no, (ulong) i);
549
} else if (space_id == TRX_SYS_SPACE
550
&& ((page_no >= block1
552
< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
553
|| (page_no >= block2
556
+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
558
/* It is an unwritten doublewrite buffer page:
561
ulint zip_size = fil_space_get_zip_size(space_id);
563
/* Read in the actual page from the file */
564
fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
566
zip_size ? zip_size : UNIV_PAGE_SIZE,
569
/* Check if the page is corrupt */
572
(buf_page_is_corrupted(read_buf, zip_size))) {
575
"InnoDB: Warning: database page"
576
" corruption or a failed\n"
577
"InnoDB: file read of"
578
" space %lu page %lu.\n"
579
"InnoDB: Trying to recover it from"
580
" the doublewrite buffer.\n",
581
(ulong) space_id, (ulong) page_no);
583
if (buf_page_is_corrupted(page, zip_size)) {
585
"InnoDB: Dump of the page:\n");
586
buf_page_print(read_buf, zip_size);
589
" corresponding page"
590
" in doublewrite buffer:\n");
591
buf_page_print(page, zip_size);
594
"InnoDB: Also the page in the"
595
" doublewrite buffer"
597
"InnoDB: Cannot continue"
599
"InnoDB: You can try to"
600
" recover the database"
604
" innodb_force_recovery=6\n");
608
/* Write the good page from the
609
doublewrite buffer to the intended
612
fil_io(OS_FILE_WRITE, TRUE, space_id,
613
zip_size, page_no, 0,
614
zip_size ? zip_size : UNIV_PAGE_SIZE,
617
"InnoDB: Recovered the page from"
618
" the doublewrite buffer.\n");
622
page += UNIV_PAGE_SIZE;
625
fil_flush_file_spaces(FIL_TABLESPACE);
628
ut_free(unaligned_read_buf);
631
/****************************************************************//**
632
Checks that trx is in the trx list.
633
@return TRUE if the trx is in the trx list */
638
trx_t* in_trx) /*!< in: trx */
642
ut_ad(mutex_own(&(kernel_mutex)));
644
trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
646
while (trx != NULL) {
653
trx = UT_LIST_GET_NEXT(trx_list, trx);
659
/*****************************************************************//**
660
Writes the value of max_trx_id to the file based trx system header. */
663
trx_sys_flush_max_trx_id(void)
664
/*==========================*/
666
trx_sysf_t* sys_header;
669
ut_ad(mutex_own(&kernel_mutex));
673
sys_header = trx_sysf_get(&mtr);
675
mlog_write_ull(sys_header + TRX_SYS_TRX_ID_STORE,
676
trx_sys->max_trx_id, &mtr);
682
trx_sys_flush_commit_id(uint64_t commit_id, ulint field, mtr_t* mtr)
684
trx_sysf_t* sys_header;
686
sys_header = trx_sysf_get(mtr);
688
mlog_write_ull(sys_header + field + TRX_SYS_DRIZZLE_MAX_COMMIT_ID,
695
trx_sys_read_commit_id(void)
696
/*===================================*/
698
trx_sysf_t* sys_header;
703
sys_header = trx_sysf_get(&mtr);
705
trx_sys_commit_id = mach_read_from_8(sys_header + TRX_SYS_DRIZZLE_LOG_INFO
706
+ TRX_SYS_DRIZZLE_MAX_COMMIT_ID);
711
/****************************************************************//**
712
Looks for a free slot for a rollback segment in the trx system file copy.
713
@return slot index or ULINT_UNDEFINED if not found */
716
trx_sysf_rseg_find_free(
717
/*====================*/
718
mtr_t* mtr) /*!< in: mtr */
720
trx_sysf_t* sys_header;
724
ut_ad(mutex_own(&(kernel_mutex)));
726
sys_header = trx_sysf_get(mtr);
728
for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
730
page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
732
if (page_no == FIL_NULL) {
738
return(ULINT_UNDEFINED);
741
/*****************************************************************//**
742
Creates the file page for the transaction system. This function is called only
743
at the database creation, before trx_sys_init. */
748
mtr_t* mtr) /*!< in: mtr */
750
trx_sysf_t* sys_header;
760
/* Note that below we first reserve the file space x-latch, and
761
then enter the kernel: we must do it in this order to conform
762
to the latching order rules. */
764
mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
765
mutex_enter(&kernel_mutex);
767
/* Create the trx sys file block in a new allocated file segment */
768
block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
770
buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
772
ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
774
page = buf_block_get_frame(block);
776
mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
779
/* Reset the doublewrite buffer magic number to zero so that we
780
know that the doublewrite buffer has not yet been created (this
781
suppresses a Valgrind warning) */
783
mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
784
+ TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
786
sys_header = trx_sysf_get(mtr);
788
/* Start counting transaction ids from number 1 up */
789
mach_write_to_8(sys_header + TRX_SYS_TRX_ID_STORE, 1);
791
/* Reset the rollback segment slots. Old versions of InnoDB
792
define TRX_SYS_N_RSEGS as 256 (TRX_SYS_OLD_N_RSEGS) and expect
793
that the whole array is initialized. */
794
ptr = TRX_SYS_RSEGS + sys_header;
795
len = ut_max(TRX_SYS_OLD_N_RSEGS, TRX_SYS_N_RSEGS)
796
* TRX_SYS_RSEG_SLOT_SIZE;
797
memset(ptr, 0xff, len);
799
ut_a(ptr <= page + (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END));
801
/* Initialize all of the page. This part used to be uninitialized. */
802
memset(ptr, 0, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page - ptr);
804
mlog_log_string(sys_header, UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
805
+ page - sys_header, mtr);
807
/* Create the first rollback segment in the SYSTEM tablespace */
808
slot_no = trx_sysf_rseg_find_free(mtr);
809
page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, slot_no,
811
ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
812
ut_a(page_no == FSP_FIRST_RSEG_PAGE_NO);
814
mutex_exit(&kernel_mutex);
817
/*****************************************************************//**
818
Creates and initializes the central memory structures for the transaction
819
system. This is called when the database is started. */
822
trx_sys_init_at_db_start(void)
823
/*==========================*/
825
trx_sysf_t* sys_header;
826
ib_uint64_t rows_to_undo = 0;
827
const char* unit = "";
833
ut_ad(trx_sys == NULL);
835
mutex_enter(&kernel_mutex);
837
trx_sys = static_cast<trx_sys_t *>(mem_alloc(sizeof(trx_sys_t)));
839
sys_header = trx_sysf_get(&mtr);
841
trx_rseg_list_and_array_init(sys_header, &mtr);
843
trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
845
/* VERY important: after the database is started, max_trx_id value is
846
divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
847
trx_sys_get_new_trx_id will evaluate to TRUE when the function
848
is first time called, and the value for trx id will be written
849
to the disk-based header! Thus trx id values will not overlap when
850
the database is repeatedly started! */
852
trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
853
+ ut_uint64_align_up(mach_read_from_8(sys_header
854
+ TRX_SYS_TRX_ID_STORE),
855
TRX_SYS_TRX_ID_WRITE_MARGIN);
857
UT_LIST_INIT(trx_sys->mysql_trx_list);
858
trx_dummy_sess = sess_open();
859
trx_lists_init_at_db_start();
861
if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
862
trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
866
if (trx->conc_state != TRX_PREPARED) {
867
rows_to_undo += trx->undo_no;
870
trx = UT_LIST_GET_NEXT(trx_list, trx);
877
if (rows_to_undo > 1000000000) {
879
rows_to_undo = rows_to_undo / 1000000;
883
"InnoDB: %lu transaction(s) which must be"
884
" rolled back or cleaned up\n"
885
"InnoDB: in total %lu%s row operations to undo\n",
886
(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
887
(ulong) rows_to_undo, unit);
889
fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
890
trx_sys->max_trx_id);
893
UT_LIST_INIT(trx_sys->view_list);
895
trx_purge_sys_create();
897
mutex_exit(&kernel_mutex);
902
/*****************************************************************//**
903
Creates and initializes the transaction system at the database creation. */
913
trx_sysf_create(&mtr);
917
trx_sys_init_at_db_start();
920
/*****************************************************************//**
921
Update the file format tag.
922
@return always TRUE */
925
trx_sys_file_format_max_write(
926
/*==========================*/
927
ulint format_id, /*!< in: file format id */
928
const char** name) /*!< out: max file format name, can
934
ib_uint64_t tag_value;
938
block = buf_page_get(
939
TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
941
file_format_max.id = format_id;
942
file_format_max.name = trx_sys_file_format_id_to_name(format_id);
944
ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
945
tag_value = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
948
*name = file_format_max.name;
951
mlog_write_ull(ptr, tag_value, &mtr);
958
/*****************************************************************//**
959
Read the file format tag.
960
@return the file format or ULINT_UNDEFINED if not set. */
963
trx_sys_file_format_max_read(void)
964
/*==============================*/
968
const buf_block_t* block;
969
ib_id_t file_format_id;
971
/* Since this is called during the startup phase it's safe to
972
read the value without a covering mutex. */
975
block = buf_page_get(
976
TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
978
ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
979
file_format_id = mach_read_from_8(ptr);
983
file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
985
if (file_format_id >= FILE_FORMAT_NAME_N) {
987
/* Either it has never been tagged, or garbage in it. */
988
return(ULINT_UNDEFINED);
991
return((ulint) file_format_id);
994
/*****************************************************************//**
995
Get the name representation of the file format from its id.
996
@return pointer to the name */
999
trx_sys_file_format_id_to_name(
1000
/*===========================*/
1001
const ulint id) /*!< in: id of the file format */
1003
ut_a(id < FILE_FORMAT_NAME_N);
1005
return(file_format_name_map[id]);
1008
/*****************************************************************//**
1009
Check for the max file format tag stored on disk. Note: If max_format_id
1010
is == DICT_TF_FORMAT_MAX + 1 then we only print a warning.
1011
@return DB_SUCCESS or error code */
1014
trx_sys_file_format_max_check(
1015
/*==========================*/
1016
ulint max_format_id) /*!< in: max format id to check */
1020
/* Check the file format in the tablespace. Do not try to
1021
recover if the file format is not supported by the engine
1022
unless forced by the user. */
1023
format_id = trx_sys_file_format_max_read();
1024
if (format_id == ULINT_UNDEFINED) {
1025
/* Format ID was not set. Set it to minimum possible
1027
format_id = DICT_TF_FORMAT_MIN;
1030
ut_print_timestamp(stderr);
1032
" InnoDB: highest supported file format is %s.\n",
1033
trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
1035
if (format_id > DICT_TF_FORMAT_MAX) {
1037
ut_a(format_id < FILE_FORMAT_NAME_N);
1039
ut_print_timestamp(stderr);
1041
" InnoDB: %s: the system tablespace is in a file "
1042
"format that this version doesn't support - %s\n",
1043
((max_format_id <= DICT_TF_FORMAT_MAX)
1044
? "Error" : "Warning"),
1045
trx_sys_file_format_id_to_name(format_id));
1047
if (max_format_id <= DICT_TF_FORMAT_MAX) {
1052
format_id = (format_id > max_format_id) ? format_id : max_format_id;
1054
/* We don't need a mutex here, as this function should only
1055
be called once at start up. */
1056
file_format_max.id = format_id;
1057
file_format_max.name = trx_sys_file_format_id_to_name(format_id);
1062
/*****************************************************************//**
1063
Set the file format id unconditionally except if it's already the
1065
@return TRUE if value updated */
1068
trx_sys_file_format_max_set(
1069
/*========================*/
1070
ulint format_id, /*!< in: file format id */
1071
const char** name) /*!< out: max file format name or
1072
NULL if not needed. */
1076
ut_a(format_id <= DICT_TF_FORMAT_MAX);
1078
mutex_enter(&file_format_max.mutex);
1080
/* Only update if not already same value. */
1081
if (format_id != file_format_max.id) {
1083
ret = trx_sys_file_format_max_write(format_id, name);
1086
mutex_exit(&file_format_max.mutex);
1091
/********************************************************************//**
1092
Tags the system table space with minimum format id if it has not been
1094
WARNING: This function is only called during the startup and AFTER the
1095
redo log application during recovery has finished. */
1098
trx_sys_file_format_tag_init(void)
1099
/*==============================*/
1103
format_id = trx_sys_file_format_max_read();
1105
/* If format_id is not set then set it to the minimum. */
1106
if (format_id == ULINT_UNDEFINED) {
1107
trx_sys_file_format_max_set(DICT_TF_FORMAT_MIN, NULL);
1111
/********************************************************************//**
1112
Update the file format tag in the system tablespace only if the given
1113
format id is greater than the known max id.
1114
@return TRUE if format_id was bigger than the known max id */
1117
trx_sys_file_format_max_upgrade(
1118
/*============================*/
1119
const char** name, /*!< out: max file format name */
1120
ulint format_id) /*!< in: file format identifier */
1125
ut_a(file_format_max.name != NULL);
1126
ut_a(format_id <= DICT_TF_FORMAT_MAX);
1128
mutex_enter(&file_format_max.mutex);
1130
if (format_id > file_format_max.id) {
1132
ret = trx_sys_file_format_max_write(format_id, name);
1135
mutex_exit(&file_format_max.mutex);
1140
/*****************************************************************//**
1141
Get the name representation of the currently known max file format.
1142
@return pointer to the max format name */
1145
trx_sys_file_format_max_get(void)
1146
/*=============================*/
1148
return(file_format_max.name);
1151
/*****************************************************************//**
1152
Initializes the tablespace tag system. */
1155
trx_sys_file_format_init(void)
1156
/*==========================*/
1158
mutex_create(file_format_max_mutex_key,
1159
&file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
1161
/* We don't need a mutex here, as this function should only
1162
be called once at start up. */
1163
file_format_max.id = DICT_TF_FORMAT_MIN;
1165
file_format_max.name = trx_sys_file_format_id_to_name(
1166
file_format_max.id);
1169
/*****************************************************************//**
1170
Closes the tablespace tag system. */
1173
trx_sys_file_format_close(void)
1174
/*===========================*/
1176
/* Does nothing at the moment */
1179
/*********************************************************************
1180
Creates the rollback segments */
1183
trx_sys_create_rsegs(
1184
/*=================*/
1185
ulint n_rsegs) /*!< number of rollback segments to create */
1187
ulint new_rsegs = 0;
1189
/* Do not create additional rollback segments if
1190
innodb_force_recovery has been set and the database
1191
was not shutdown cleanly. */
1192
if (!srv_force_recovery && !recv_needed_recovery) {
1195
for (i = 0; i < n_rsegs; ++i) {
1197
if (trx_rseg_create() != NULL) {
1205
if (new_rsegs > 0) {
1207
"InnoDB: %lu rollback segment(s) active.\n",
1212
#else /* !UNIV_HOTBACKUP */
1214
/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
1215
(This code duplication should be fixed at some point!)
1218
#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
1219
/* The offset of the file format tag on the trx system header page */
1220
#define TRX_SYS_FILE_FORMAT_TAG (UNIV_PAGE_SIZE - 16)
1221
/* We use these random constants to reduce the probability of reading
1222
garbage (from previous versions) that maps to an actual format id. We
1223
use these as bit masks at the time of reading and writing from/to disk. */
1224
#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW 3645922177UL
1225
#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH 2745987765UL
1227
/* END OF COPIED DEFINITIONS */
1230
/*****************************************************************//**
1231
Reads the file format id from the first system table space file.
1232
Even if the call succeeds and returns TRUE, the returned format id
1233
may be ULINT_UNDEFINED signalling that the format id was not present
1235
@return TRUE if call succeeds */
1238
trx_sys_read_file_format_id(
1239
/*========================*/
1240
const char *pathname, /*!< in: pathname of the first system
1242
ulint *format_id) /*!< out: file format of the system table
1247
byte buf[UNIV_PAGE_SIZE * 2];
1248
page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
1250
ib_id_t file_format_id;
1252
*format_id = ULINT_UNDEFINED;
1254
file = os_file_create_simple_no_error_handling(
1255
innodb_file_data_key,
1262
/* The following call prints an error message */
1263
os_file_get_last_error(TRUE);
1265
ut_print_timestamp(stderr);
1268
" ibbackup: Error: trying to read system tablespace file format,\n"
1269
" ibbackup: but could not open the tablespace file %s!\n",
1275
/* Read the page on which file format is stored */
1277
success = os_file_read_no_error_handling(
1278
file, page, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0, UNIV_PAGE_SIZE
1281
/* The following call prints an error message */
1282
os_file_get_last_error(TRUE);
1284
ut_print_timestamp(stderr);
1287
" ibbackup: Error: trying to read system table space file format,\n"
1288
" ibbackup: but failed to read the tablespace file %s!\n",
1291
os_file_close(file);
1294
os_file_close(file);
1296
/* get the file format from the page */
1297
ptr = page + TRX_SYS_FILE_FORMAT_TAG;
1298
file_format_id = mach_read_from_8(ptr);
1299
file_format_id -= TRX_SYS_FILE_FORMAT_TAG_MAGIC_N;
1301
if (file_format_id >= FILE_FORMAT_NAME_N) {
1303
/* Either it has never been tagged, or garbage in it. */
1307
*format_id = (ulint) file_format_id;
1313
/*****************************************************************//**
1314
Reads the file format id from the given per-table data file.
The first page of the file is read into `page`, a UNIV_PAGE_SIZE-aligned
pointer into a stack buffer of 2 * UNIV_PAGE_SIZE bytes, and a flags word
is read from it with mach_read_from_4(). Three outcomes are visible
below: "file format is Antelope", "tablespace flags are ok" (low bit of
flags set; the id is bits 5..11 of flags) and "bad tablespace flags".
*format_id is set to ULINT_UNDEFINED before the file is opened.
NOTE(review): several lines of this function (return type, local
declarations, the success checks, the assignment of `ptr`, the first
branch condition and the return statements) are not visible in this copy.
1315
@return TRUE if call succeeds */
1318
trx_sys_read_pertable_file_format_id(
1319
/*=================================*/
1320
const char *pathname, /*!< in: pathname of a per-table data file */
1322
ulint *format_id) /*!< out: file format id of the per-table data
file; ULINT_UNDEFINED until one is determined */
1327
byte buf[UNIV_PAGE_SIZE * 2];
1328
/* Twice the page size is reserved so that a whole page still fits after
rounding the start of buf up to a UNIV_PAGE_SIZE boundary. */
page_t* page = ut_align(buf, UNIV_PAGE_SIZE);
1332
*format_id = ULINT_UNDEFINED;
1334
file = os_file_create_simple_no_error_handling(
1335
innodb_file_data_key,
1342
/* The following call prints an error message */
1343
os_file_get_last_error(TRUE);
1345
ut_print_timestamp(stderr);
1348
" ibbackup: Error: trying to read per-table tablespace format,\n"
1349
" ibbackup: but could not open the tablespace file %s!\n",
1355
/* Read the first page of the per-table datafile */
1357
success = os_file_read_no_error_handling(
1358
file, page, 0, 0, UNIV_PAGE_SIZE
1361
/* The following call prints an error message */
1362
os_file_get_last_error(TRUE);
1364
ut_print_timestamp(stderr);
1367
" ibbackup: Error: trying to read per-table data file format,\n"
1368
" ibbackup: but failed to read the tablespace file %s!\n",
1371
/* NOTE(review): two os_file_close() calls appear back to back in this
copy; presumably this first one sits on the read-failure path whose
surrounding lines are not visible here -- confirm against the full
file. */
os_file_close(file);
1374
/* The file handle is not used after this point; everything below works
on the in-memory copy of the page. */
os_file_close(file);
1376
/* get the file format from the page */
1378
flags = mach_read_from_4(ptr);
1380
/* file format is Antelope */
1383
} else if (flags & 1) {
1384
/* tablespace flags are ok */
1385
/* Dividing by 32 drops the five low bits and the modulo keeps the next
seven, i.e. the id is bits 5..11 of flags. */
*format_id = (flags / 32) % 128;
1388
/* bad tablespace flags */
1394
/*****************************************************************//**
1395
Get the name representation of the file format from its id.
Ids below FILE_FORMAT_NAME_N are looked up in file_format_name_map; any
other id takes the guarded branch at the top of the function.
NOTE(review): the return type, the braces and the body of that guarded
branch are not visible in this copy.
1396
@return pointer to the name */
1399
trx_sys_file_format_id_to_name(
1400
/*===========================*/
1401
const ulint id) /*!< in: id of the file format */
1403
/* Range check before the table lookup: the condition is true for every
id that is not a valid index below FILE_FORMAT_NAME_N. */
if (!(id < FILE_FORMAT_NAME_N)) {
1408
/* Presumably only reached with id < FILE_FORMAT_NAME_N, i.e. when the
branch above does not fall through -- confirm against the full file. */
return(file_format_name_map[id]);
1411
#endif /* !UNIV_HOTBACKUP */
1413
#ifndef UNIV_HOTBACKUP
1414
/*********************************************************************
1415
Shutdown/Close the transaction system.
Visible steps, in order: report read views that are still open, close the
dummy session, close the purge system, then, between
mutex_enter(&kernel_mutex) and mutex_exit(&kernel_mutex), free the
doublewrite structures, unlink and free every rollback segment, unlink
every read view and assert that the trx, rseg, view and mysql_trx lists
are all empty.
NOTE(review): the declaration line, the local declarations of `rseg` and
`view`, several closing braces and the lines between the final assertions
and mutex_exit() are not visible in this copy. */
1424
ut_ad(trx_sys != NULL);
1426
/* Check that all read views are closed except one, which is allowed to
remain: the test below is "> 1" and the reported count subtracts 1. */
1429
if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
1431
"InnoDB: Error: all read views were not closed"
1432
" before shutdown:\n"
1433
"InnoDB: %lu read views open \n",
1434
/* Plain C cast: this file is C, where static_cast<> does not exist.
NOTE(review): %lu assumes ulint has the width of unsigned long on the
target platform -- confirm. */
(ulint) UT_LIST_GET_LEN(trx_sys->view_list) - 1);
1437
sess_close(trx_dummy_sess);
1438
trx_dummy_sess = NULL;
1440
trx_purge_sys_close();
1442
mutex_enter(&kernel_mutex);
1444
/* Free the double write data structures. */
1445
ut_a(trx_doublewrite != NULL);
1446
ut_free(trx_doublewrite->write_buf_unaligned);
1447
trx_doublewrite->write_buf_unaligned = NULL;
1449
mem_free(trx_doublewrite->buf_block_arr);
1450
trx_doublewrite->buf_block_arr = NULL;
1452
/* The mutex lives inside the structure, so it is released before the
structure itself is freed. */
mutex_free(&trx_doublewrite->mutex);
1453
mem_free(trx_doublewrite);
1454
trx_doublewrite = NULL;
1456
/* There can't be any active transactions. */
1457
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
1459
while (rseg != NULL) {
1460
trx_rseg_t* prev_rseg = rseg;
1462
/* Step to the next element first: prev_rseg is unlinked and freed
below and must not be dereferenced afterwards. */
rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
1463
UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
1465
trx_rseg_mem_free(prev_rseg);
1468
view = UT_LIST_GET_FIRST(trx_sys->view_list);
1470
while (view != NULL) {
1471
read_view_t* prev_view = view;
1473
view = UT_LIST_GET_NEXT(view_list, prev_view);
1475
/* Views are allocated from the trx_sys->global_read_view_heap.
1476
So, we simply remove the element here. */
1477
UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
1480
/* Every list hanging off trx_sys must be empty by now. */
ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
1481
ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
1482
ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
1483
ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
1488
mutex_exit(&kernel_mutex);
1490
#endif /* !UNIV_HOTBACKUP */