~drizzle-trunk/drizzle/development

« back to all changes in this revision

Viewing changes to storage/innobase/trx/trx0sys.c

  • Committer: brian
  • Date: 2008-06-25 05:29:13 UTC
  • Revision ID: brian@localhost.localdomain-20080625052913-6upwo0jsrl4lnapl
clean slate

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/******************************************************
 
2
Transaction system
 
3
 
 
4
(c) 1996 Innobase Oy
 
5
 
 
6
Created 3/26/1996 Heikki Tuuri
 
7
*******************************************************/
 
8
 
 
9
#include "trx0sys.h"
 
10
 
 
11
#ifdef UNIV_NONINL
 
12
#include "trx0sys.ic"
 
13
#endif
 
14
 
 
15
#include "fsp0fsp.h"
 
16
#include "mtr0mtr.h"
 
17
#include "trx0trx.h"
 
18
#include "trx0rseg.h"
 
19
#include "trx0undo.h"
 
20
#include "srv0srv.h"
 
21
#include "trx0purge.h"
 
22
#include "log0log.h"
 
23
#include "os0file.h"
 
24
 
 
25
/* The transaction system */
 
26
trx_sys_t*              trx_sys         = NULL;
 
27
trx_doublewrite_t*      trx_doublewrite = NULL;
 
28
 
 
29
/* The following is set to TRUE when we are upgrading from the old format data
 
30
files to the new >= 4.1.x format multiple tablespaces format data files */
 
31
 
 
32
ibool                   trx_doublewrite_must_reset_space_ids    = FALSE;
 
33
 
 
34
/* The following is TRUE when we are using the database in the new format,
 
35
i.e., we have successfully upgraded, or have created a new database
 
36
installation */
 
37
 
 
38
ibool                   trx_sys_multiple_tablespace_format      = FALSE;
 
39
 
 
40
/* In a MySQL replication slave, in crash recovery we store the master log
 
41
file name and position here. We have successfully got the updates to InnoDB
 
42
up to this position. If .._pos is -1, it means no crash recovery was needed,
 
43
or there was no master log position info inside InnoDB. */
 
44
 
 
45
char            trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
 
46
ib_longlong     trx_sys_mysql_master_log_pos    = -1;
 
47
 
 
48
/* If this MySQL server uses binary logging, after InnoDB has been inited
 
49
and if it has done a crash recovery, we store the binlog file name and position
 
50
here. If .._pos is -1, it means there was no binlog position info inside
 
51
InnoDB. */
 
52
 
 
53
char            trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
 
54
ib_longlong     trx_sys_mysql_bin_log_pos       = -1;
 
55
 
 
56
 
 
57
/********************************************************************
 
58
Determines if a page number is located inside the doublewrite buffer. */
 
59
 
 
60
ibool
 
61
trx_doublewrite_page_inside(
 
62
/*========================*/
 
63
                                /* out: TRUE if the location is inside
 
64
                                the two blocks of the doublewrite buffer */
 
65
        ulint   page_no)        /* in: page number */
 
66
{
 
67
        if (trx_doublewrite == NULL) {
 
68
 
 
69
                return(FALSE);
 
70
        }
 
71
 
 
72
        if (page_no >= trx_doublewrite->block1
 
73
            && page_no < trx_doublewrite->block1
 
74
            + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
75
                return(TRUE);
 
76
        }
 
77
 
 
78
        if (page_no >= trx_doublewrite->block2
 
79
            && page_no < trx_doublewrite->block2
 
80
            + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
81
                return(TRUE);
 
82
        }
 
83
 
 
84
        return(FALSE);
 
85
}
 
86
 
 
87
/********************************************************************
 
88
Creates or initialializes the doublewrite buffer at a database start. */
 
89
static
 
90
void
 
91
trx_doublewrite_init(
 
92
/*=================*/
 
93
        byte*   doublewrite)    /* in: pointer to the doublewrite buf
 
94
                                header on trx sys page */
 
95
{
 
96
        trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t));
 
97
 
 
98
        /* Since we now start to use the doublewrite buffer, no need to call
 
99
        fsync() after every write to a data file */
 
100
#ifdef UNIV_DO_FLUSH
 
101
        os_do_not_call_flush_at_each_write = TRUE;
 
102
#endif /* UNIV_DO_FLUSH */
 
103
 
 
104
        mutex_create(&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
 
105
 
 
106
        trx_doublewrite->first_free = 0;
 
107
 
 
108
        trx_doublewrite->block1 = mach_read_from_4(
 
109
                doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
 
110
        trx_doublewrite->block2 = mach_read_from_4(
 
111
                doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
 
112
        trx_doublewrite->write_buf_unaligned = ut_malloc(
 
113
                (1 + 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE);
 
114
 
 
115
        trx_doublewrite->write_buf = ut_align(
 
116
                trx_doublewrite->write_buf_unaligned, UNIV_PAGE_SIZE);
 
117
        trx_doublewrite->buf_block_arr = mem_alloc(
 
118
                2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * sizeof(void*));
 
119
}
 
120
 
 
121
/********************************************************************
 
122
Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
 
123
multiple tablespace format. */
 
124
 
 
125
void
 
126
trx_sys_mark_upgraded_to_multiple_tablespaces(void)
 
127
/*===============================================*/
 
128
{
 
129
        page_t* page;
 
130
        byte*   doublewrite;
 
131
        mtr_t   mtr;
 
132
 
 
133
        /* We upgraded to 4.1.x and reset the space id fields in the
 
134
        doublewrite buffer. Let us mark to the trx_sys header that the upgrade
 
135
        has been done. */
 
136
 
 
137
        mtr_start(&mtr);
 
138
 
 
139
        page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
 
140
#ifdef UNIV_SYNC_DEBUG
 
141
        buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK);
 
142
#endif /* UNIV_SYNC_DEBUG */
 
143
 
 
144
        doublewrite = page + TRX_SYS_DOUBLEWRITE;
 
145
 
 
146
        mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
 
147
                         TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
 
148
                         MLOG_4BYTES, &mtr);
 
149
        mtr_commit(&mtr);
 
150
 
 
151
        /* Flush the modified pages to disk and make a checkpoint */
 
152
        log_make_checkpoint_at(ut_dulint_max, TRUE);
 
153
 
 
154
        trx_sys_multiple_tablespace_format = TRUE;
 
155
}
 
156
 
 
157
/********************************************************************
 
158
Creates the doublewrite buffer to a new InnoDB installation. The header of the
 
159
doublewrite buffer is placed on the trx system header page. */
 
160
 
 
161
void
 
162
trx_sys_create_doublewrite_buf(void)
 
163
/*================================*/
 
164
{
 
165
        page_t* page;
 
166
        page_t* page2;
 
167
        page_t* new_page;
 
168
        byte*   doublewrite;
 
169
        byte*   fseg_header;
 
170
        ulint   page_no;
 
171
        ulint   prev_page_no;
 
172
        ulint   i;
 
173
        mtr_t   mtr;
 
174
 
 
175
        if (trx_doublewrite) {
 
176
                /* Already inited */
 
177
 
 
178
                return;
 
179
        }
 
180
 
 
181
start_again:
 
182
        mtr_start(&mtr);
 
183
 
 
184
        page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
 
185
#ifdef UNIV_SYNC_DEBUG
 
186
        buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK);
 
187
#endif /* UNIV_SYNC_DEBUG */
 
188
 
 
189
        doublewrite = page + TRX_SYS_DOUBLEWRITE;
 
190
 
 
191
        if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
 
192
            == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
 
193
                /* The doublewrite buffer has already been created:
 
194
                just read in some numbers */
 
195
 
 
196
                trx_doublewrite_init(doublewrite);
 
197
 
 
198
                mtr_commit(&mtr);
 
199
        } else {
 
200
                fprintf(stderr,
 
201
                        "InnoDB: Doublewrite buffer not found:"
 
202
                        " creating new\n");
 
203
 
 
204
                if (buf_pool_get_curr_size()
 
205
                    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
 
206
                        + FSP_EXTENT_SIZE / 2 + 100)
 
207
                       * UNIV_PAGE_SIZE)) {
 
208
                        fprintf(stderr,
 
209
                                "InnoDB: Cannot create doublewrite buffer:"
 
210
                                " you must\n"
 
211
                                "InnoDB: increase your buffer pool size.\n"
 
212
                                "InnoDB: Cannot continue operation.\n");
 
213
 
 
214
                        exit(1);
 
215
                }
 
216
 
 
217
                page2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
 
218
                                    TRX_SYS_DOUBLEWRITE
 
219
                                    + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
 
220
 
 
221
                /* fseg_create acquires a second latch on the page,
 
222
                therefore we must declare it: */
 
223
 
 
224
#ifdef UNIV_SYNC_DEBUG
 
225
                buf_page_dbg_add_level(page2, SYNC_NO_ORDER_CHECK);
 
226
#endif /* UNIV_SYNC_DEBUG */
 
227
 
 
228
                if (page2 == NULL) {
 
229
                        fprintf(stderr,
 
230
                                "InnoDB: Cannot create doublewrite buffer:"
 
231
                                " you must\n"
 
232
                                "InnoDB: increase your tablespace size.\n"
 
233
                                "InnoDB: Cannot continue operation.\n");
 
234
 
 
235
                        /* We exit without committing the mtr to prevent
 
236
                        its modifications to the database getting to disk */
 
237
 
 
238
                        exit(1);
 
239
                }
 
240
 
 
241
                fseg_header = page + TRX_SYS_DOUBLEWRITE
 
242
                        + TRX_SYS_DOUBLEWRITE_FSEG;
 
243
                prev_page_no = 0;
 
244
 
 
245
                for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
 
246
                             + FSP_EXTENT_SIZE / 2; i++) {
 
247
                        page_no = fseg_alloc_free_page(fseg_header,
 
248
                                                       prev_page_no + 1,
 
249
                                                       FSP_UP, &mtr);
 
250
                        if (page_no == FIL_NULL) {
 
251
                                fprintf(stderr,
 
252
                                        "InnoDB: Cannot create doublewrite"
 
253
                                        " buffer: you must\n"
 
254
                                        "InnoDB: increase your"
 
255
                                        " tablespace size.\n"
 
256
                                        "InnoDB: Cannot continue operation.\n"
 
257
                                        );
 
258
 
 
259
                                exit(1);
 
260
                        }
 
261
 
 
262
                        /* We read the allocated pages to the buffer pool;
 
263
                        when they are written to disk in a flush, the space
 
264
                        id and page number fields are also written to the
 
265
                        pages. When we at database startup read pages
 
266
                        from the doublewrite buffer, we know that if the
 
267
                        space id and page number in them are the same as
 
268
                        the page position in the tablespace, then the page
 
269
                        has not been written to in doublewrite. */
 
270
 
 
271
                        new_page = buf_page_get(TRX_SYS_SPACE, page_no,
 
272
                                                RW_X_LATCH, &mtr);
 
273
#ifdef UNIV_SYNC_DEBUG
 
274
                        buf_page_dbg_add_level(new_page, SYNC_NO_ORDER_CHECK);
 
275
#endif /* UNIV_SYNC_DEBUG */
 
276
 
 
277
                        /* Make a dummy change to the page to ensure it will
 
278
                        be written to disk in a flush */
 
279
 
 
280
                        mlog_write_ulint(new_page + FIL_PAGE_DATA,
 
281
                                         TRX_SYS_DOUBLEWRITE_MAGIC_N,
 
282
                                         MLOG_4BYTES, &mtr);
 
283
 
 
284
                        if (i == FSP_EXTENT_SIZE / 2) {
 
285
                                mlog_write_ulint(doublewrite
 
286
                                                 + TRX_SYS_DOUBLEWRITE_BLOCK1,
 
287
                                                 page_no, MLOG_4BYTES, &mtr);
 
288
                                mlog_write_ulint(doublewrite
 
289
                                                 + TRX_SYS_DOUBLEWRITE_REPEAT
 
290
                                                 + TRX_SYS_DOUBLEWRITE_BLOCK1,
 
291
                                                 page_no, MLOG_4BYTES, &mtr);
 
292
                        } else if (i == FSP_EXTENT_SIZE / 2
 
293
                                   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
294
                                mlog_write_ulint(doublewrite
 
295
                                                 + TRX_SYS_DOUBLEWRITE_BLOCK2,
 
296
                                                 page_no, MLOG_4BYTES, &mtr);
 
297
                                mlog_write_ulint(doublewrite
 
298
                                                 + TRX_SYS_DOUBLEWRITE_REPEAT
 
299
                                                 + TRX_SYS_DOUBLEWRITE_BLOCK2,
 
300
                                                 page_no, MLOG_4BYTES, &mtr);
 
301
                        } else if (i > FSP_EXTENT_SIZE / 2) {
 
302
                                ut_a(page_no == prev_page_no + 1);
 
303
                        }
 
304
 
 
305
                        prev_page_no = page_no;
 
306
                }
 
307
 
 
308
                mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
 
309
                                 TRX_SYS_DOUBLEWRITE_MAGIC_N,
 
310
                                 MLOG_4BYTES, &mtr);
 
311
                mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
 
312
                                 + TRX_SYS_DOUBLEWRITE_REPEAT,
 
313
                                 TRX_SYS_DOUBLEWRITE_MAGIC_N,
 
314
                                 MLOG_4BYTES, &mtr);
 
315
 
 
316
                mlog_write_ulint(doublewrite
 
317
                                 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
 
318
                                 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
 
319
                                 MLOG_4BYTES, &mtr);
 
320
                mtr_commit(&mtr);
 
321
 
 
322
                /* Flush the modified pages to disk and make a checkpoint */
 
323
                log_make_checkpoint_at(ut_dulint_max, TRUE);
 
324
 
 
325
                fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
 
326
 
 
327
                trx_sys_multiple_tablespace_format = TRUE;
 
328
 
 
329
                goto start_again;
 
330
        }
 
331
}
 
332
 
 
333
/********************************************************************
 
334
At a database startup initializes the doublewrite buffer memory structure if
 
335
we already have a doublewrite buffer created in the data files. If we are
 
336
upgrading to an InnoDB version which supports multiple tablespaces, then this
 
337
function performs the necessary update operations. If we are in a crash
 
338
recovery, this function uses a possible doublewrite buffer to restore
 
339
half-written pages in the data files. */
 
340
 
 
341
void
 
342
trx_sys_doublewrite_init_or_restore_pages(
 
343
/*======================================*/
 
344
        ibool   restore_corrupt_pages)
 
345
{
 
346
        byte*   buf;
 
347
        byte*   read_buf;
 
348
        byte*   unaligned_read_buf;
 
349
        ulint   block1;
 
350
        ulint   block2;
 
351
        ulint   source_page_no;
 
352
        byte*   page;
 
353
        byte*   doublewrite;
 
354
        ulint   space_id;
 
355
        ulint   page_no;
 
356
        ulint   i;
 
357
 
 
358
        /* We do the file i/o past the buffer pool */
 
359
 
 
360
        unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
 
361
        read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
 
362
 
 
363
        /* Read the trx sys header to check if we are using the doublewrite
 
364
        buffer */
 
365
 
 
366
        fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0,
 
367
               UNIV_PAGE_SIZE, read_buf, NULL);
 
368
        doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
 
369
 
 
370
        if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
 
371
            == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
 
372
                /* The doublewrite buffer has been created */
 
373
 
 
374
                trx_doublewrite_init(doublewrite);
 
375
 
 
376
                block1 = trx_doublewrite->block1;
 
377
                block2 = trx_doublewrite->block2;
 
378
 
 
379
                buf = trx_doublewrite->write_buf;
 
380
        } else {
 
381
                goto leave_func;
 
382
        }
 
383
 
 
384
        if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
 
385
            != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
 
386
 
 
387
                /* We are upgrading from a version < 4.1.x to a version where
 
388
                multiple tablespaces are supported. We must reset the space id
 
389
                field in the pages in the doublewrite buffer because starting
 
390
                from this version the space id is stored to
 
391
                FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
 
392
 
 
393
                trx_doublewrite_must_reset_space_ids = TRUE;
 
394
 
 
395
                fprintf(stderr,
 
396
                        "InnoDB: Resetting space id's in the"
 
397
                        " doublewrite buffer\n");
 
398
        } else {
 
399
                trx_sys_multiple_tablespace_format = TRUE;
 
400
        }
 
401
 
 
402
        /* Read the pages from the doublewrite buffer to memory */
 
403
 
 
404
        fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0,
 
405
               TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
 
406
               buf, NULL);
 
407
        fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block2, 0,
 
408
               TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
 
409
               buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
 
410
               NULL);
 
411
        /* Check if any of these pages is half-written in data files, in the
 
412
        intended position */
 
413
 
 
414
        page = buf;
 
415
 
 
416
        for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
 
417
 
 
418
                page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
 
419
 
 
420
                if (trx_doublewrite_must_reset_space_ids) {
 
421
 
 
422
                        space_id = 0;
 
423
                        mach_write_to_4(page
 
424
                                        + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
 
425
                        /* We do not need to calculate new checksums for the
 
426
                        pages because the field .._SPACE_ID does not affect
 
427
                        them. Write the page back to where we read it from. */
 
428
 
 
429
                        if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
 
430
                                source_page_no = block1 + i;
 
431
                        } else {
 
432
                                source_page_no = block2
 
433
                                        + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
 
434
                        }
 
435
 
 
436
                        fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0,
 
437
                               UNIV_PAGE_SIZE, page, NULL);
 
438
                        /* printf("Resetting space id in page %lu\n",
 
439
                        source_page_no); */
 
440
                } else {
 
441
                        space_id = mach_read_from_4(
 
442
                                page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 
443
                }
 
444
 
 
445
                if (!restore_corrupt_pages) {
 
446
                        /* The database was shut down gracefully: no need to
 
447
                        restore pages */
 
448
 
 
449
                } else if (!fil_tablespace_exists_in_mem(space_id)) {
 
450
                        /* Maybe we have dropped the single-table tablespace
 
451
                        and this page once belonged to it: do nothing */
 
452
 
 
453
                } else if (!fil_check_adress_in_tablespace(space_id,
 
454
                                                           page_no)) {
 
455
                        fprintf(stderr,
 
456
                                "InnoDB: Warning: a page in the"
 
457
                                " doublewrite buffer is not within space\n"
 
458
                                "InnoDB: bounds; space id %lu"
 
459
                                " page number %lu, page %lu in"
 
460
                                " doublewrite buf.\n",
 
461
                                (ulong) space_id, (ulong) page_no, (ulong) i);
 
462
 
 
463
                } else if (space_id == TRX_SYS_SPACE
 
464
                           && ((page_no >= block1
 
465
                                && page_no
 
466
                                < block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
 
467
                               || (page_no >= block2
 
468
                                   && page_no
 
469
                                   < (block2
 
470
                                      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
 
471
 
 
472
                        /* It is an unwritten doublewrite buffer page:
 
473
                        do nothing */
 
474
                } else {
 
475
                        /* Read in the actual page from the data files */
 
476
 
 
477
                        fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0,
 
478
                               UNIV_PAGE_SIZE, read_buf, NULL);
 
479
                        /* Check if the page is corrupt */
 
480
 
 
481
                        if (buf_page_is_corrupted(read_buf)) {
 
482
 
 
483
                                fprintf(stderr,
 
484
                                        "InnoDB: Warning: database page"
 
485
                                        " corruption or a failed\n"
 
486
                                        "InnoDB: file read of page %lu.\n",
 
487
                                        (ulong) page_no);
 
488
                                fprintf(stderr,
 
489
                                        "InnoDB: Trying to recover it from"
 
490
                                        " the doublewrite buffer.\n");
 
491
 
 
492
                                if (buf_page_is_corrupted(page)) {
 
493
                                        fprintf(stderr,
 
494
                                                "InnoDB: Dump of the page:\n");
 
495
                                        buf_page_print(read_buf);
 
496
                                        fprintf(stderr,
 
497
                                                "InnoDB: Dump of"
 
498
                                                " corresponding page"
 
499
                                                " in doublewrite buffer:\n");
 
500
                                        buf_page_print(page);
 
501
 
 
502
                                        fprintf(stderr,
 
503
                                                "InnoDB: Also the page in the"
 
504
                                                " doublewrite buffer"
 
505
                                                " is corrupt.\n"
 
506
                                                "InnoDB: Cannot continue"
 
507
                                                " operation.\n"
 
508
                                                "InnoDB: You can try to"
 
509
                                                " recover the database"
 
510
                                                " with the my.cnf\n"
 
511
                                                "InnoDB: option:\n"
 
512
                                                "InnoDB: set-variable="
 
513
                                                "innodb_force_recovery=6\n");
 
514
                                        exit(1);
 
515
                                }
 
516
 
 
517
                                /* Write the good page from the
 
518
                                doublewrite buffer to the intended
 
519
                                position */
 
520
 
 
521
                                fil_io(OS_FILE_WRITE, TRUE, space_id,
 
522
                                       page_no, 0,
 
523
                                       UNIV_PAGE_SIZE, page, NULL);
 
524
                                fprintf(stderr,
 
525
                                        "InnoDB: Recovered the page from"
 
526
                                        " the doublewrite buffer.\n");
 
527
                        }
 
528
                }
 
529
 
 
530
                page += UNIV_PAGE_SIZE;
 
531
        }
 
532
 
 
533
        fil_flush_file_spaces(FIL_TABLESPACE);
 
534
 
 
535
leave_func:
 
536
        ut_free(unaligned_read_buf);
 
537
}
 
538
 
 
539
/********************************************************************
 
540
Checks that trx is in the trx list. */
 
541
 
 
542
ibool
 
543
trx_in_trx_list(
 
544
/*============*/
 
545
                        /* out: TRUE if is in */
 
546
        trx_t*  in_trx) /* in: trx */
 
547
{
 
548
        trx_t*  trx;
 
549
 
 
550
        ut_ad(mutex_own(&(kernel_mutex)));
 
551
 
 
552
        trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
 
553
 
 
554
        while (trx != NULL) {
 
555
 
 
556
                if (trx == in_trx) {
 
557
 
 
558
                        return(TRUE);
 
559
                }
 
560
 
 
561
                trx = UT_LIST_GET_NEXT(trx_list, trx);
 
562
        }
 
563
 
 
564
        return(FALSE);
 
565
}
 
566
 
 
567
/*********************************************************************
 
568
Writes the value of max_trx_id to the file based trx system header. */
 
569
 
 
570
void
 
571
trx_sys_flush_max_trx_id(void)
 
572
/*==========================*/
 
573
{
 
574
        trx_sysf_t*     sys_header;
 
575
        mtr_t           mtr;
 
576
 
 
577
        ut_ad(mutex_own(&kernel_mutex));
 
578
 
 
579
        mtr_start(&mtr);
 
580
 
 
581
        sys_header = trx_sysf_get(&mtr);
 
582
 
 
583
        mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
 
584
                          trx_sys->max_trx_id, &mtr);
 
585
        mtr_commit(&mtr);
 
586
}
 
587
 
 
588
/*********************************************************************
 
589
Updates the offset information about the end of the MySQL binlog entry
 
590
which corresponds to the transaction just being committed. In a MySQL
 
591
replication slave updates the latest master binlog position up to which
 
592
replication has proceeded. */
 
593
 
 
594
void
 
595
trx_sys_update_mysql_binlog_offset(
 
596
/*===============================*/
 
597
        const char*     file_name,/* in: MySQL log file name */
 
598
        ib_longlong     offset, /* in: position in that log file */
 
599
        ulint           field,  /* in: offset of the MySQL log info field in
 
600
                                the trx sys header */
 
601
        mtr_t*          mtr)    /* in: mtr */
 
602
{
 
603
        trx_sysf_t*     sys_header;
 
604
 
 
605
        if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
 
606
 
 
607
                /* We cannot fit the name to the 512 bytes we have reserved */
 
608
 
 
609
                return;
 
610
        }
 
611
 
 
612
        sys_header = trx_sysf_get(mtr);
 
613
 
 
614
        if (mach_read_from_4(sys_header + field
 
615
                             + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
 
616
            != TRX_SYS_MYSQL_LOG_MAGIC_N) {
 
617
 
 
618
                mlog_write_ulint(sys_header + field
 
619
                                 + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
 
620
                                 TRX_SYS_MYSQL_LOG_MAGIC_N,
 
621
                                 MLOG_4BYTES, mtr);
 
622
        }
 
623
 
 
624
        if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
 
625
                        file_name)) {
 
626
 
 
627
                mlog_write_string(sys_header + field
 
628
                                  + TRX_SYS_MYSQL_LOG_NAME,
 
629
                                  (byte*) file_name, 1 + ut_strlen(file_name),
 
630
                                  mtr);
 
631
        }
 
632
 
 
633
        if (mach_read_from_4(sys_header + field
 
634
                             + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
 
635
            || (offset >> 32) > 0) {
 
636
 
 
637
                mlog_write_ulint(sys_header + field
 
638
                                 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
 
639
                                 (ulint)(offset >> 32),
 
640
                                 MLOG_4BYTES, mtr);
 
641
        }
 
642
 
 
643
        mlog_write_ulint(sys_header + field
 
644
                         + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
 
645
                         (ulint)(offset & 0xFFFFFFFFUL),
 
646
                         MLOG_4BYTES, mtr);
 
647
}
 
648
 
 
649
#ifdef UNIV_HOTBACKUP
 
650
/*********************************************************************
 
651
Prints to stderr the MySQL binlog info in the system header if the
 
652
magic number shows it valid. */
 
653
 
 
654
void
 
655
trx_sys_print_mysql_binlog_offset_from_page(
 
656
/*========================================*/
 
657
        byte*   page)   /* in: buffer containing the trx system header page,
 
658
                        i.e., page number TRX_SYS_PAGE_NO in the tablespace */
 
659
{
 
660
        trx_sysf_t*     sys_header;
 
661
 
 
662
        sys_header = page + TRX_SYS;
 
663
 
 
664
        if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
 
665
                             + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
 
666
            == TRX_SYS_MYSQL_LOG_MAGIC_N) {
 
667
 
 
668
                fprintf(stderr,
 
669
                        "ibbackup: Last MySQL binlog file position %lu %lu,"
 
670
                        " file name %s\n",
 
671
                        (ulong) mach_read_from_4(
 
672
                                sys_header + TRX_SYS_MYSQL_LOG_INFO
 
673
                                + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
 
674
                        (ulong) mach_read_from_4(
 
675
                                sys_header + TRX_SYS_MYSQL_LOG_INFO
 
676
                                + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
 
677
                        sys_header + TRX_SYS_MYSQL_LOG_INFO
 
678
                        + TRX_SYS_MYSQL_LOG_NAME);
 
679
        }
 
680
}
 
681
#endif /* UNIV_HOTBACKUP */
 
682
 
 
683
/*********************************************************************
 
684
Stores the MySQL binlog offset info in the trx system header if
 
685
the magic number shows it valid, and print the info to stderr */
 
686
 
 
687
void
 
688
trx_sys_print_mysql_binlog_offset(void)
 
689
/*===================================*/
 
690
{
 
691
        trx_sysf_t*     sys_header;
 
692
        mtr_t           mtr;
 
693
        ulint           trx_sys_mysql_bin_log_pos_high;
 
694
        ulint           trx_sys_mysql_bin_log_pos_low;
 
695
 
 
696
        mtr_start(&mtr);
 
697
 
 
698
        sys_header = trx_sysf_get(&mtr);
 
699
 
 
700
        if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
 
701
                             + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
 
702
            != TRX_SYS_MYSQL_LOG_MAGIC_N) {
 
703
 
 
704
                mtr_commit(&mtr);
 
705
 
 
706
                return;
 
707
        }
 
708
 
 
709
        trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
 
710
                sys_header + TRX_SYS_MYSQL_LOG_INFO
 
711
                + TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
 
712
        trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
 
713
                sys_header + TRX_SYS_MYSQL_LOG_INFO
 
714
                + TRX_SYS_MYSQL_LOG_OFFSET_LOW);
 
715
 
 
716
        trx_sys_mysql_bin_log_pos
 
717
                = (((ib_longlong)trx_sys_mysql_bin_log_pos_high) << 32)
 
718
                + (ib_longlong)trx_sys_mysql_bin_log_pos_low;
 
719
 
 
720
        ut_memcpy(trx_sys_mysql_bin_log_name,
 
721
                  sys_header + TRX_SYS_MYSQL_LOG_INFO
 
722
                  + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
 
723
 
 
724
        fprintf(stderr,
 
725
                "InnoDB: Last MySQL binlog file position %lu %lu,"
 
726
                " file name %s\n",
 
727
                trx_sys_mysql_bin_log_pos_high, trx_sys_mysql_bin_log_pos_low,
 
728
                trx_sys_mysql_bin_log_name);
 
729
 
 
730
        mtr_commit(&mtr);
 
731
}
 
732
 
 
733
/*********************************************************************
 
734
Prints to stderr the MySQL master log offset info in the trx system header if
 
735
the magic number shows it valid. */
 
736
 
 
737
void
 
738
trx_sys_print_mysql_master_log_pos(void)
 
739
/*====================================*/
 
740
{
 
741
        trx_sysf_t*     sys_header;
 
742
        mtr_t           mtr;
 
743
 
 
744
        mtr_start(&mtr);
 
745
 
 
746
        sys_header = trx_sysf_get(&mtr);
 
747
 
 
748
        if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
749
                             + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
 
750
            != TRX_SYS_MYSQL_LOG_MAGIC_N) {
 
751
 
 
752
                mtr_commit(&mtr);
 
753
 
 
754
                return;
 
755
        }
 
756
 
 
757
        fprintf(stderr,
 
758
                "InnoDB: In a MySQL replication slave the last"
 
759
                " master binlog file\n"
 
760
                "InnoDB: position %lu %lu, file name %s\n",
 
761
                (ulong) mach_read_from_4(sys_header
 
762
                                         + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
763
                                         + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
 
764
                (ulong) mach_read_from_4(sys_header
 
765
                                         + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
766
                                         + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
 
767
                sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
768
                + TRX_SYS_MYSQL_LOG_NAME);
 
769
        /* Copy the master log position info to global variables we can
 
770
        use in ha_innobase.cc to initialize glob_mi to right values */
 
771
 
 
772
        ut_memcpy(trx_sys_mysql_master_log_name,
 
773
                  sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
774
                  + TRX_SYS_MYSQL_LOG_NAME,
 
775
                  TRX_SYS_MYSQL_LOG_NAME_LEN);
 
776
 
 
777
        trx_sys_mysql_master_log_pos
 
778
                = (((ib_longlong) mach_read_from_4(
 
779
                            sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
780
                            + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
 
781
                + ((ib_longlong) mach_read_from_4(
 
782
                           sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
 
783
                           + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
 
784
        mtr_commit(&mtr);
 
785
}
 
786
 
 
787
/********************************************************************
 
788
Looks for a free slot for a rollback segment in the trx system file copy. */
 
789
 
 
790
ulint
 
791
trx_sysf_rseg_find_free(
 
792
/*====================*/
 
793
                        /* out: slot index or ULINT_UNDEFINED if not found */
 
794
        mtr_t*  mtr)    /* in: mtr */
 
795
{
 
796
        trx_sysf_t*     sys_header;
 
797
        ulint           page_no;
 
798
        ulint           i;
 
799
 
 
800
        ut_ad(mutex_own(&(kernel_mutex)));
 
801
 
 
802
        sys_header = trx_sysf_get(mtr);
 
803
 
 
804
        for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
 
805
 
 
806
                page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);
 
807
 
 
808
                if (page_no == FIL_NULL) {
 
809
 
 
810
                        return(i);
 
811
                }
 
812
        }
 
813
 
 
814
        return(ULINT_UNDEFINED);
 
815
}
 
816
 
 
817
/*********************************************************************
 
818
Creates the file page for the transaction system. This function is called only
 
819
at the database creation, before trx_sys_init. */
 
820
static
 
821
void
 
822
trx_sysf_create(
 
823
/*============*/
 
824
        mtr_t*  mtr)    /* in: mtr */
 
825
{
 
826
        trx_sysf_t*     sys_header;
 
827
        ulint           slot_no;
 
828
        page_t*         page;
 
829
        ulint           page_no;
 
830
        ulint           i;
 
831
 
 
832
        ut_ad(mtr);
 
833
 
 
834
        /* Note that below we first reserve the file space x-latch, and
 
835
        then enter the kernel: we must do it in this order to conform
 
836
        to the latching order rules. */
 
837
 
 
838
        mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE), mtr);
 
839
        mutex_enter(&kernel_mutex);
 
840
 
 
841
        /* Create the trx sys file block in a new allocated file segment */
 
842
        page = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
 
843
                           mtr);
 
844
        ut_a(buf_frame_get_page_no(page) == TRX_SYS_PAGE_NO);
 
845
 
 
846
#ifdef UNIV_SYNC_DEBUG
 
847
        buf_page_dbg_add_level(page, SYNC_TRX_SYS_HEADER);
 
848
#endif /* UNIV_SYNC_DEBUG */
 
849
 
 
850
        mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
 
851
                         MLOG_2BYTES, mtr);
 
852
 
 
853
        /* Reset the doublewrite buffer magic number to zero so that we
 
854
        know that the doublewrite buffer has not yet been created (this
 
855
        suppresses a Valgrind warning) */
 
856
 
 
857
        mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
 
858
                         + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
 
859
 
 
860
        sys_header = trx_sysf_get(mtr);
 
861
 
 
862
        /* Start counting transaction ids from number 1 up */
 
863
        mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
 
864
                          ut_dulint_create(0, 1), mtr);
 
865
 
 
866
        /* Reset the rollback segment slots */
 
867
        for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
 
868
 
 
869
                trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
 
870
                trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
 
871
        }
 
872
 
 
873
        /* The remaining area (up to the page trailer) is uninitialized.
 
874
        Silence Valgrind warnings about it. */
 
875
        UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
 
876
                                     + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
 
877
                                     + TRX_SYS_RSEG_SPACE),
 
878
                       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
 
879
                        - (TRX_SYS_RSEGS
 
880
                           + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
 
881
                           + TRX_SYS_RSEG_SPACE))
 
882
                       + page - sys_header);
 
883
 
 
884
        /* Create the first rollback segment in the SYSTEM tablespace */
 
885
        page_no = trx_rseg_header_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no,
 
886
                                         mtr);
 
887
        ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
 
888
        ut_a(page_no != FIL_NULL);
 
889
 
 
890
        mutex_exit(&kernel_mutex);
 
891
}
 
892
 
 
893
/*********************************************************************
 
894
Creates and initializes the central memory structures for the transaction
 
895
system. This is called when the database is started. */
 
896
 
 
897
void
 
898
trx_sys_init_at_db_start(void)
 
899
/*==========================*/
 
900
{
 
901
        trx_sysf_t*     sys_header;
 
902
        ib_longlong     rows_to_undo    = 0;
 
903
        const char*     unit            = "";
 
904
        trx_t*          trx;
 
905
        mtr_t           mtr;
 
906
 
 
907
        mtr_start(&mtr);
 
908
 
 
909
        ut_ad(trx_sys == NULL);
 
910
 
 
911
        mutex_enter(&kernel_mutex);
 
912
 
 
913
        trx_sys = mem_alloc(sizeof(trx_sys_t));
 
914
 
 
915
        sys_header = trx_sysf_get(&mtr);
 
916
 
 
917
        trx_rseg_list_and_array_init(sys_header, &mtr);
 
918
 
 
919
        trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
 
920
 
 
921
        /* VERY important: after the database is started, max_trx_id value is
 
922
        divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
 
923
        trx_sys_get_new_trx_id will evaluate to TRUE when the function
 
924
        is first time called, and the value for trx id will be written
 
925
        to the disk-based header! Thus trx id values will not overlap when
 
926
        the database is repeatedly started! */
 
927
 
 
928
        trx_sys->max_trx_id = ut_dulint_add(
 
929
                ut_dulint_align_up(mtr_read_dulint(
 
930
                                           sys_header
 
931
                                           + TRX_SYS_TRX_ID_STORE, &mtr),
 
932
                                   TRX_SYS_TRX_ID_WRITE_MARGIN),
 
933
                2 * TRX_SYS_TRX_ID_WRITE_MARGIN);
 
934
 
 
935
        UT_LIST_INIT(trx_sys->mysql_trx_list);
 
936
        trx_lists_init_at_db_start();
 
937
 
 
938
        if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
 
939
                trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
 
940
 
 
941
                for (;;) {
 
942
 
 
943
                        if ( trx->conc_state != TRX_PREPARED) {
 
944
                                rows_to_undo += ut_conv_dulint_to_longlong(
 
945
                                        trx->undo_no);
 
946
                        }
 
947
 
 
948
                        trx = UT_LIST_GET_NEXT(trx_list, trx);
 
949
 
 
950
                        if (!trx) {
 
951
                                break;
 
952
                        }
 
953
                }
 
954
 
 
955
                if (rows_to_undo > 1000000000) {
 
956
                        unit = "M";
 
957
                        rows_to_undo = rows_to_undo / 1000000;
 
958
                }
 
959
 
 
960
                fprintf(stderr,
 
961
                        "InnoDB: %lu transaction(s) which must be"
 
962
                        " rolled back or cleaned up\n"
 
963
                        "InnoDB: in total %lu%s row operations to undo\n",
 
964
                        (ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
 
965
                        (ulong) rows_to_undo, unit);
 
966
 
 
967
                fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n",
 
968
                        (ulong) ut_dulint_get_high(trx_sys->max_trx_id),
 
969
                        (ulong) ut_dulint_get_low(trx_sys->max_trx_id));
 
970
        }
 
971
 
 
972
        UT_LIST_INIT(trx_sys->view_list);
 
973
 
 
974
        trx_purge_sys_create();
 
975
 
 
976
        mutex_exit(&kernel_mutex);
 
977
 
 
978
        mtr_commit(&mtr);
 
979
}
 
980
 
 
981
/*********************************************************************
 
982
Creates and initializes the transaction system at the database creation. */
 
983
 
 
984
void
 
985
trx_sys_create(void)
 
986
/*================*/
 
987
{
 
988
        mtr_t   mtr;
 
989
 
 
990
        mtr_start(&mtr);
 
991
 
 
992
        trx_sysf_create(&mtr);
 
993
 
 
994
        mtr_commit(&mtr);
 
995
 
 
996
        trx_sys_init_at_db_start();
 
997
}