/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing depends not only on how many
dirty pages we have in the buffer pool but is also a function of
how much redo the workload is generating and at what rate. */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex. Updated by buf_flush_stat_update(). */
static buf_flush_stat_t	buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint		buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t	buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t	buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;
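
/**********************************************************************
Illustrative sketch, not part of the original source: given the
declarations above, the windowed average that the flush-rate
heuristics below rely on can be read out directly. The helper name is
hypothetical.
@return average number of pages flushed per second by LRU flushes,
over the last BUF_FLUSH_STAT_N_INTERVAL seconds */
static
ulint
buf_flush_stat_window_avg_sketch(void)
/*==================================*/
{
	/* buf_flush_stat_sum holds the running sum of the last
	BUF_FLUSH_STAT_N_INTERVAL one-second samples, so dividing by
	the number of intervals yields the per-second average. */
	return(buf_flush_stat_sum.n_flushed / BUF_FLUSH_STAT_N_INTERVAL);
}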

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/*!< in/out: block which is modified */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
		  <= block->page.oldest_modification));

	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_ad(block->page.in_LRU_list);
	ut_ad(block->page.in_page_hash);
	ut_ad(!block->page.in_zip_hash);
	ut_ad(!block->page.in_flush_list);
	ut_d(block->page.in_flush_list = TRUE);
	UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/*!< in/out: block which is modified */
{
	buf_page_t*	prev_b;
	buf_page_t*	b;

	ut_ad(buf_pool_mutex_own());
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	ut_ad(block->page.in_LRU_list);
	ut_ad(block->page.in_page_hash);
	ut_ad(!block->page.in_zip_hash);
	ut_ad(!block->page.in_flush_list);
	ut_d(block->page.in_flush_list = TRUE);

	prev_b = NULL;
	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (b && b->oldest_modification > block->page.oldest_modification) {
		ut_ad(b->in_flush_list);
		prev_b = b;
		b = UT_LIST_GET_NEXT(list, b);
	}

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
	} else {
		UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
				     prev_b, &block->page);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
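
/********************************************************************//**
Illustrative sketch, not part of the original source: the ordering
invariant that both insert functions above maintain. The flush list is
kept in descending order of oldest_modification, so the list tail
always holds the oldest modification lsn. The helper name is
hypothetical.
@return	TRUE if the flush list is sorted in descending order */
static
ibool
buf_flush_list_is_sorted_sketch(void)
/*=================================*/
{
	buf_page_t*	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (bpage != NULL) {
		buf_page_t*	next = UT_LIST_GET_NEXT(list, bpage);

		if (next != NULL
		    && next->oldest_modification
		    > bpage->oldest_modification) {

			return(FALSE);
		}

		bpage = next;
	}

	return(TRUE);
}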

/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED allowed.
@return	TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
	buf_page_t*	bpage)	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) and in the LRU list */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_LRU_list);

	if (UNIV_LIKELY(buf_page_in_file(bpage))) {

		return(bpage->oldest_modification == 0
		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
		       && bpage->buf_fix_count == 0);
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		" InnoDB: Error: buffer block state %lu"
		" in the LRU list!\n",
		(ulong) buf_page_get_state(bpage));
	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
	putc('\n', stderr);

	return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return	TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
	buf_page_t*	bpage,	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) */
	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_a(buf_page_in_file(bpage));
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (bpage->oldest_modification != 0
	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
		ut_ad(bpage->in_flush_list);

		if (flush_type != BUF_FLUSH_LRU) {

			return(TRUE);

		} else if (bpage->buf_fix_count == 0) {

			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence
			not latched. */

			return(TRUE);
		}
	}

	return(FALSE);
}

/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
{
	ut_ad(buf_pool_mutex_own());
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_flush_list);
	ut_d(bpage->in_flush_list = FALSE);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* clean compressed pages should not be on the flush list */
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		return;
	case BUF_BLOCK_ZIP_DIRTY:
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		buf_LRU_insert_zip_clean(bpage);
		break;
	case BUF_BLOCK_FILE_PAGE:
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		break;
	}

	bpage->oldest_modification = 0;

	ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
			      ut_ad(ut_list_node_313->in_flush_list)));
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
{
	enum buf_flush	flush_type;

	ut_ad(bpage);

	buf_flush_remove(bpage);

	flush_type = buf_page_get_flush_type(bpage);
	buf_pool->n_flush[flush_type]--;

	if (flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(bpage);

		buf_pool->LRU_flush_ended++;
	}

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[flush_type]); */

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}
}

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	byte*	write_buf;
	ulint	len;
	ulint	len2;
	ulint	i;

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	for (i = 0; i < trx_doublewrite->first_free; i++) {

		const buf_block_t*	block;

		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
		    || block->page.zip.data) {
			/* No simple validate for compressed pages exists. */
			continue;
		}

		if (UNIV_UNLIKELY
		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
			    block->frame + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			    4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");
		}

		if (!block->check_index_page_at_flush) {
		} else if (page_is_comp(block->frame)) {
			if (UNIV_UNLIKELY
			    (!page_simple_validate_new(block->frame))) {
corrupted_page:
				buf_page_print(block->frame, 0);

				ut_print_timestamp(stderr);
				fprintf(stderr,
					" InnoDB: Apparent corruption of an"
					" index page n:o %lu in space %lu\n"
					"InnoDB: to be written to data file."
					" We intentionally crash server\n"
					"InnoDB: to prevent corrupt data"
					" from ending up in data\n"
					"InnoDB: files.\n",
					(ulong) buf_block_get_page_no(block),
					(ulong) buf_block_get_space(block));

				ut_error;
			}
		} else if (UNIV_UNLIKELY
			   (!page_simple_validate_old(block->frame))) {

			goto corrupted_page;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written += trx_doublewrite->first_free;
	srv_dblwr_writes++;

	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf;
	i = 0;

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block1, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2 + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			    4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		goto flush;
	}

	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		* UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf
		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block2, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    && UNIV_UNLIKELY
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    write_buf + len2 + (UNIV_PAGE_SIZE
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			    4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be"
				" written seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in"
				" the doublewrite block2.\n");
		}
	}

flush:
	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		ut_a(buf_page_in_file(&block->page));
		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			       FALSE, buf_page_get_space(&block->page),
			       buf_page_get_zip_size(&block->page),
			       buf_page_get_page_no(&block->page), 0,
			       buf_page_get_zip_size(&block->page),
			       (void*)block->page.zip.data,
			       (void*)block);

			/* Increment the counter of I/O operations used
			for selecting LRU policy. */
			buf_LRU_stat_inc_io();

			continue;
		}

		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
					 block->frame + (UNIV_PAGE_SIZE
					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
					 4))) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				" InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->page.buf_fix_count,
				(ulong)buf_block_get_io_fix(block),
				(ulong)buf_block_get_state(block));
		}

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_block_get_space(block), 0,
		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);

		/* Increment the counter of I/O operations used
		for selecting LRU policy. */
		buf_LRU_stat_inc_io();
	}

	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
	the OS */

	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}
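
/********************************************************************//**
Illustrative sketch, not part of the original source: the consistency
check that buf_flush_buffered_writes() repeats via memcmp() above,
spelled out. A page carries its newest modification lsn both in the
header and in the trailer, and the low 4 bytes of the two copies must
agree. The helper name is hypothetical.
@return	TRUE if the two lsn stamps on the page agree */
static
ibool
buf_flush_page_lsn_fields_match_sketch(
/*===================================*/
	const byte*	page)	/*!< in: uncompressed page frame */
{
	return(!memcmp(page + FIL_PAGE_LSN + 4,
		       page + UNIV_PAGE_SIZE
		       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4));
}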

/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
{
	ulint	zip_size;
try_again:
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(buf_page_in_file(bpage));

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		goto try_again;
	}

	zip_size = buf_page_get_zip_size(bpage);

	if (UNIV_UNLIKELY(zip_size)) {
		/* Copy the compressed page and clear the rest. */
		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       bpage->zip.data, zip_size);
		memset(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
	}

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

		return;
	}

	mutex_exit(&(trx_doublewrite->mutex));
}

#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
	byte*		page,		/*!< in/out: page */
	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
					to the page */
{
	ut_ad(page);

	if (page_zip_) {
		page_zip_des_t*	page_zip = page_zip_;
		ulint		zip_size = page_zip_get_size(page_zip);
		ut_ad(zip_size);
		ut_ad(ut_is_2pow(zip_size));
		ut_ad(zip_size <= UNIV_PAGE_SIZE);

		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
		case FIL_PAGE_TYPE_ALLOCATED:
		case FIL_PAGE_INODE:
		case FIL_PAGE_IBUF_BITMAP:
		case FIL_PAGE_TYPE_FSP_HDR:
		case FIL_PAGE_TYPE_XDES:
			/* These are essentially uncompressed pages. */
			memcpy(page_zip->data, page, zip_size);
			/* fall through */
		case FIL_PAGE_TYPE_ZBLOB:
		case FIL_PAGE_TYPE_ZBLOB2:
		case FIL_PAGE_INDEX:
			mach_write_ull(page_zip->data
				       + FIL_PAGE_LSN, newest_lsn);
			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
			mach_write_to_4(page_zip->data
					+ FIL_PAGE_SPACE_OR_CHKSUM,
					srv_use_checksums
					? page_zip_calc_checksum(
						page_zip->data, zip_size)
					: BUF_NO_CHECKSUM_MAGIC);
			return;
		}

		ut_print_timestamp(stderr);
		fputs(" InnoDB: ERROR: The compressed page to be written"
		      " seems corrupt:", stderr);
		ut_print_buf(stderr, page, zip_size);
		fputs("\nInnoDB: Possibly older version of the page:", stderr);
		ut_print_buf(stderr, page_zip->data, zip_size);
		putc('\n', stderr);
		ut_error;
	}

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
		       newest_lsn);

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			srv_use_checksums
			? buf_calc_page_new_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			srv_use_checksums
			? buf_calc_page_old_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);
}
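
/********************************************************************//**
Illustrative sketch, not part of the original source: the stamping
order implemented above, condensed, with checksums unconditionally
enabled. As the comment above notes, the old-formula checksum depends
on the FIL_PAGE_SPACE_OR_CHKSUM field, so it must be written only
after the new-formula checksum is in place. The helper name is
hypothetical. */
static
void
buf_flush_stamp_order_sketch(
/*=========================*/
	byte*		page,	/*!< in/out: uncompressed page */
	ib_uint64_t	lsn)	/*!< in: newest modification lsn */
{
	/* 1. lsn into the header and into the trailer */
	mach_write_ull(page + FIL_PAGE_LSN, lsn);
	mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
		       lsn);

	/* 2. new-formula checksum into the header */
	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			buf_calc_page_new_checksum(page));

	/* 3. old-formula checksum overwrites the first 4 trailer bytes;
	it must come last because it reads FIL_PAGE_SPACE_OR_CHKSUM */
	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			buf_calc_page_old_checksum(page));
}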

#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
{
	ulint	zip_size = buf_page_get_zip_size(bpage);
	page_t*	frame = NULL;
#ifdef UNIV_LOG_DEBUG
	static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

	ut_ad(buf_page_in_file(bpage));

	/* We are not holding buf_pool_mutex or block_mutex here.
	Nevertheless, it is safe to access bpage, because it is
	io_fixed and oldest_modification != 0. Thus, it cannot be
	relocated in the buffer pool or removed from flush_list or
	LRU_list. */
	ut_ad(!buf_pool_mutex_own());
	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
	ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
	ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",
		      stderr);
	}
#else
	/* Force the log to the disk before writing the modified block */
	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		break;
	case BUF_BLOCK_ZIP_DIRTY:
		frame = bpage->zip.data;
		if (UNIV_LIKELY(srv_use_checksums)) {
			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			     == page_zip_calc_checksum(frame, zip_size));
		}
		mach_write_ull(frame + FIL_PAGE_LSN,
			       bpage->newest_modification);
		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = bpage->zip.data;
		if (!frame) {
			frame = ((buf_block_t*) bpage)->frame;
		}

		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
					   bpage->zip.data
					   ? &bpage->zip : NULL,
					   bpage->newest_modification);
		break;
	}

	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_page_get_space(bpage), zip_size,
		       buf_page_get_page_no(bpage), 0,
		       zip_size ? zip_size : UNIV_PAGE_SIZE,
		       frame, bpage);
	} else {
		buf_flush_post_to_doublewrite_buf(bpage);
	}
}

/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
	buf_page_t*	bpage,		/*!< in: buffer control block */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	mutex_t*	block_mutex;
	ibool		is_uncompressed;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
	ut_ad(buf_pool_mutex_own());
	ut_ad(buf_page_in_file(bpage));

	block_mutex = buf_page_get_mutex(bpage);
	ut_ad(mutex_own(block_mutex));

	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

	buf_page_set_io_fix(bpage, BUF_IO_WRITE);

	buf_page_set_flush_type(bpage, flush_type);

	if (buf_pool->n_flush[flush_type] == 0) {

		os_event_reset(buf_pool->no_flush[flush_type]);
	}

	buf_pool->n_flush[flush_type]++;

	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
	ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));

	switch (flush_type) {
		ibool	is_s_latched;
	case BUF_FLUSH_LIST:
		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		is_s_latched = (bpage->buf_fix_count == 0);
		if (is_s_latched && is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		mutex_exit(block_mutex);
		buf_pool_mutex_exit();

		/* Even though bpage is not protected by any mutex at
		this point, it is safe to access bpage, because it is
		io_fixed and oldest_modification != 0. Thus, it
		cannot be relocated in the buffer pool or removed from
		flush_list or LRU_list. */

		if (!is_s_latched) {
			buf_flush_buffered_writes();

			if (is_uncompressed) {
				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
						   ->lock, BUF_IO_WRITE);
			}
		}

		break;

	case BUF_FLUSH_LRU:
		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because buf_flush_ready_for_flush() must hold,
		and that requires the page not to be bufferfixed. */

		if (is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(block_mutex);
		buf_pool_mutex_exit();
		break;

	default:
		ut_error;
	}

	/* Even though bpage is not protected by any mutex at this
	point, it is safe to access bpage, because it is io_fixed and
	oldest_modification != 0. Thus, it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Flushing %u space %u page %u\n",
			flush_type, bpage->space, bpage->offset);
	}
#endif /* UNIV_DEBUG */
	buf_flush_write_block_low(bpage);
}

/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return	number of pages flushed */
static
ulint
buf_flush_try_neighbors(
/*====================*/
	ulint		space,		/*!< in: space id */
	ulint		offset,		/*!< in: page offset */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST */
{
	buf_page_t*	bpage;
	ulint		low, high;
	ulint		count	= 0;
	ulint		i;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	} else {
		/* When flushed, dirty blocks are searched in neighborhoods of
		this size, and flushed along with the original page. */

		ulint	buf_flush_area	= ut_min(BUF_READ_AHEAD_AREA,
						 buf_pool->curr_size / 16);

		low = (offset / buf_flush_area) * buf_flush_area;
		high = (offset / buf_flush_area + 1) * buf_flush_area;
	}

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);
	}

	buf_pool_mutex_enter();

	for (i = low; i < high; i++) {

		bpage = buf_page_hash_get(space, i);

		if (!bpage) {

			continue;
		}

		ut_a(buf_page_in_file(bpage));

		/* We avoid flushing 'non-old' blocks in an LRU flush,
		because the flushed blocks are soon freed */

		if (flush_type != BUF_FLUSH_LRU
		    || i == offset
		    || buf_page_is_old(bpage)) {
			mutex_t* block_mutex = buf_page_get_mutex(bpage);

			mutex_enter(block_mutex);

			if (buf_flush_ready_for_flush(bpage, flush_type)
			    && (i == offset || !bpage->buf_fix_count)) {
				/* We only try to flush those
				neighbors != offset where the buf fix count is
				zero, as we then know that we probably can
				latch the page without a semaphore wait.
				Semaphore waits are expensive because we must
				flush the doublewrite buffer before we start
				waiting. */

				buf_flush_page(bpage, flush_type);
				ut_ad(!mutex_own(block_mutex));
				count++;

				buf_pool_mutex_enter();
			} else {
				mutex_exit(block_mutex);
			}
		}
	}

	buf_pool_mutex_exit();

	return(count);
}
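
/***********************************************************//**
Illustrative sketch, not part of the original source: the neighborhood
computation used by buf_flush_try_neighbors() above, isolated. For
example, with buf_flush_area = 64 and offset = 1234, it yields
low = 1216 and high = 1280, i.e. the aligned block of 64 pages that
contains the page. The helper name is hypothetical. */
static
void
buf_flush_area_bounds_sketch(
/*=========================*/
	ulint	offset,		/*!< in: page offset */
	ulint	buf_flush_area,	/*!< in: neighborhood size in pages */
	ulint*	low,		/*!< out: start of the area, inclusive */
	ulint*	high)		/*!< out: end of the area, exclusive */
{
	*low = (offset / buf_flush_area) * buf_flush_area;
	*high = *low + buf_flush_area;
}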

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
					latches on pages */
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case BUF_FLUSH_LIST all
					blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	buf_page_t*	bpage;
	ulint		page_count	= 0;
	ulint		old_page_count;
	ulint		space;
	ulint		offset;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	buf_pool_mutex_enter();

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit();

		return(ULINT_UNDEFINED);
	}

	buf_pool->init_flush[flush_type] = TRUE;

	bool done_with_loop = false;
	for (; done_with_loop != true;) {
flush_next:
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!bpage
			    || bpage->oldest_modification >= lsn_limit) {
				/* We have flushed enough */

				break;
			}
			ut_ad(bpage->in_flush_list);
		}

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		do {
			mutex_t* block_mutex = buf_page_get_mutex(bpage);
			ibool	 ready;

			ut_a(buf_page_in_file(bpage));

			mutex_enter(block_mutex);
			ready = buf_flush_ready_for_flush(bpage, flush_type);
			mutex_exit(block_mutex);

			if (ready) {
				space = buf_page_get_space(bpage);
				offset = buf_page_get_page_no(bpage);

				buf_pool_mutex_exit();

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				buf_pool_mutex_enter();
				goto flush_next;

			} else if (flush_type == BUF_FLUSH_LRU) {
				bpage = UT_LIST_GET_PREV(LRU, bpage);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				bpage = UT_LIST_GET_PREV(list, bpage);
				ut_ad(!bpage || bpage->in_flush_list);
			}
		} while (bpage != NULL);

		/* If we could not find anything to flush, leave the loop */

		done_with_loop = true;
	}

	buf_pool->init_flush[flush_type] = FALSE;

	if (buf_pool->n_flush[flush_type] == 0) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	buf_pool_mutex_exit();

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	/* We keep track of all flushes happening as part of LRU
	flush. When estimating the desired rate at which flush_list
	should be flushed we factor in this value. */
	if (flush_type == BUF_FLUSH_LRU) {
		buf_lru_flush_page_count += page_count;
	}

	return(page_count);
}
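
/*******************************************************************//**
Illustrative usage sketch, not part of the original source: how a
caller typically drives buf_flush_batch(). It asks for up to 100
flush-list flushes below a target lsn and, if a batch of the same type
was already running, waits for that batch to finish instead. The
helper name is hypothetical. */
static
void
buf_flush_batch_usage_sketch(
/*=========================*/
	ib_uint64_t	lsn_limit)	/*!< in: flush dirty pages whose
					oldest_modification is below this */
{
	ulint	n_flushed;

	n_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, lsn_limit);

	if (n_flushed == ULINT_UNDEFINED) {
		/* Another BUF_FLUSH_LIST batch was in progress */
		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
	}
}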

/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
	enum buf_flush	type)	/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

	os_event_wait(buf_pool->no_flush[type]);
}

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return	number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
{
	buf_page_t*	bpage;
	ulint		n_replaceable;
	ulint		distance	= 0;

	buf_pool_mutex_enter();

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	bpage = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((bpage != NULL)
	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
		   + BUF_FLUSH_EXTRA_MARGIN)
	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

		mutex_t* block_mutex = buf_page_get_mutex(bpage);

		mutex_enter(block_mutex);

		if (buf_flush_ready_for_replace(bpage)) {
			n_replaceable++;
		}

		mutex_exit(block_mutex);

		distance++;

		bpage = UT_LIST_GET_PREV(LRU, bpage);
	}

	buf_pool_mutex_exit();

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

		return(0);
	}

	return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
	       - n_replaceable);
}

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
	ulint	n_to_flush;
	ulint	n_flushed;

	n_to_flush = buf_flush_LRU_recommendation();

	if (n_to_flush > 0) {
		n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */

			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
		}
	}
}

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;

	lsn = log_get_lsn();
	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		and return. */
		buf_flush_stat_cur.redo = lsn;
		return;
	}

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		    - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
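
/*********************************************************************
Illustrative sketch, not part of the original source: a debug-style
check of the sliding-window invariant that buf_flush_stat_update()
maintains. After every update, the running sum equals the sum of the
ring-buffer entries, because each update adds the new sample and
subtracts the entry it overwrites. The helper name is hypothetical. */
static
void
buf_flush_stat_check_sums_sketch(void)
/*==================================*/
{
	ib_uint64_t	redo_sum	= 0;
	ulint		n_flushed_sum	= 0;
	ulint		i;

	for (i = 0; i < BUF_FLUSH_STAT_N_INTERVAL; i++) {
		redo_sum += buf_flush_stat_arr[i].redo;
		n_flushed_sum += buf_flush_stat_arr[i].n_flushed;
	}

	ut_a(redo_sum == buf_flush_stat_sum.redo);
	ut_a(n_flushed_sum == buf_flush_stat_sum.n_flushed);
}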

/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return	number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
	ulint		redo_avg;
	ulint		lru_flush_avg;
	ulint		n_dirty;
	ulint		n_flush_req;
	lint		rate;
	ib_uint64_t	lsn = log_get_lsn();
	ulint		log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get total number of dirty pages. It is OK to access
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

	/* An overflow can happen if we generate more than 2^32 bytes
	of redo in this interval i.e.: 4G of redo in 1 second. We can
	safely consider this as infinity because if we ever come close
	to 4G we'll start a synchronous flush of dirty pages. */
	/* redo_avg below is average at which redo is generated in
	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
	interval. */
	redo_avg = (ulint) (buf_flush_stat_sum.redo
			    / BUF_FLUSH_STAT_N_INTERVAL
			    + (lsn - buf_flush_stat_cur.redo));

	/* An overflow can happen possibly if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
	unlikely scenario. Even when this happens it means that our
	flush rate will be off the mark. It won't affect correctness
	of any subsystem. */
	/* lru_flush_avg below is rate at which pages are flushed as
	part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
	number of pages flushed in the current interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
			/ BUF_FLUSH_STAT_N_INTERVAL
			+ (buf_lru_flush_page_count
			   - buf_flush_stat_cur.n_flushed);

	n_flush_req = (n_dirty * redo_avg) / log_capacity;

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the
	LRU list */
	rate = n_flush_req - lru_flush_avg;
	return(rate > 0 ? (ulint) rate : 0);
}
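
/*********************************************************************
Illustrative worked example, not part of the original source: with
n_dirty = 10000 pages, redo_avg = 2097152 bytes (2 MB) of redo per
second, and log_capacity = 1073741824 bytes (1 GB), the heuristic
above requests n_flush_req = (10000 * 2097152) / 1073741824 = 19
pages/s; if LRU flushing already averages 5 pages/s, the desired
flush-list rate is 19 - 5 = 14 pages/s. The helper name is
hypothetical. */
static
ulint
buf_flush_rate_formula_sketch(
/*==========================*/
	ulint	n_dirty,	/*!< in: dirty pages in the buffer pool */
	ulint	redo_avg,	/*!< in: redo bytes generated per second */
	ulint	log_capacity,	/*!< in: redo log capacity in bytes */
	ulint	lru_flush_avg)	/*!< in: pages/s flushed by LRU flushes */
{
	lint	rate = (lint) ((n_dirty * redo_avg) / log_capacity)
		- (lint) lru_flush_avg;

	return(rate > 0 ? (ulint) rate : 0);
}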

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
static
ibool
buf_flush_validate_low(void)
/*========================*/
{
	buf_page_t*	bpage;

	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
			 ut_ad(ut_list_node_313->in_flush_list));

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (bpage != NULL) {
		const ib_uint64_t om = bpage->oldest_modification;
		ut_ad(bpage->in_flush_list);
		ut_a(buf_page_in_file(bpage));
		ut_a(om > 0);

		bpage = UT_LIST_GET_NEXT(list, bpage);

		ut_a(!bpage || om >= bpage->oldest_modification);
	}

	return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return	TRUE if ok */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
{
	ibool	ret;

	buf_pool_mutex_enter();

	ret = buf_flush_validate_low();

	buf_pool_mutex_exit();

	return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */