/*****************************************************************************

Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/
#ifndef UNIV_HOTBACKUP
#include "page0page.h"
#include "ibuf0ibuf.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex. Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;
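
/* Taken together, these variables implement a sliding window over the
last BUF_FLUSH_STAT_N_INTERVAL one-second samples: buf_flush_stat_cur
records the redo LSN and LRU flush count at the start of the current
interval, buf_flush_stat_arr[] is the ring buffer of completed samples
indexed by buf_flush_stat_arr_ind, and buf_flush_stat_sum keeps the
running total over the window so that buf_flush_get_desired_flush_rate()
can compute averages without rescanning the array. */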

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list. */
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool);	/*!< in: Buffer pool instance */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/******************************************************************//**
Inserts a block into the flush_rbt and returns a pointer to its
predecessor or NULL if no predecessor. The ordering is maintained
on the basis of the <oldest_modification, space, offset> key.
@return pointer to the predecessor or NULL if no predecessor. */
buf_flush_insert_in_flush_rbt(
/*==========================*/
	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
	const ib_rbt_node_t*	c_node;
	const ib_rbt_node_t*	p_node;
	buf_page_t*		prev = NULL;
	buf_pool_t*		buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	/* Insert this buffer into the rbt. */
	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
	ut_a(c_node != NULL);

	/* Get the predecessor. */
	p_node = rbt_prev(buf_pool->flush_rbt, c_node);

	if (p_node != NULL) {
		value = rbt_value(buf_page_t*, p_node);

/*********************************************************//**
Delete a bpage from the flush_rbt. */
buf_flush_delete_from_flush_rbt(
/*============================*/
	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
#endif /* UNIV_DEBUG */
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_flush_list_mutex_own(buf_pool));

#endif /* UNIV_DEBUG */
	rbt_delete(buf_pool->flush_rbt, &bpage);

/*****************************************************************//**
Compare two modified blocks in the buffer pool. The key for comparison
key = <oldest_modification, space, offset>
This comparison is used to maintain ordering of blocks in the
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
	const void*	p1,	/*!< in: block1 */
	const void*	p2)	/*!< in: block2 */
	const buf_page_t*	b1 = *(const buf_page_t**) p1;
	const buf_page_t*	b2 = *(const buf_page_t**) p2;
	buf_pool_t*		buf_pool = buf_pool_from_bpage(b1);
#endif /* UNIV_DEBUG */

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	ut_ad(b1->in_flush_list);
	ut_ad(b2->in_flush_list);

	if (b2->oldest_modification > b1->oldest_modification) {
	} else if (b2->oldest_modification < b1->oldest_modification) {

	/* If oldest_modification is the same then decide on the space. */
	ret = (int)(b2->space - b1->space);

	/* Or else decide ordering on the offset field. */
	return(ret ? ret : (int)(b2->offset - b1->offset));
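
/* With this key, ties on oldest_modification are broken first by the
tablespace id and then by the page offset: for example, two blocks with
equal oldest_modification in the same space compare so that the block
with the smaller page offset is the smaller key. */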

/********************************************************************//**
Initialize the red-black tree to speed up insertions into the flush_list
during the recovery process. Should be called at the start of the recovery
process before any page has been read/written. */
buf_flush_init_flush_rbt(void)
/*==========================*/
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

		/* Create red black tree for speedy insertions in flush list. */
		buf_pool->flush_rbt = rbt_create(
			sizeof(buf_page_t*), buf_flush_block_cmp);

		buf_flush_list_mutex_exit(buf_pool);
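
/* Elsewhere in this file a non-NULL buf_pool->flush_rbt is used as the
signal that recovery is in progress: buf_flush_insert_into_flush_list()
diverts to the sorted insert, buf_flush_relocate_on_flush_list() swaps
control blocks in the tree, and buf_flush_validate_low() cross-checks
the flush_list against the tree. The tree is created here and freed by
buf_flush_free_flush_rbt() once recovery no longer needs it. */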

/********************************************************************//**
Frees up the red-black tree. */
buf_flush_free_flush_rbt(void)
/*==========================*/
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

#ifdef UNIV_DEBUG_VALGRIND
		ulint	zip_size = buf_block_get_zip_size(block);

		if (UNIV_UNLIKELY(zip_size)) {
			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
		UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
#endif /* UNIV_DEBUG_VALGRIND */

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		rbt_free(buf_pool->flush_rbt);
		buf_pool->flush_rbt = NULL;

		buf_flush_list_mutex_exit(buf_pool);

/********************************************************************//**
Inserts a modified block into the flush list. */
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	buf_block_t*	block,		/*!< in/out: block which is modified */
	ib_uint64_t	lsn)		/*!< in: oldest modification */
	ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(log_flush_order_mutex_own());
	ut_ad(mutex_own(&block->mutex));

	buf_flush_list_mutex_enter(buf_pool);

	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification

	/* If we are in recovery then we need to update the flush
	red-black tree as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_list_mutex_exit(buf_pool);
		buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);

	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_ad(!block->page.in_flush_list);

	ut_d(block->page.in_flush_list = TRUE);
	block->page.oldest_modification = lsn;
	UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
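
	/* The flush_list is kept sorted in descending order of
	oldest_modification, so a block whose oldest_modification is at
	least as new as that of the current list head (checked by the
	assertion above) can simply be added at the front;
	buf_flush_validate_low() asserts exactly this ordering. */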

#ifdef UNIV_DEBUG_VALGRIND
	ulint	zip_size = buf_block_get_zip_size(block);

	if (UNIV_UNLIKELY(zip_size)) {
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
#endif /* UNIV_DEBUG_VALGRIND */

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_block_t*	block,		/*!< in/out: block which is modified */
	ib_uint64_t	lsn)		/*!< in: oldest modification */
	ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(log_flush_order_mutex_own());
	ut_ad(mutex_own(&block->mutex));
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	buf_flush_list_mutex_enter(buf_pool);

	/* The field in_LRU_list is protected by buf_pool_mutex, which
	we are not holding. However, while a block is in the flush
	list, it is dirty and cannot be discarded, not from the
	page_hash or from the LRU list. At most, the uncompressed
	page frame of a compressed block may be discarded or created
	(copying the block->page to or from a buf_page_t that is
	dynamically allocated from buf_buddy_alloc()). Because those
	transitions hold block->mutex and the flush list mutex (via
	buf_flush_relocate_on_flush_list()), there is no possibility
	of a race condition in the assertions below. */
	ut_ad(block->page.in_LRU_list);
	ut_ad(block->page.in_page_hash);
	/* buf_buddy_block_register() will take a block in the
	BUF_BLOCK_MEMORY state, not a file page. */
	ut_ad(!block->page.in_zip_hash);

	ut_ad(!block->page.in_flush_list);
	ut_d(block->page.in_flush_list = TRUE);
	block->page.oldest_modification = lsn;

#ifdef UNIV_DEBUG_VALGRIND
	ulint	zip_size = buf_block_get_zip_size(block);

	if (UNIV_UNLIKELY(zip_size)) {
		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
	UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
#endif /* UNIV_DEBUG_VALGRIND */

	/* For the most part when this function is called the flush_rbt
	should not be NULL. In a very rare boundary case it is possible
	that the flush_rbt has already been freed by the recovery thread
	before the last page was hooked up in the flush_list by the
	io-handler thread. In that case we'll just do a simple
	linear search in the else block. */
	if (buf_pool->flush_rbt) {
		prev_b = buf_flush_insert_in_flush_rbt(&block->page);

	b = UT_LIST_GET_FIRST(buf_pool->flush_list);

	while (b && b->oldest_modification
	       > block->page.oldest_modification) {
		ut_ad(b->in_flush_list);
		b = UT_LIST_GET_NEXT(list, b);

	if (prev_b == NULL) {
		UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

	UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
			     prev_b, &block->page);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);

/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
@return TRUE if can replace immediately */
buf_flush_ready_for_replace(
/*========================*/
	buf_page_t*	bpage)	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) and in the LRU list */
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_LRU_list);

	if (UNIV_LIKELY(buf_page_in_file(bpage))) {
		return(bpage->oldest_modification == 0
		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
		       && bpage->buf_fix_count == 0);

	ut_print_timestamp(stderr);
		" InnoDB: Error: buffer block state %lu"
		" in the LRU list!\n",
		(ulong) buf_page_get_state(bpage));
	ut_print_buf(stderr, bpage, sizeof(buf_page_t));

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return TRUE if can flush immediately */
buf_flush_ready_for_flush(
/*======================*/
	buf_page_t*	bpage,	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) */
	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_a(buf_page_in_file(bpage));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (bpage->oldest_modification != 0
	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
		ut_ad(bpage->in_flush_list);

		if (flush_type != BUF_FLUSH_LRU) {

		} else if (bpage->buf_fix_count == 0) {
			/* If we are flushing the LRU list, to avoid deadlocks
			we require the block not to be bufferfixed, and hence

/********************************************************************//**
Remove a block from the flush list of modified blocks. */
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(bpage->in_flush_list);

	buf_flush_list_mutex_enter(buf_pool);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		/* Clean compressed pages should not be on the flush list */
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
	case BUF_BLOCK_ZIP_DIRTY:
		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
		buf_LRU_insert_zip_clean(bpage);
	case BUF_BLOCK_FILE_PAGE:
		UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);

	/* If the flush_rbt is active then delete from there as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);

	/* Must be done after we have removed it from the flush_rbt
	because we assert on in_flush_list in the comparison function. */
	ut_d(bpage->in_flush_list = FALSE);

	bpage->oldest_modification = 0;

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);

/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage have already been
IMPORTANT: When this function is called bpage and dpage are not
exact copies of each other. For example, they both will have different
::state. Also the ::list pointers in dpage may be stale. We need to
use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to dpage and the flush list manipulation
buf_flush_relocate_on_flush_list(
/*=============================*/
	buf_page_t*	bpage,	/*!< in/out: control block being moved */
	buf_page_t*	dpage)	/*!< in/out: destination block */
	buf_page_t*	prev_b = NULL;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(buf_pool));
	/* Must reside in the same buffer pool. */
	ut_ad(buf_pool == buf_pool_from_bpage(dpage));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));

	buf_flush_list_mutex_enter(buf_pool);

	/* FIXME: At this point we have both buf_pool and flush_list
	mutexes. Theoretically removal of a block from flush list is
	only covered by flush_list mutex but currently we do
	have buf_pool mutex in buf_flush_remove() therefore this block
	is guaranteed to be in the flush list. We need to check if
	this will work without the assumption of block removing code
	having the buf_pool mutex. */
	ut_ad(bpage->in_flush_list);
	ut_ad(dpage->in_flush_list);

	/* If recovery is active we must swap the control blocks in
	the flush_rbt as well. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		buf_flush_delete_from_flush_rbt(bpage);
		prev_b = buf_flush_insert_in_flush_rbt(dpage);

	/* Must be done after we have removed it from the flush_rbt
	because we assert on in_flush_list in the comparison function. */
	ut_d(bpage->in_flush_list = FALSE);

	prev = UT_LIST_GET_PREV(list, bpage);
	UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);

		ut_ad(prev->in_flush_list);
		UT_LIST_INSERT_AFTER(
			buf_pool->flush_list,
		buf_pool->flush_list,

	/* Just an extra check. Previous in flush_list
	should be the same control block as in flush_rbt. */
	ut_a(!buf_pool->flush_rbt || prev_b == prev);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_flush_list_mutex_exit(buf_pool);
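
/* During recovery the relocation therefore touches two structures:
the control block is swapped in the flush_rbt (bpage deleted, dpage
inserted), and dpage is then linked into the flush_list at bpage's old
position using bpage's still-valid list pointers, as explained in the
header comment above. The final ut_a() confirms that the list
predecessor agrees with the predecessor returned by the flush_rbt. */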

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
buf_flush_write_complete(
/*=====================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
	enum buf_flush	flush_type;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	buf_flush_remove(bpage);

	flush_type = buf_page_get_flush_type(bpage);
	buf_pool->n_flush[flush_type]--;

	if (flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */
		buf_LRU_make_block_old(bpage);

		buf_pool->LRU_flush_ended++;

	/* fprintf(stderr, "n pending flush %lu\n",
	buf_pool->n_flush[flush_type]); */

	if (buf_pool->n_flush[flush_type] == 0
	    && buf_pool->init_flush[flush_type] == FALSE) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);

/********************************************************************//**
Flush a batch of writes to the datafiles that have already been
written by the OS. */
buf_flush_sync_datafiles(void)
/*==========================*/
	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */
	os_aio_simulated_wake_handler_threads();

	/* Wait until all async writes to tablespaces have been posted to
	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */
	fil_flush_file_spaces(FIL_TABLESPACE);

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
buf_flush_buffered_writes(void)
/*===========================*/
	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		/* Sync the writes to the disk. */
		buf_flush_sync_datafiles();

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the

	if (trx_doublewrite->first_free == 0) {
		mutex_exit(&(trx_doublewrite->mutex));

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		const buf_block_t*	block;

		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
		    || block->page.zip.data) {
			/* No simple validate for compressed pages exists. */

		(memcmp(block->frame + (FIL_PAGE_LSN + 4),
			block->frame + (UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			ut_print_timestamp(stderr);
				" InnoDB: ERROR: The page to be written"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");

		if (!block->check_index_page_at_flush) {
		} else if (page_is_comp(block->frame)) {
			(!page_simple_validate_new(block->frame))) {
				buf_page_print(block->frame, 0);

				ut_print_timestamp(stderr);
					" InnoDB: Apparent corruption of an"
					" index page n:o %lu in space %lu\n"
					"InnoDB: to be written to data file."
					" We intentionally crash server\n"
					"InnoDB: to prevent corrupt data"
					" from ending up in data\n"
					(ulong) buf_block_get_page_no(block),
					(ulong) buf_block_get_space(block));

		} else if (UNIV_UNLIKELY
			   (!page_simple_validate_old(block->frame))) {

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written += trx_doublewrite->first_free;

	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

	write_buf = trx_doublewrite->write_buf;

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block1, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t*	block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
				" InnoDB: ERROR: The page to be written"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");

	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {

	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)

	write_buf = trx_doublewrite->write_buf
		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

	fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
	       trx_doublewrite->block2, 0, len,
	       (void*) write_buf, NULL);

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
	     len2 += UNIV_PAGE_SIZE, i++) {
		const buf_block_t*	block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		if (UNIV_LIKELY(!block->page.zip.data)
		    && UNIV_LIKELY(buf_block_get_state(block)
				   == BUF_BLOCK_FILE_PAGE)
		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
			    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
			ut_print_timestamp(stderr);
				" InnoDB: ERROR: The page to be"
				" written seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" the doublewrite block2.\n");

	/* Now flush the doublewrite buffer data to disk */
	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		const buf_block_t* block = (buf_block_t*)
			trx_doublewrite->buf_block_arr[i];

		ut_a(buf_page_in_file(&block->page));
		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			       FALSE, buf_page_get_space(&block->page),
			       buf_page_get_zip_size(&block->page),
			       buf_page_get_page_no(&block->page), 0,
			       buf_page_get_zip_size(&block->page),
			       (void*)block->page.zip.data,

			/* Increment the counter of I/O operations used
			for selecting LRU policy. */
			buf_LRU_stat_inc_io();

		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
			ut_print_timestamp(stderr);
				" InnoDB: ERROR: The page to be written"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->page.buf_fix_count,
				(ulong)buf_block_get_io_fix(block),
				(ulong)buf_block_get_state(block));

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_block_get_space(block), 0,
		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);

		/* Increment the counter of I/O operations used
		for selecting LRU policy. */
		buf_LRU_stat_inc_io();

	/* Sync the writes to the disk. */
	buf_flush_sync_datafiles();

	/* We can now reuse the doublewrite memory buffer: */
	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
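
/* The write ordering visible above is what protects against torn page
writes: the batch is first copied into trx_doublewrite->write_buf,
written synchronously to the two doublewrite blocks in the system
tablespace and flushed with fil_flush(TRX_SYS_SPACE), and only then are
the pages written to their real locations and the datafiles synced by
buf_flush_sync_datafiles(). If the server crashes in between, recovery
can restore a page from the doublewrite blocks, as the comment before
the final loop notes. */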

/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
	mutex_enter(&(trx_doublewrite->mutex));

	ut_a(buf_page_in_file(bpage));

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

	zip_size = buf_page_get_zip_size(bpage);

	if (UNIV_UNLIKELY(zip_size)) {
		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
		/* Copy the compressed page and clear the rest. */
		memcpy(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
		       bpage->zip.data, zip_size);
		memset(trx_doublewrite->write_buf
		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);

	ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
	UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
	memcpy(trx_doublewrite->write_buf
	       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
	       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();

	mutex_exit(&(trx_doublewrite->mutex));
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
buf_flush_init_for_writing(
/*=======================*/
	byte*		page,		/*!< in/out: page */
	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
	page_zip_des_t*	page_zip = page_zip_;
	ulint		zip_size = page_zip_get_size(page_zip);

	ut_ad(ut_is_2pow(zip_size));
	ut_ad(zip_size <= UNIV_PAGE_SIZE);

	switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
	case FIL_PAGE_TYPE_ALLOCATED:
	case FIL_PAGE_INODE:
	case FIL_PAGE_IBUF_BITMAP:
	case FIL_PAGE_TYPE_FSP_HDR:
	case FIL_PAGE_TYPE_XDES:
		/* These are essentially uncompressed pages. */
		memcpy(page_zip->data, page, zip_size);
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
	case FIL_PAGE_INDEX:
		mach_write_to_8(page_zip->data
				+ FIL_PAGE_LSN, newest_lsn);
		memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
		mach_write_to_4(page_zip->data
				+ FIL_PAGE_SPACE_OR_CHKSUM,
				? page_zip_calc_checksum(
					page_zip->data, zip_size)
				: BUF_NO_CHECKSUM_MAGIC);

	ut_print_timestamp(stderr);
	fputs(" InnoDB: ERROR: The compressed page to be written"
	      " seems corrupt:", stderr);
	ut_print_buf(stderr, page, zip_size);
	fputs("\nInnoDB: Possibly older version of the page:", stderr);
	ut_print_buf(stderr, page_zip->data, zip_size);

	/* Write the newest modification lsn to the page header and trailer */
	mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

	mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,

	/* Store the new formula checksum */

	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
			? buf_calc_page_new_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);

	/* We overwrite the first 4 bytes of the end lsn field to store
	the old formula checksum. Since it depends also on the field
	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
	new formula checksum. */

	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
			? buf_calc_page_old_checksum(page)
			: BUF_NO_CHECKSUM_MAGIC);
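
/* Note the resulting on-disk layout: newest_lsn is written both at
FIL_PAGE_LSN in the header and into the 8-byte trailer field, and the
first 4 bytes of that trailer are then overwritten by the old-formula
checksum, so only the low 4 bytes of the LSN remain at the end of the
page. This is why the corruption checks in buf_flush_buffered_writes()
compare FIL_PAGE_LSN + 4 against
UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4. */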

#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
buf_flush_write_block_low(
/*======================*/
	buf_page_t*	bpage)	/*!< in: buffer block to write */
	ulint	zip_size = buf_page_get_zip_size(bpage);
	page_t*	frame = NULL;

	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(!buf_pool_mutex_own(buf_pool));

#ifdef UNIV_LOG_DEBUG
	static ibool	univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

	ut_ad(buf_page_in_file(bpage));

	/* We are not holding buf_pool_mutex or block_mutex here.
	Nevertheless, it is safe to access bpage, because it is
	io_fixed and oldest_modification != 0. Thus, it cannot be
	relocated in the buffer pool or removed from flush_list or
	ut_ad(!buf_pool_mutex_own(buf_pool));
	ut_ad(!buf_flush_list_mutex_own(buf_pool));
	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
	ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
	ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
	if (!univ_log_debug_warned) {
		univ_log_debug_warned = TRUE;
		fputs("Warning: cannot force log to disk if"
		      " UNIV_LOG_DEBUG is defined!\n"
		      "Crash recovery will not work!\n",

	/* Force the log to the disk before writing the modified block */
	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_FREE:
	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
	case BUF_BLOCK_ZIP_DIRTY:
		frame = bpage->zip.data;
		if (UNIV_LIKELY(srv_use_checksums)) {
			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			     == page_zip_calc_checksum(frame, zip_size));
		mach_write_to_8(frame + FIL_PAGE_LSN,
				bpage->newest_modification);
		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
	case BUF_BLOCK_FILE_PAGE:
		frame = bpage->zip.data;
		frame = ((buf_block_t*) bpage)->frame;

		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
					   ? &bpage->zip : NULL,
					   bpage->newest_modification);

	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, buf_page_get_space(bpage), zip_size,
		       buf_page_get_page_no(bpage), 0,
		       zip_size ? zip_size : UNIV_PAGE_SIZE,

	buf_flush_post_to_doublewrite_buf(bpage);

/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_page_t*	bpage,		/*!< in: buffer control block */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	mutex_t*	block_mutex;
	ibool		is_uncompressed;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(buf_page_in_file(bpage));

	block_mutex = buf_page_get_mutex(bpage);
	ut_ad(mutex_own(block_mutex));

	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

	buf_page_set_io_fix(bpage, BUF_IO_WRITE);

	buf_page_set_flush_type(bpage, flush_type);

	if (buf_pool->n_flush[flush_type] == 0) {

		os_event_reset(buf_pool->no_flush[flush_type]);

	buf_pool->n_flush[flush_type]++;

	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
	ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));

	switch (flush_type) {
	case BUF_FLUSH_LIST:
		/* If the simulated aio thread is not running, we must
		not wait for any latch, as we may end up in a deadlock:
		if buf_fix_count == 0, then we know we need not wait */

		is_s_latched = (bpage->buf_fix_count == 0);
		if (is_s_latched && is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,

		mutex_exit(block_mutex);
		buf_pool_mutex_exit(buf_pool);

		/* Even though bpage is not protected by any mutex at
		this point, it is safe to access bpage, because it is
		io_fixed and oldest_modification != 0. Thus, it
		cannot be relocated in the buffer pool or removed from
		flush_list or LRU_list. */

		if (!is_s_latched) {
			buf_flush_buffered_writes();

			if (is_uncompressed) {
				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
						   ->lock, BUF_IO_WRITE);

		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because buf_flush_ready_for_flush() must hold,
		and that requires the page not to be bufferfixed. */

		if (is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired

		mutex_exit(block_mutex);
		buf_pool_mutex_exit(buf_pool);

	/* Even though bpage is not protected by any mutex at this
	point, it is safe to access bpage, because it is io_fixed and
	oldest_modification != 0. Thus, it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */

	if (buf_debug_prints) {
		"Flushing %u space %u page %u\n",
		flush_type, bpage->space, bpage->offset);
#endif /* UNIV_DEBUG */
	buf_flush_write_block_low(bpage);
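
/* The two branches above use different latching strategies: for a
BUF_FLUSH_LIST flush the page is s-latched immediately only when
buf_fix_count == 0; otherwise the doublewrite buffer is flushed first
via buf_flush_buffered_writes() and the s-latch is taken afterwards.
For a BUF_FLUSH_LRU flush the s-latch can always be taken without
waiting, because buf_flush_ready_for_flush() has already guaranteed
that the block is not buffer-fixed, and the latch is acquired before
the buf_pool mutex is released. */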

/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return number of pages flushed */
buf_flush_try_neighbors(
/*====================*/
	ulint		space,		/*!< in: space id */
	ulint		offset,		/*!< in: page offset */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
	ulint		n_flushed,	/*!< in: number of pages
					flushed so far in this batch */
	ulint		n_to_flush)	/*!< in: maximum number of pages
					we are allowed to flush */
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush
		any block except from the end of the LRU list */

	/* When flushed, dirty blocks are searched in
	neighborhoods of this size, and flushed along with the
	ulint	buf_flush_area;

	buf_flush_area = ut_min(
		BUF_READ_AHEAD_AREA(buf_pool),
		buf_pool->curr_size / 16);

	low = (offset / buf_flush_area) * buf_flush_area;
	high = (offset / buf_flush_area + 1) * buf_flush_area;
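
	/* For example, with buf_flush_area == 64 a victim at page
	offset 200 gives low == 192 and high == 256, so the pages
	considered for flushing together are those in the 64-page
	block that contains the victim. */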

	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

	if (high > fil_space_get_size(space)) {
		high = fil_space_get_size(space);

	for (i = low; i < high; i++) {

		if ((count + n_flushed) >= n_to_flush) {

			/* We have already flushed enough pages and
			should call it a day. There is, however, one
			exception. If the page whose neighbors we
			are flushing has not been flushed yet then
			we'll try to flush the victim that we
			selected originally. */

		buf_pool = buf_pool_get(space, i);

		buf_pool_mutex_enter(buf_pool);

		/* We only want to flush pages from this buffer pool. */
		bpage = buf_page_hash_get(buf_pool, space, i);

			buf_pool_mutex_exit(buf_pool);

		ut_a(buf_page_in_file(bpage));

		/* We avoid flushing 'non-old' blocks in an LRU flush,
		because the flushed blocks are soon freed */

		if (flush_type != BUF_FLUSH_LRU
		    || buf_page_is_old(bpage)) {
			mutex_t* block_mutex = buf_page_get_mutex(bpage);

			mutex_enter(block_mutex);

			if (buf_flush_ready_for_flush(bpage, flush_type)
			    && (i == offset || !bpage->buf_fix_count)) {
				/* We only try to flush those
				neighbors != offset where the buf fix
				count is zero, as we then know that we
				probably can latch the page without a
				semaphore wait. Semaphore waits are
				expensive because we must flush the
				doublewrite buffer before we start

				buf_flush_page(buf_pool, bpage, flush_type);
				ut_ad(!mutex_own(block_mutex));
				ut_ad(!buf_pool_mutex_own(buf_pool));

			mutex_exit(block_mutex);

		buf_pool_mutex_exit(buf_pool);

/********************************************************************//**
Check if the block is modified and ready for flushing. If the block
is ready to flush then flush the page and try to flush its neighbors.
@return TRUE if buf_pool mutex was not released during this function.
This does not guarantee that some pages were written as well.
The number of pages written is added to the count. */
buf_flush_page_and_try_neighbors(
/*=============================*/
	buf_page_t*	bpage,		/*!< in: buffer control block,
					buf_page_in_file(bpage) */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	ulint		n_to_flush,	/*!< in: number of pages to
	ulint*		count)		/*!< in/out: number of pages
	mutex_t*	block_mutex;
	ibool		flushed = FALSE;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
#endif /* UNIV_DEBUG */

	ut_ad(buf_pool_mutex_own(buf_pool));

	block_mutex = buf_page_get_mutex(bpage);
	mutex_enter(block_mutex);

	ut_a(buf_page_in_file(bpage));

	if (buf_flush_ready_for_flush(bpage, flush_type)) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_bpage(bpage);

		buf_pool_mutex_exit(buf_pool);

		/* These fields are protected by both the
		buffer pool mutex and block mutex. */
		space = buf_page_get_space(bpage);
		offset = buf_page_get_page_no(bpage);

		mutex_exit(block_mutex);

		/* Try to flush also all the neighbors */
		*count += buf_flush_try_neighbors(space,

		buf_pool_mutex_enter(buf_pool);

	mutex_exit(block_mutex);

	ut_ad(buf_pool_mutex_own(buf_pool));

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
In the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it
cannot end up waiting for these latches!
@return number of blocks for which the write request was queued. */
buf_flush_LRU_list_batch(
/*=====================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		max)		/*!< in: max of blocks to flush */
	ut_ad(buf_pool_mutex_own(buf_pool));

		/* Start from the end of the list looking for a
		suitable block to be flushed. */
		bpage = UT_LIST_GET_LAST(buf_pool->LRU);

		/* Iterate backwards over the flush list till we find
		a page that isn't ready for flushing. */
		while (bpage != NULL
		       && !buf_flush_page_and_try_neighbors(
				bpage, BUF_FLUSH_LRU, max, &count)) {

			bpage = UT_LIST_GET_PREV(LRU, bpage);

	} while (bpage != NULL && count < max);

	/* We keep track of all flushes happening as part of LRU
	flush. When estimating the desired rate at which flush_list
	should be flushed, we factor in this value. */
	buf_lru_flush_page_count += count;

	ut_ad(buf_pool_mutex_own(buf_pool));

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already
buf_flush_flush_list_batch(
/*=======================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n,		/*!< in: wished minimum number
					of blocks flushed (it is not
					guaranteed that the actual
					number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< all blocks whose
					oldest_modification is smaller
					than this should be flushed (if
					their number does not exceed
	ut_ad(buf_pool_mutex_own(buf_pool));

		/* If we have flushed enough, leave the loop */

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		buf_flush_list_mutex_enter(buf_pool);

		/* We use len here because theoretically insertions can
		happen in the flush_list below while we are traversing
		it for a suitable candidate for flushing. We'd like to
		set a limit on how far we are willing to traverse

		len = UT_LIST_GET_LEN(buf_pool->flush_list);
		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);

			ut_a(bpage->oldest_modification > 0);

		if (!bpage || bpage->oldest_modification >= lsn_limit) {

			/* We have flushed enough */
			buf_flush_list_mutex_exit(buf_pool);

		ut_a(bpage->oldest_modification > 0);

		ut_ad(bpage->in_flush_list);

		buf_flush_list_mutex_exit(buf_pool);

		/* The list may change during the flushing and we cannot
		safely preserve within this function a pointer to a
		block in the list! */
		while (bpage != NULL
		       && !buf_flush_page_and_try_neighbors(
				bpage, BUF_FLUSH_LIST, min_n, &count)) {

			buf_flush_list_mutex_enter(buf_pool);

			/* If we are here that means that buf_pool->mutex
			was not released in buf_flush_page_and_try_neighbors()
			above and this guarantees that bpage didn't get
			relocated since we released the flush_list
			mutex above. There is a chance, however, that
			the bpage got removed from flush_list (not
			currently possible because flush_list_remove()
			also obtains buf_pool mutex but that may change
			in the future). To avoid this scenario we check
			the oldest_modification and if it is zero
			we start all over again. */
			if (bpage->oldest_modification == 0) {

				buf_flush_list_mutex_exit(buf_pool);

			bpage = UT_LIST_GET_PREV(list, bpage);

			ut_ad(!bpage || bpage->in_flush_list);

			buf_flush_list_mutex_exit(buf_pool);

	} while (count < min_n && bpage != NULL && len > 0);

	ut_ad(buf_pool_mutex_own(buf_pool));

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
					all blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */

	buf_pool_mutex_enter(buf_pool);

	/* Note: The buffer pool mutex is released and reacquired within
	the flush functions. */
	switch (flush_type) {
		count = buf_flush_LRU_list_batch(buf_pool, min_n);
	case BUF_FLUSH_LIST:
		count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);

	buf_pool_mutex_exit(buf_pool);

	buf_flush_buffered_writes();

	if (buf_debug_prints && count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += count;

/******************************************************************//**
Gather the aggregated stats for both flush list and LRU list flushing */
	enum buf_flush	flush_type,	/*!< in: type of flush */
	ulint		page_count)	/*!< in: number of pages flushed */
	buf_flush_buffered_writes();

	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (buf_debug_prints && page_count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	if (flush_type == BUF_FLUSH_LRU) {
		/* We keep track of all flushes happening as part of LRU
		flush. When estimating the desired rate at which flush_list
		should be flushed we factor in this value. */
		buf_lru_flush_page_count += page_count;

/******************************************************************//**
Start a buffer flush batch for LRU or flush list */
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_flush[flush_type] > 0
	    || buf_pool->init_flush[flush_type] == TRUE) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit(buf_pool);

	buf_pool->init_flush[flush_type] = TRUE;

	buf_pool_mutex_exit(buf_pool);

/******************************************************************//**
End a buffer flush batch for LRU or flush list */
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	buf_pool_mutex_enter(buf_pool);

	buf_pool->init_flush[flush_type] = FALSE;

	if (buf_pool->n_flush[flush_type] == 0) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);

	buf_pool_mutex_exit(buf_pool);

/******************************************************************//**
Waits until a flush batch of the given type ends */
buf_flush_wait_batch_end(
/*=====================*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	type)		/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);

	if (buf_pool == NULL) {
		for (i = 0; i < srv_buf_pool_instances; ++i) {
			buf_pool_t*	buf_pool;

			buf_pool = buf_pool_from_array(i);

			os_event_wait(buf_pool->no_flush[type]);

	os_event_wait(buf_pool->no_flush[type]);

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
NOTE: The calling thread may own latches to pages: to avoid deadlocks,
this function must be written so that it cannot end up waiting for these
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n)		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
		return(ULINT_UNDEFINED);

	page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);

	buf_flush_end(buf_pool, BUF_FLUSH_LRU);

	buf_flush_common(BUF_FLUSH_LRU, page_count);

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush list of
all buffer pool instances.
NOTE: The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in the case BUF_FLUSH_LIST all
					blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
	ulint		total_page_count = 0;
	ibool		skipped = FALSE;

	if (min_n != ULINT_MAX) {
		/* Ensure that flushing is spread evenly amongst the
		buffer pool instances. When min_n is ULINT_MAX
		we need to flush everything up to the lsn limit
		so no limit here. */
		min_n = (min_n + srv_buf_pool_instances - 1)
			/ srv_buf_pool_instances;

	/* Flush to lsn_limit in all buffer pool instances */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;
		ulint		page_count = 0;

		buf_pool = buf_pool_from_array(i);

		if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
			/* We have two choices here. If lsn_limit was
			specified then skipping an instance of buffer
			pool means we cannot guarantee that all pages
			up to lsn_limit have been flushed. We can
			return right now with failure or we can try
			to flush remaining buffer pools up to the
			lsn_limit. We attempt to flush other buffer
			pools based on the assumption that it will
			help in the retry which will follow the

		page_count = buf_flush_batch(
			buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);

		buf_flush_end(buf_pool, BUF_FLUSH_LIST);

		buf_flush_common(BUF_FLUSH_LIST, page_count);

		total_page_count += page_count;

	return(lsn_limit != IB_ULONGLONG_MAX && skipped
	       ? ULINT_UNDEFINED : total_page_count);

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
buf_flush_LRU_recommendation(
/*=========================*/
	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
	ulint		n_replaceable;

	buf_pool_mutex_enter(buf_pool);

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	bpage = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((bpage != NULL)
	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
		   + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
	       && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {

		mutex_t* block_mutex = buf_page_get_mutex(bpage);

		mutex_enter(block_mutex);

		if (buf_flush_ready_for_replace(bpage)) {

		mutex_exit(block_mutex);

		bpage = UT_LIST_GET_PREV(LRU, bpage);

	buf_pool_mutex_exit(buf_pool);

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {

	return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
	       + BUF_FLUSH_EXTRA_MARGIN(buf_pool)

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
buf_flush_free_margin(
/*==================*/
	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
	n_to_flush = buf_flush_LRU_recommendation(buf_pool);

	if (n_to_flush > 0) {

		n_flushed = buf_flush_LRU(buf_pool, n_to_flush);

		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */

			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);

/*********************************************************************//**
Flushes pages from the end of all the LRU lists. */
buf_flush_free_margins(void)
/*========================*/
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_free_margin(buf_pool);

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
The flush rate heuristic depends on (a) the rate of redo log generation and
(b) the rate at which LRU flush is happening. */
buf_flush_stat_update(void)
/*=======================*/
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;

	lsn = log_get_lsn();

	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		buf_flush_stat_cur.redo = lsn;

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		- buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;

/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
buf_flush_get_desired_flush_rate(void)
/*==================================*/
	ulint		lru_flush_avg;
	ib_uint64_t	lsn = log_get_lsn();
	ulint		log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of the log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get the total number of dirty pages. It is OK to access
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);

	/* An overflow can happen if we generate more than 2^32 bytes
	of redo in this interval i.e.: 4G of redo in 1 second. We can
	safely consider this as infinity because if we ever come close
	to 4G we'll start a synchronous flush of dirty pages. */
	/* redo_avg below is the average rate at which redo is generated in
	the past BUF_FLUSH_STAT_N_INTERVAL intervals + redo generated in the current
	redo_avg = (ulint) (buf_flush_stat_sum.redo
			    / BUF_FLUSH_STAT_N_INTERVAL
			    + (lsn - buf_flush_stat_cur.redo));

	/* An overflow can happen possibly if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very, very
	unlikely scenario. Even when this happens it means that our
	flush rate will be off the mark. It won't affect correctness
	of any subsystem. */
	/* lru_flush_avg below is the rate at which pages are flushed as
	part of LRU flush in the past BUF_FLUSH_STAT_N_INTERVAL + the
	number of pages flushed in the current interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
			/ BUF_FLUSH_STAT_N_INTERVAL
			+ (buf_lru_flush_page_count
			   - buf_flush_stat_cur.n_flushed);

	n_flush_req = (n_dirty * redo_avg) / log_capacity;
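
	/* In other words, n_flush_req scales the number of dirty pages
	by the fraction of the log capacity that the workload writes per
	second: for example, if there are 10000 dirty pages and redo_avg
	amounts to 1% of log_capacity per second, n_flush_req is about
	100 pages/s. Subtracting lru_flush_avg below then avoids
	double-counting pages that LRU flushes are already writing out. */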

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the

	rate = n_flush_req - lru_flush_avg;
	return(rate > 0 ? (ulint) rate : 0);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool)	/*!< in: Buffer pool instance */
	const ib_rbt_node_t*	rnode = NULL;

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
			 ut_ad(ut_list_node_313->in_flush_list));

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode i.e.: flush_rbt != NULL
	then each block in the flush_list must also be present
	in the flush_rbt. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		rnode = rbt_first(buf_pool->flush_rbt);

	while (bpage != NULL) {
		const ib_uint64_t om = bpage->oldest_modification;

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		ut_ad(bpage->in_flush_list);

		/* A page in flush_list can be in BUF_BLOCK_REMOVE_HASH
		state. This happens when a page is in the middle of
		being relocated. In that case the original descriptor
		can have this state and still be in the flush list
		waiting to acquire the flush_list_mutex to complete
		ut_a(buf_page_in_file(bpage)
		     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);

		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
			buf_page_t**	prpage;

			prpage = rbt_value(buf_page_t*, rnode);

			ut_a(*prpage == bpage);
			rnode = rbt_next(buf_pool->flush_rbt, rnode);

		bpage = UT_LIST_GET_NEXT(list, bpage);

		ut_a(!bpage || om >= bpage->oldest_modification);

	/* By this time we must have exhausted the traversal of
	flush_rbt (if active) as well. */
	ut_a(rnode == NULL);

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
	buf_pool_t*	buf_pool)	/*!< buffer pool instance */
	buf_flush_list_mutex_enter(buf_pool);

	ret = buf_flush_validate_low(buf_pool);

	buf_flush_list_mutex_exit(buf_pool);

#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */