/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "page0zip.h"
#include "fil0fil.h"
#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"
#include "srv0srv.h"

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/************************************************************************
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_page_t*     bpage)  /* in: block which is modified */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= bpage->oldest_modification));

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                mutex_enter(&buf_pool_zip_mutex);
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
                mutex_exit(&buf_pool_zip_mutex);
                UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
                /* fall through */
        case BUF_BLOCK_ZIP_DIRTY:
        case BUF_BLOCK_FILE_PAGE:
                ut_ad(bpage->in_LRU_list);
                ut_ad(bpage->in_page_hash);
                ut_ad(!bpage->in_zip_hash);
                ut_ad(!bpage->in_flush_list);
                ut_d(bpage->in_flush_list = TRUE);
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
                break;
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
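
/* NOTE: the flush list is kept ordered by oldest_modification, largest
first. The assertion at the start of buf_flush_insert_into_flush_list()
checks this invariant; it is what allows a flush list flush to scan from
the tail of the list and stop as soon as it meets a block whose
oldest_modification >= lsn_limit (see buf_flush_batch() below). */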

/************************************************************************
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_page_t*     bpage)  /* in: block which is modified */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(buf_pool_mutex_own());

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                mutex_enter(&buf_pool_zip_mutex);
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_DIRTY);
                mutex_exit(&buf_pool_zip_mutex);
                UT_LIST_REMOVE(list, buf_pool->zip_clean, bpage);
                /* fall through */
        case BUF_BLOCK_ZIP_DIRTY:
        case BUF_BLOCK_FILE_PAGE:
                ut_ad(bpage->in_LRU_list);
                ut_ad(bpage->in_page_hash);
                ut_ad(!bpage->in_zip_hash);
                ut_ad(!bpage->in_flush_list);
                ut_d(bpage->in_flush_list = TRUE);
                break;
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        }

        prev_b = NULL;
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (b && b->oldest_modification > bpage->oldest_modification) {
                ut_ad(b->in_flush_list);
                prev_b = b;
                b = UT_LIST_GET_NEXT(list, b);
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, bpage);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, bpage);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
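
/* NOTE: during crash recovery pages are modified in the order in which
redo log records are applied, which is not necessarily the order of
increasing oldest_modification lsn; the sorted insert above keeps the
flush list ordering invariant intact in that case. */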

/************************************************************************
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED allowed. */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
                                /* out: TRUE if can replace immediately */
        buf_page_t*     bpage)  /* in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                " InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));
        putc('\n', stderr);

        return(FALSE);
}

/************************************************************************
Returns TRUE if the block is modified and ready for flushing. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
                                /* out: TRUE if can flush immediately */
        buf_page_t*     bpage,  /* in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_a(buf_page_in_file(bpage));
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}

/************************************************************************
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /* in: pointer to the block in question */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);
        ut_d(bpage->in_flush_list = FALSE);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        bpage->oldest_modification = 0;

        ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list));
}

/************************************************************************
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /* in: pointer to the block in question */
{
        enum buf_flush  flush_type;

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}
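
/* NOTE: buf_pool->n_flush[] counts the pending page writes of each flush
type, and buf_pool->init_flush[] is TRUE while a batch of that type is
still being built up. The event buf_pool->no_flush[] is in the signaled
state only when both are clear; buf_flush_wait_batch_end() waits on it. */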

/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
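
/* NOTE on the doublewrite scheme implemented below: each page is first
written, with synchronous i/o, to a fixed area of the system tablespace
(the doublewrite buffer), and that area is flushed to disk before the page
is written to its real location in the data file. If the server crashes in
the middle of a page write, recovery finds an intact copy of the page in
the doublewrite buffer, so a half-written ("torn") page cannot survive. */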
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*   write_buf;
        ulint   len;
        ulint   len2;
        ulint   i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                os_aio_simulated_wake_handler_threads();

                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                " InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        " InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written += trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                " InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                " InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*)block->page.zip.data,
                               (void*)block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                " InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->page.buf_fix_count,
                                (ulong)buf_block_get_io_fix(block),
                                (ulong)buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */

        os_aio_simulated_wake_handler_threads();

        /* Wait that all async writes to tablespaces have been posted to
        the OS */

        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */

        fil_flush_file_spaces(FIL_TABLESPACE);

        /* We can now reuse the doublewrite memory buffer: */

        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /* in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /* in/out: page */
        void*           page_zip_,      /* in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /* in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_ull(page_zip->data
                                       + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                       newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
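
/* NOTE: after buf_flush_init_for_writing() an uncompressed page thus
carries the newest lsn at FIL_PAGE_LSN in the header, the new-formula
checksum at FIL_PAGE_SPACE_OR_CHKSUM, and in the 8-byte trailer the
old-formula checksum followed by the low 4 bytes of the lsn. The
doublewrite code above detects torn pages by comparing the two copies of
the low lsn bytes with memcmp(). */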

/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_page_t*     bpage)  /* in: buffer block to write */
{
        ulint   zip_size        = buf_page_get_zip_size(bpage);
        page_t* frame           = NULL;
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

        ut_ad(buf_page_in_file(bpage));

#ifdef UNIV_IBUF_COUNT_DEBUG
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
        ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block */
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
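        /* NOTE: the log_write_up_to() call above enforces write-ahead
        logging: all redo log up to the page's newest_modification lsn must
        be durably on disk before the page is allowed to overwrite its
        previous version in the data file. */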
        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                break;
        case BUF_BLOCK_ZIP_DIRTY:
                frame = bpage->zip.data;
                if (UNIV_LIKELY(srv_use_checksums)) {
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
                             == page_zip_calc_checksum(frame, zip_size));
                }
                mach_write_ull(frame + FIL_PAGE_LSN,
                               bpage->newest_modification);
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                break;
        case BUF_BLOCK_FILE_PAGE:
                frame = bpage->zip.data;
                if (!frame) {
                        frame = ((buf_block_t*) bpage)->frame;
                }

                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
                                           bpage->zip.data
                                           ? &bpage->zip : NULL,
                                           bpage->newest_modification);
                break;
        }

        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_page_get_space(bpage), zip_size,
                       buf_page_get_page_no(bpage), 0,
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
                       frame, bpage);
        } else {
                buf_flush_post_to_doublewrite_buf(bpage);
        }
}

/************************************************************************
Writes a page asynchronously from the buffer buf_pool to a file, if it can be
found in the buf_pool and it is in a flushable state. NOTE: in simulated aio
we must call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
static
ulint
buf_flush_try_page(
/*===============*/
                                        /* out: 1 if a page was
                                        flushed, 0 otherwise */
        ulint           space,          /* in: space id */
        ulint           offset,         /* in: page offset */
        enum buf_flush  flush_type)     /* in: BUF_FLUSH_LRU, BUF_FLUSH_LIST,
                                        or BUF_FLUSH_SINGLE_PAGE */
{
        buf_page_t*     bpage;
        mutex_t*        block_mutex;
        ibool           locked;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
              || flush_type == BUF_FLUSH_SINGLE_PAGE);

        buf_pool_mutex_enter();

        bpage = buf_page_hash_get(space, offset);

        if (!bpage) {
                buf_pool_mutex_exit();
                return(0);
        }

        ut_a(buf_page_in_file(bpage));
        block_mutex = buf_page_get_mutex(bpage);

        mutex_enter(block_mutex);

        if (!buf_flush_ready_for_flush(bpage, flush_type)) {
                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                return(0);
        }

        switch (flush_type) {
        case BUF_FLUSH_LIST:
                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                locked = bpage->buf_fix_count == 0;
                if (locked
                    && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                if (!locked) {
                        buf_flush_buffered_writes();

                        if (buf_page_get_state(bpage)
                            == BUF_BLOCK_FILE_PAGE) {
                                rw_lock_s_lock_gen(&((buf_block_t*) bpage)
                                                   ->lock, BUF_IO_WRITE);
                        }
                }

                break;

        case BUF_FLUSH_LRU:
                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because in the if-condition above we require
                the page not to be bufferfixed (in function
                ..._ready_for_flush). */

                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                break;

        case BUF_FLUSH_SINGLE_PAGE:
                buf_page_set_io_fix(bpage, BUF_IO_WRITE);

                buf_page_set_flush_type(bpage, flush_type);

                if (buf_pool->n_flush[flush_type] == 0) {

                        os_event_reset(buf_pool->no_flush[flush_type]);
                }

                buf_pool->n_flush[flush_type]++;

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }
                break;

        default:
                ut_error;
        }

#ifdef UNIV_DEBUG
        if (buf_debug_prints) {
                fprintf(stderr,
                        "Flushing %u space %u page %u\n",
                        flush_type, bpage->space, bpage->offset);
        }
#endif /* UNIV_DEBUG */
        buf_flush_write_block_low(bpage);

        return(1);
}

/***************************************************************
Flushes to disk all flushable pages within the flush area. */
static
ulint
buf_flush_try_neighbors(
/*====================*/
                                        /* out: number of pages flushed */
        ulint           space,          /* in: space id */
        ulint           offset,         /* in: page offset */
        enum buf_flush  flush_type)     /* in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST */
{
        buf_page_t*     bpage;
        ulint           low, high;
        ulint           count           = 0;
        ulint           i;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
                /* If there is little space, it is better not to flush any
                block except from the end of the LRU list */

                low = offset;
                high = offset + 1;
        } else {
                /* When flushed, dirty blocks are searched in neighborhoods of
                this size, and flushed along with the original page. */

                ulint   buf_flush_area  = ut_min(BUF_READ_AHEAD_AREA,
                                                 buf_pool->curr_size / 16);

                low = (offset / buf_flush_area) * buf_flush_area;
                high = (offset / buf_flush_area + 1) * buf_flush_area;
        }
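
        /* For example (illustrative numbers): with buf_flush_area = 64 and
        offset = 100, low = 64 and high = 128, i.e. the neighborhood is the
        buf_flush_area-aligned block of pages that contains the requested
        page. */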

        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

        if (high > fil_space_get_size(space)) {
                high = fil_space_get_size(space);
        }

        buf_pool_mutex_enter();

        for (i = low; i < high; i++) {

                bpage = buf_page_hash_get(space, i);
                ut_a(!bpage || buf_page_in_file(bpage));

                if (!bpage) {

                        continue;

                } else if (flush_type == BUF_FLUSH_LRU && i != offset
                           && !buf_page_is_old(bpage)) {

                        /* We avoid flushing 'non-old' blocks in an LRU flush,
                        because the flushed blocks are soon freed */

                        continue;
                } else {

                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)
                            && (i == offset || !bpage->buf_fix_count)) {
                                /* We only try to flush those
                                neighbors != offset where the buf fix count is
                                zero, as we then know that we probably can
                                latch the page without a semaphore wait.
                                Semaphore waits are expensive because we must
                                flush the doublewrite buffer before we start
                                waiting. */

                                buf_pool_mutex_exit();

                                mutex_exit(block_mutex);

                                /* Note: as we release the buf_pool mutex
                                above, in buf_flush_try_page we cannot be sure
                                the page is still in a flushable state:
                                therefore we check it again inside that
                                function. */

                                count += buf_flush_try_page(space, i,
                                                            flush_type);

                                buf_pool_mutex_enter();
                        } else {
                                mutex_exit(block_mutex);
                        }
                }
        }

        buf_pool_mutex_exit();

        return(count);
}

/***********************************************************************
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages! */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
                                        /* out: number of blocks for which the
                                        write request was queued;
                                        ULINT_UNDEFINED if there was a flush
                                        of the same type already running */
        enum buf_flush  flush_type,     /* in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
                                        then the caller must not own any
                                        latches on pages */
        ulint           min_n,          /* in: wished minimum number of blocks
                                        flushed (it is not guaranteed that the
                                        actual number is that big, though) */
        ib_uint64_t     lsn_limit)      /* in: in the case BUF_FLUSH_LIST all
                                        blocks whose oldest_modification is
                                        smaller than this should be flushed
                                        (if their number does not exceed
                                        min_n), otherwise ignored */
{
        buf_page_t*     bpage;
        ulint           page_count      = 0;
        ulint           old_page_count;
        ulint           space;
        ulint           offset;

        ut_ad((flush_type == BUF_FLUSH_LRU)
              || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
        ut_ad((flush_type != BUF_FLUSH_LIST)
              || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
        buf_pool_mutex_enter();

        if ((buf_pool->n_flush[flush_type] > 0)
            || (buf_pool->init_flush[flush_type] == TRUE)) {

                /* There is already a flush batch of the same type running */

                buf_pool_mutex_exit();

                return(ULINT_UNDEFINED);
        }

        buf_pool->init_flush[flush_type] = TRUE;

        for (;;) {
flush_next:
                /* If we have flushed enough, leave the loop */
                if (page_count >= min_n) {

                        break;
                }

                /* Start from the end of the list looking for a suitable
                block to be flushed. */

                if (flush_type == BUF_FLUSH_LRU) {
                        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
                } else {
                        ut_ad(flush_type == BUF_FLUSH_LIST);

                        bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
                        if (!bpage
                            || bpage->oldest_modification >= lsn_limit) {
                                /* We have flushed enough */

                                break;
                        }
                        ut_ad(bpage->in_flush_list);
                }

                /* Note that after finding a single flushable page, we try to
                flush also all its neighbors, and after that start from the
                END of the LRU list or flush list again: the list may change
                during the flushing and we cannot safely preserve within this
                function a pointer to a block in the list! */

                do {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        ut_a(buf_page_in_file(bpage));

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)) {

                                space = buf_page_get_space(bpage);
                                offset = buf_page_get_page_no(bpage);

                                buf_pool_mutex_exit();
                                mutex_exit(block_mutex);

                                old_page_count = page_count;

                                /* Try to flush also all the neighbors */
                                page_count += buf_flush_try_neighbors(
                                        space, offset, flush_type);
                                /* fprintf(stderr,
                                "Flush type %lu, page no %lu, neighb %lu\n",
                                flush_type, offset,
                                page_count - old_page_count); */

                                buf_pool_mutex_enter();
                                goto flush_next;

                        } else if (flush_type == BUF_FLUSH_LRU) {

                                mutex_exit(block_mutex);

                                bpage = UT_LIST_GET_PREV(LRU, bpage);
                        } else {
                                ut_ad(flush_type == BUF_FLUSH_LIST);

                                mutex_exit(block_mutex);

                                bpage = UT_LIST_GET_PREV(list, bpage);
                                ut_ad(!bpage || bpage->in_flush_list);
                        }
                } while (bpage != NULL);

                /* If we could not find anything to flush, leave the loop */

                break;
        }

        buf_pool->init_flush[flush_type] = FALSE;

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }

        buf_pool_mutex_exit();

        buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
        if (buf_debug_prints && page_count > 0) {
                ut_a(flush_type == BUF_FLUSH_LRU
                     || flush_type == BUF_FLUSH_LIST);
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
                        ? "Flushed %lu pages in LRU flush\n"
                        : "Flushed %lu pages in flush list flush\n",
                        (ulong) page_count);
        }
#endif /* UNIV_DEBUG */

        srv_buf_pool_flushed += page_count;

        return(page_count);
}

/**********************************************************************
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
        enum buf_flush  type)   /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

        os_event_wait(buf_pool->no_flush[type]);
}

/**********************************************************************
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list. */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
                        /* out: number of blocks which should be flushed
                        from the end of the LRU list */
{
        buf_page_t*     bpage;
        ulint           n_replaceable;
        ulint           distance        = 0;

        buf_pool_mutex_enter();

        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

        bpage = UT_LIST_GET_LAST(buf_pool->LRU);

        while ((bpage != NULL)
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
                   + BUF_FLUSH_EXTRA_MARGIN)
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

                mutex_t* block_mutex = buf_page_get_mutex(bpage);

                mutex_enter(block_mutex);

                if (buf_flush_ready_for_replace(bpage)) {
                        n_replaceable++;
                }

                mutex_exit(block_mutex);

                distance++;

                bpage = UT_LIST_GET_PREV(LRU, bpage);
        }

        buf_pool_mutex_exit();

        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

                return(0);
        }

        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
               - n_replaceable);
}
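
/* NOTE: the recommendation above is exactly the number of page flushes
needed to lift the count of replaceable pages back up to
BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN; if at least
BUF_FLUSH_FREE_BLOCK_MARGIN replaceable pages are already available in the
free list and at the LRU tail, nothing needs to be flushed. */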

/*************************************************************************
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
        ulint   n_to_flush;
        ulint   n_flushed;

        n_to_flush = buf_flush_LRU_recommendation();

        if (n_to_flush > 0) {
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
                if (n_flushed == ULINT_UNDEFINED) {
                        /* There was an LRU type flush batch already running;
                        let us wait for it to end */

                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
                }
        }
}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/**********************************************************************
Validates the flush list. */
static
ibool
buf_flush_validate_low(void)
/*========================*/
                /* out: TRUE if ok */
{
        buf_page_t*     bpage;

        UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list);

        bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (bpage != NULL) {
                const ib_uint64_t om = bpage->oldest_modification;
                ut_ad(bpage->in_flush_list);
                ut_a(buf_page_in_file(bpage));
                ut_a(om > 0);

                bpage = UT_LIST_GET_NEXT(list, bpage);

                ut_a(!bpage || om >= bpage->oldest_modification);
        }

        return(TRUE);
}

/**********************************************************************
Validates the flush list. */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
                /* out: TRUE if ok */
{
        ibool   ret;

        buf_pool_mutex_enter();

        ret = buf_flush_validate_low();

        buf_pool_mutex_exit();

        return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */