~drizzle-trunk/drizzle/development

Viewing changes to plugin/innobase/buf/buf0flu.c

  • Committer: Brian Aker
  • Date: 2009-06-08 02:42:24 UTC
  • mfrom: (1054.1.6 merge)
  • Revision ID: brian@gaz-20090608024224-zlff1bpq62r8m5gy
Removal of LOCK TABLES.

/*****************************************************************************

Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
St, Fifth Floor, Boston, MA 02110-1301 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex.  Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint            buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;

/* @} */
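/* The ring of BUF_FLUSH_STAT_N_INTERVAL one-second samples above is what
turns raw counters into a flush rate.  A minimal sketch of the averaging
step is shown below; it assumes buf_flush_stat_t carries an "n_flushed"
member (field name used here for illustration, see buf0flu.h for the real
struct), and it is not part of the original file. */
#if 0
static ulint
buf_flush_stat_avg_n_flushed_sketch(void)
/* Average pages flushed per second over the sampled history. */
{
        ulint   i;
        ulint   sum = 0;

        for (i = 0; i < BUF_FLUSH_STAT_N_INTERVAL; i++) {
                sum += buf_flush_stat_arr[i].n_flushed;
        }

        /* Each slot covers one second, so the sum over the ring divided
        by the number of slots is the average flush rate. */
        return(sum / BUF_FLUSH_STAT_N_INTERVAL);
}
#endif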

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
        buf_pool_t*     buf_pool);      /*!< in: Buffer pool instance */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/******************************************************************//**
Insert a block in the flush_rbt and returns a pointer to its
predecessor or NULL if no predecessor. The ordering is maintained
on the basis of the <oldest_modification, space, offset> key.
@return pointer to the predecessor or NULL if no predecessor. */
static
buf_page_t*
buf_flush_insert_in_flush_rbt(
/*==========================*/
        buf_page_t*     bpage)  /*!< in: bpage to be inserted. */
{
        const ib_rbt_node_t*    c_node;
        const ib_rbt_node_t*    p_node;
        buf_page_t*             prev = NULL;
        buf_pool_t*             buf_pool = buf_pool_from_bpage(bpage);

        ut_ad(buf_flush_list_mutex_own(buf_pool));

        /* Insert this buffer into the rbt. */
        c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
        ut_a(c_node != NULL);

        /* Get the predecessor. */
        p_node = rbt_prev(buf_pool->flush_rbt, c_node);

        if (p_node != NULL) {
                buf_page_t**    value;
                value = rbt_value(buf_page_t*, p_node);
                prev = *value;
                ut_a(prev != NULL);
        }

        return(prev);
}

/*********************************************************//**
Delete a bpage from the flush_rbt. */
static
void
buf_flush_delete_from_flush_rbt(
/*============================*/
        buf_page_t*     bpage)  /*!< in: bpage to be removed. */
{
#ifdef UNIV_DEBUG
        ibool           ret = FALSE;
#endif /* UNIV_DEBUG */
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);

        ut_ad(buf_flush_list_mutex_own(buf_pool));

#ifdef UNIV_DEBUG
        ret =
#endif /* UNIV_DEBUG */
        rbt_delete(buf_pool->flush_rbt, &bpage);
        ut_ad(ret);
}

/*****************************************************************//**
Compare two modified blocks in the buffer pool. The key for comparison
is:
key = <oldest_modification, space, offset>
This comparison is used to maintain ordering of blocks in the
buf_pool->flush_rbt.
Note that for the purpose of flush_rbt, we only need to order blocks
on the oldest_modification. The other two fields are used to uniquely
identify the blocks.
@return  < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
static
int
buf_flush_block_cmp(
/*================*/
        const void*     p1,             /*!< in: block1 */
        const void*     p2)             /*!< in: block2 */
{
        int                     ret;
        const buf_page_t*       b1 = *(const buf_page_t**) p1;
        const buf_page_t*       b2 = *(const buf_page_t**) p2;
#ifdef UNIV_DEBUG
        buf_pool_t*             buf_pool = buf_pool_from_bpage(b1);
#endif /* UNIV_DEBUG */

        ut_ad(b1 != NULL);
        ut_ad(b2 != NULL);

        ut_ad(buf_flush_list_mutex_own(buf_pool));

        ut_ad(b1->in_flush_list);
        ut_ad(b2->in_flush_list);

        if (b2->oldest_modification > b1->oldest_modification) {
                return(1);
        } else if (b2->oldest_modification < b1->oldest_modification) {
                return(-1);
        }

        /* If oldest_modification is same then decide on the space. */
        ret = (int)(b2->space - b1->space);

        /* Or else decide ordering on the offset field. */
        return(ret ? ret : (int)(b2->offset - b1->offset));
}
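/* The comparator above defines a total order: oldest_modification first,
then space id, then page offset, so rbt_insert() can keep pages with equal
LSNs apart.  A tiny illustration of that fall-through follows; it is not
part of the original file, and the comparator's debug assertions (flush
list mutex, in_flush_list) are ignored since the block is never compiled. */
#if 0
static void
buf_flush_block_cmp_example(void)
{
        buf_page_t      a;
        buf_page_t      b;
        buf_page_t*     ap = &a;
        buf_page_t*     bp = &b;

        memset(&a, 0, sizeof(a));
        memset(&b, 0, sizeof(b));

        /* Same oldest_modification, different space ids. */
        a.oldest_modification = 100;
        b.oldest_modification = 100;
        a.space = 5;
        b.space = 7;

        /* Equal LSNs: ordering falls through to the space id, so the
        result is positive, i.e. b2 sorts after b1 in the flush_rbt. */
        ut_a(buf_flush_block_cmp(&ap, &bp) > 0);
}
#endif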

/********************************************************************//**
Initialize the red-black tree to speed up insertions into the flush_list
during recovery process. Should be called at the start of recovery
process before any page has been read/written. */
UNIV_INTERN
void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
        ulint   i;

        for (i = 0; i < srv_buf_pool_instances; i++) {
                buf_pool_t*     buf_pool;

                buf_pool = buf_pool_from_array(i);

                buf_flush_list_mutex_enter(buf_pool);

                /* Create red black tree for speedy insertions in flush list. */
                buf_pool->flush_rbt = rbt_create(
                        sizeof(buf_page_t*), buf_flush_block_cmp);

                buf_flush_list_mutex_exit(buf_pool);
        }
}

/********************************************************************//**
Frees up the red-black tree. */
UNIV_INTERN
void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
        ulint   i;

        for (i = 0; i < srv_buf_pool_instances; i++) {
                buf_pool_t*     buf_pool;

                buf_pool = buf_pool_from_array(i);

                buf_flush_list_mutex_enter(buf_pool);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
                ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

                rbt_free(buf_pool->flush_rbt);
                buf_pool->flush_rbt = NULL;

                buf_flush_list_mutex_exit(buf_pool);
        }
}

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_pool_t*     buf_pool,       /*!< buffer pool instance */
        buf_block_t*    block,          /*!< in/out: block which is modified */
        ib_uint64_t     lsn)            /*!< in: oldest modification */
{
        ut_ad(!buf_pool_mutex_own(buf_pool));
        ut_ad(log_flush_order_mutex_own());
        ut_ad(mutex_own(&block->mutex));

        buf_flush_list_mutex_enter(buf_pool);

        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= lsn));

        /* If we are in the recovery then we need to update the flush
        red-black tree as well. */
        if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
                buf_flush_list_mutex_exit(buf_pool);
                buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
                return;
        }

        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
        ut_ad(!block->page.in_flush_list);

        ut_d(block->page.in_flush_list = TRUE);
        block->page.oldest_modification = lsn;
        UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

#ifdef UNIV_DEBUG_VALGRIND
        {
                ulint   zip_size = buf_block_get_zip_size(block);

                if (UNIV_UNLIKELY(zip_size)) {
                        UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
                } else {
                        UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
                }
        }
#endif /* UNIV_DEBUG_VALGRIND */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

        buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_pool_t*     buf_pool,       /*!< in: buffer pool instance */
        buf_block_t*    block,          /*!< in/out: block which is modified */
        ib_uint64_t     lsn)            /*!< in: oldest modification */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(!buf_pool_mutex_own(buf_pool));
        ut_ad(log_flush_order_mutex_own());
        ut_ad(mutex_own(&block->mutex));
        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

        buf_flush_list_mutex_enter(buf_pool);

        /* The field in_LRU_list is protected by buf_pool_mutex, which
        we are not holding.  However, while a block is in the flush
        list, it is dirty and cannot be discarded, not from the
        page_hash or from the LRU list.  At most, the uncompressed
        page frame of a compressed block may be discarded or created
        (copying the block->page to or from a buf_page_t that is
        dynamically allocated from buf_buddy_alloc()).  Because those
        transitions hold block->mutex and the flush list mutex (via
        buf_flush_relocate_on_flush_list()), there is no possibility
        of a race condition in the assertions below. */
        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        /* buf_buddy_block_register() will take a block in the
        BUF_BLOCK_MEMORY state, not a file page. */
        ut_ad(!block->page.in_zip_hash);

        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);
        block->page.oldest_modification = lsn;

#ifdef UNIV_DEBUG_VALGRIND
        {
                ulint   zip_size = buf_block_get_zip_size(block);

                if (UNIV_UNLIKELY(zip_size)) {
                        UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
                } else {
                        UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
                }
        }
#endif /* UNIV_DEBUG_VALGRIND */

        prev_b = NULL;

        /* For the most part when this function is called the flush_rbt
        should not be NULL. In a very rare boundary case it is possible
        that the flush_rbt has already been freed by the recovery thread
        before the last page was hooked up in the flush_list by the
        io-handler thread. In that case we'll just do a simple
        linear search in the else block. */
        if (buf_pool->flush_rbt) {

                prev_b = buf_flush_insert_in_flush_rbt(&block->page);

        } else {

                b = UT_LIST_GET_FIRST(buf_pool->flush_list);

                while (b && b->oldest_modification
                       > block->page.oldest_modification) {
                        ut_ad(b->in_flush_list);
                        prev_b = b;
                        b = UT_LIST_GET_NEXT(list, b);
                }
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, &block->page);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

        buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED allowed.
@return TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
        buf_page_t*     bpage)  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
#ifdef UNIV_DEBUG
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);
        ut_ad(buf_pool_mutex_own(buf_pool));
#endif
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                "  InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));
        putc('\n', stderr);

        return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
        buf_page_t*     bpage,  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
#ifdef UNIV_DEBUG
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);
        ut_ad(buf_pool_mutex_own(buf_pool));
#endif
        ut_a(buf_page_in_file(bpage));
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}
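/* The two predicates above differ only in the buffer-fix requirement: an
LRU flush additionally demands buf_fix_count == 0 so the s-latch can be
taken without waiting.  A condensed restatement of the flush rule follows;
it is illustrative only and mirrors the logic above rather than adding to
it. */
#if 0
static ibool
buf_flush_ready_for_flush_condensed(
        buf_page_t*     bpage,          /* in: page under the block mutex */
        enum buf_flush  flush_type)     /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        return(bpage->oldest_modification != 0
               && buf_page_get_io_fix(bpage) == BUF_IO_NONE
               && (flush_type != BUF_FLUSH_LRU
                   || bpage->buf_fix_count == 0));
}
#endif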

/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);

        ut_ad(buf_pool_mutex_own(buf_pool));
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);

        buf_flush_list_mutex_enter(buf_pool);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* Clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        /* If the flush_rbt is active then delete from there as well. */
        if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
                buf_flush_delete_from_flush_rbt(bpage);
        }

        /* Must be done after we have removed it from the flush_rbt
        because we assert on in_flush_list in comparison function. */
        ut_d(bpage->in_flush_list = FALSE);

        bpage->oldest_modification = 0;

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

        buf_flush_list_mutex_exit(buf_pool);
}

/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage have already been
copied to dpage.
IMPORTANT: When this function is called bpage and dpage are not
exact copies of each other. For example, they both will have different
::state. Also the ::list pointers in dpage may be stale. We need to
use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to the dpage and the flush list manipulation
below. */
UNIV_INTERN
void
buf_flush_relocate_on_flush_list(
/*=============================*/
        buf_page_t*     bpage,  /*!< in/out: control block being moved */
        buf_page_t*     dpage)  /*!< in/out: destination block */
{
        buf_page_t*     prev;
        buf_page_t*     prev_b = NULL;
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);

        ut_ad(buf_pool_mutex_own(buf_pool));
        /* Must reside in the same buffer pool. */
        ut_ad(buf_pool == buf_pool_from_bpage(dpage));

        ut_ad(mutex_own(buf_page_get_mutex(bpage)));

        buf_flush_list_mutex_enter(buf_pool);

        /* FIXME: At this point we have both buf_pool and flush_list
        mutexes. Theoretically removal of a block from flush list is
        only covered by flush_list mutex but currently we do
        have buf_pool mutex in buf_flush_remove() therefore this block
        is guaranteed to be in the flush list. We need to check if
        this will work without the assumption of block removing code
        having the buf_pool mutex. */
        ut_ad(bpage->in_flush_list);
        ut_ad(dpage->in_flush_list);

        /* If recovery is active we must swap the control blocks in
        the flush_rbt as well. */
        if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
                buf_flush_delete_from_flush_rbt(bpage);
                prev_b = buf_flush_insert_in_flush_rbt(dpage);
        }

        /* Must be done after we have removed it from the flush_rbt
        because we assert on in_flush_list in comparison function. */
        ut_d(bpage->in_flush_list = FALSE);

        prev = UT_LIST_GET_PREV(list, bpage);
        UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);

        if (prev) {
                ut_ad(prev->in_flush_list);
                UT_LIST_INSERT_AFTER(
                        list,
                        buf_pool->flush_list,
                        prev, dpage);
        } else {
                UT_LIST_ADD_FIRST(
                        list,
                        buf_pool->flush_list,
                        dpage);
        }

        /* Just an extra check. Previous in flush_list
        should be the same control block as in flush_rbt. */
        ut_a(!buf_pool->flush_rbt || prev_b == prev);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low(buf_pool));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

        buf_flush_list_mutex_exit(buf_pool);
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        enum buf_flush  flush_type;
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if (buf_pool->n_flush[flush_type] == 0
            && buf_pool->init_flush[flush_type] == FALSE) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}
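/* os_event_set() above is the wake-up half of the batch protocol: the
thread that started a flush batch blocks on the same per-type event until
the in-flight count reaches zero.  A minimal sketch of such a waiter is
shown below; it is illustrative only (the real counterpart in buf0flu.c is
buf_flush_wait_batch_end()). */
#if 0
static void
buf_flush_wait_batch_end_sketch(
        buf_pool_t*     buf_pool,       /* in: buffer pool instance */
        enum buf_flush  type)           /* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);

        /* The event stays reset while n_flush[type] > 0 or while a batch
        is being initialized, so this returns only after the batch ends. */
        os_event_wait(buf_pool->no_flush[type]);
}
#endif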

/********************************************************************//**
Flush a batch of writes to the datafiles that have already been
written by the OS. */
static
void
buf_flush_sync_datafiles(void)
/*==========================*/
{
        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */
        os_aio_simulated_wake_handler_threads();

        /* Wait that all async writes to tablespaces have been posted to
        the OS */
        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */
        fil_flush_file_spaces(FIL_TABLESPACE);

        return;
}

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                /* Sync the writes to the disk. */
                buf_flush_sync_datafiles();
                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written+= trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*)block->page.zip.data,
                               (void*)block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->page.buf_fix_count,
                                (ulong)buf_block_get_io_fix(block),
                                (ulong)buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Sync the writes to the disk. */
        buf_flush_sync_datafiles();

        /* We can now reuse the doublewrite memory buffer: */
        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}

/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
                UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
                                   UNIV_PAGE_SIZE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
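/* Taken together, the two functions above implement the doublewrite
protocol: a copy of each page is staged in the in-memory write_buf, the
whole batch is written and fsync'ed to the doublewrite area in the system
tablespace, and only then are the pages written to their real locations
and the datafiles synced.  A compressed outline of that ordering follows;
it is illustrative only, the real control flow is in
buf_flush_buffered_writes() above. */
#if 0
static void
buf_flush_doublewrite_outline(
        buf_page_t*     bpage)  /* in: io-fixed dirty page */
{
        /* Stage the page copy; flushes automatically when the buffer
        fills up. */
        buf_flush_post_to_doublewrite_buf(bpage);

        /* 1) write + fsync the doublewrite area, 2) write the pages to
        their home locations, 3) fsync the datafiles. */
        buf_flush_buffered_writes();
}
#endif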
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /*!< in/out: page */
        void*           page_zip_,      /*!< in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /*!< in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_to_8(page_zip->data
                                        + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_print_timestamp(stderr);
                fputs("  InnoDB: ERROR: The compressed page to be written"
                      " seems corrupt:", stderr);
                ut_print_buf(stderr, page, zip_size);
                fputs("\nInnoDB: Possibly older version of the page:", stderr);
                ut_print_buf(stderr, page_zip->data, zip_size);
                putc('\n', stderr);
                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
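/* After buf_flush_init_for_writing() the page carries the full LSN in its
header, the low half of the same LSN in the trailer, and the two checksum
formulas.  The consistency check performed elsewhere in this file when the
page is posted to the doublewrite buffer is equivalent to the sketch
below (illustrative only, not part of the original file): */
#if 0
static ibool
buf_flush_page_lsn_fields_match(
        const byte*     page)   /* in: page prepared for writing */
{
        /* The low 4 bytes of the header LSN must equal the copy kept in
        the page trailer. */
        return(!memcmp(page + FIL_PAGE_LSN + 4,
                       page + UNIV_PAGE_SIZE
                       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4,
                       4));
}
#endif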
1058
 
 
1059
 
#ifndef UNIV_HOTBACKUP
1060
 
/********************************************************************//**
1061
 
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1062
 
also when the doublewrite buffer is used, we must call
1063
 
buf_flush_buffered_writes after we have posted a batch of writes! */
1064
 
static
1065
 
void
1066
 
buf_flush_write_block_low(
1067
 
/*======================*/
1068
 
        buf_page_t*     bpage)  /*!< in: buffer block to write */
1069
 
{
1070
 
        ulint   zip_size        = buf_page_get_zip_size(bpage);
1071
 
        page_t* frame           = NULL;
1072
 
 
1073
 
#ifdef UNIV_DEBUG
1074
 
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);
1075
 
        ut_ad(!buf_pool_mutex_own(buf_pool));
1076
 
#endif
1077
 
 
1078
 
#ifdef UNIV_LOG_DEBUG
1079
 
        static ibool univ_log_debug_warned;
1080
 
#endif /* UNIV_LOG_DEBUG */
1081
 
 
1082
 
        ut_ad(buf_page_in_file(bpage));
1083
 
 
1084
 
        /* We are not holding buf_pool_mutex or block_mutex here.
1085
 
        Nevertheless, it is safe to access bpage, because it is
1086
 
        io_fixed and oldest_modification != 0.  Thus, it cannot be
1087
 
        relocated in the buffer pool or removed from flush_list or
1088
 
        LRU_list. */
1089
 
        ut_ad(!buf_pool_mutex_own(buf_pool));
1090
 
        ut_ad(!buf_flush_list_mutex_own(buf_pool));
1091
 
        ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
1092
 
        ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1093
 
        ut_ad(bpage->oldest_modification != 0);
1094
 
 
1095
 
#ifdef UNIV_IBUF_COUNT_DEBUG
1096
 
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
1097
 
#endif
1098
 
        ut_ad(bpage->newest_modification != 0);
1099
 
 
1100
 
#ifdef UNIV_LOG_DEBUG
1101
 
        if (!univ_log_debug_warned) {
1102
 
                univ_log_debug_warned = TRUE;
1103
 
                fputs("Warning: cannot force log to disk if"
1104
 
                      " UNIV_LOG_DEBUG is defined!\n"
1105
 
                      "Crash recovery will not work!\n",
1106
 
                      stderr);
1107
 
        }
1108
 
#else
1109
 
        /* Force the log to the disk before writing the modified block */
1110
 
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
1111
 
#endif
1112
 
        switch (buf_page_get_state(bpage)) {
1113
 
        case BUF_BLOCK_ZIP_FREE:
1114
 
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1115
 
        case BUF_BLOCK_NOT_USED:
1116
 
        case BUF_BLOCK_READY_FOR_USE:
1117
 
        case BUF_BLOCK_MEMORY:
1118
 
        case BUF_BLOCK_REMOVE_HASH:
1119
 
                ut_error;
1120
 
                break;
1121
 
        case BUF_BLOCK_ZIP_DIRTY:
1122
 
                frame = bpage->zip.data;
1123
 
                if (UNIV_LIKELY(srv_use_checksums)) {
1124
 
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
1125
 
                             == page_zip_calc_checksum(frame, zip_size));
1126
 
                }
1127
 
                mach_write_to_8(frame + FIL_PAGE_LSN,
1128
 
                                bpage->newest_modification);
1129
 
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
1130
 
                break;
1131
 
        case BUF_BLOCK_FILE_PAGE:
1132
 
                frame = bpage->zip.data;
1133
 
                if (!frame) {
1134
 
                        frame = ((buf_block_t*) bpage)->frame;
1135
 
                }
1136
 
 
1137
 
                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
1138
 
                                           bpage->zip.data
1139
 
                                           ? &bpage->zip : NULL,
1140
 
                                           bpage->newest_modification);
1141
 
                break;
1142
 
        }
1143
 
 
1144
 
        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
1145
 
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
1146
 
                       FALSE, buf_page_get_space(bpage), zip_size,
1147
 
                       buf_page_get_page_no(bpage), 0,
1148
 
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
1149
 
                       frame, bpage);
1150
 
        } else {
1151
 
                buf_flush_post_to_doublewrite_buf(bpage);
1152
 
        }
1153
 
}
1154
 
 
1155
 
/********************************************************************//**
1156
 
Writes a flushable page asynchronously from the buffer pool to a file.
1157
 
NOTE: in simulated aio we must call
1158
 
os_aio_simulated_wake_handler_threads after we have posted a batch of
1159
 
writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1160
 
held upon entering this function, and they will be released by this
1161
 
function. */
1162
 
static
1163
 
void
1164
 
buf_flush_page(
1165
 
/*===========*/
1166
 
        buf_pool_t*     buf_pool,       /*!< in: buffer pool instance */
1167
 
        buf_page_t*     bpage,          /*!< in: buffer control block */
1168
 
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU
1169
 
                                        or BUF_FLUSH_LIST */
1170
 
{
1171
 
        mutex_t*        block_mutex;
1172
 
        ibool           is_uncompressed;
1173
 
 
1174
 
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1175
 
        ut_ad(buf_pool_mutex_own(buf_pool));
1176
 
        ut_ad(buf_page_in_file(bpage));
1177
 
 
1178
 
        block_mutex = buf_page_get_mutex(bpage);
1179
 
        ut_ad(mutex_own(block_mutex));
1180
 
 
1181
 
        ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1182
 
 
1183
 
        buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1184
 
 
1185
 
        buf_page_set_flush_type(bpage, flush_type);
1186
 
 
1187
 
        if (buf_pool->n_flush[flush_type] == 0) {
1188
 
 
1189
 
                os_event_reset(buf_pool->no_flush[flush_type]);
1190
 
        }
1191
 
 
1192
 
        buf_pool->n_flush[flush_type]++;
1193
 
 
1194
 
        is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1195
 
        ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1196
 
 
1197
 
        switch (flush_type) {
1198
 
                ibool   is_s_latched;
1199
 
        case BUF_FLUSH_LIST:
1200
 
                /* If the simulated aio thread is not running, we must
1201
 
                not wait for any latch, as we may end up in a deadlock:
1202
 
                if buf_fix_count == 0, then we know we need not wait */
1203
 
 
1204
 
                is_s_latched = (bpage->buf_fix_count == 0);
1205
 
                if (is_s_latched && is_uncompressed) {
1206
 
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
1207
 
                                           BUF_IO_WRITE);
1208
 
                }
1209
 
 
1210
 
                mutex_exit(block_mutex);
1211
 
                buf_pool_mutex_exit(buf_pool);
1212
 
 
1213
 
                /* Even though bpage is not protected by any mutex at
1214
 
		this point, it is safe to access bpage, because it is
		io_fixed and oldest_modification != 0.  Thus, it
		cannot be relocated in the buffer pool or removed from
		flush_list or LRU_list. */

		if (!is_s_latched) {
			buf_flush_buffered_writes();

			if (is_uncompressed) {
				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
						   ->lock, BUF_IO_WRITE);
			}
		}

		break;

	case BUF_FLUSH_LRU:
		/* VERY IMPORTANT:
		Because any thread may call the LRU flush, even when owning
		locks on pages, to avoid deadlocks, we must make sure that the
		s-lock is acquired on the page without waiting: this is
		accomplished because buf_flush_ready_for_flush() must hold,
		and that requires the page not to be buffer-fixed. */

		if (is_uncompressed) {
			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
					   BUF_IO_WRITE);
		}

		/* Note that the s-latch is acquired before releasing the
		buf_pool mutex: this ensures that the latch is acquired
		immediately. */

		mutex_exit(block_mutex);
		buf_pool_mutex_exit(buf_pool);
		break;

	default:
		ut_error;
	}

	/* Even though bpage is not protected by any mutex at this
	point, it is safe to access bpage, because it is io_fixed and
	oldest_modification != 0.  Thus, it cannot be relocated in the
	buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Flushing %u space %u page %u\n",
			flush_type, bpage->space, bpage->offset);
	}
#endif /* UNIV_DEBUG */
	buf_flush_write_block_low(bpage);
}

/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return number of pages flushed */
static
ulint
buf_flush_try_neighbors(
/*====================*/
	ulint		space,		/*!< in: space id */
	ulint		offset,		/*!< in: page offset */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST */
	ulint		n_flushed,	/*!< in: number of pages
					flushed so far in this batch */
	ulint		n_to_flush)	/*!< in: maximum number of pages
					we are allowed to flush */
{
	ulint		i;
	ulint		low;
	ulint		high;
	ulint		count = 0;
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
		/* If there is little space, it is better not to flush
		any block except from the end of the LRU list */

		low = offset;
		high = offset + 1;
	} else {
		/* When flushed, dirty blocks are searched in
		neighborhoods of this size, and flushed along with the
		original page. */

		ulint	buf_flush_area;

		buf_flush_area	= ut_min(
			BUF_READ_AHEAD_AREA(buf_pool),
			buf_pool->curr_size / 16);

		low = (offset / buf_flush_area) * buf_flush_area;
		high = (offset / buf_flush_area + 1) * buf_flush_area;
	}
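
	/* For example, assuming BUF_READ_AHEAD_AREA(buf_pool) were 64
	pages and buf_pool->curr_size / 16 were larger than that, a
	victim at offset 100 would give low = (100 / 64) * 64 = 64 and
	high = (100 / 64 + 1) * 64 = 128: the aligned 64-page
	neighborhood of the file that contains the victim page. */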
1314
 
 
1315
 
        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
1316
 
 
1317
 
        if (high > fil_space_get_size(space)) {
1318
 
                high = fil_space_get_size(space);
1319
 
        }
1320
 
 
1321
 
        for (i = low; i < high; i++) {
1322
 
 
1323
 
                buf_page_t*     bpage;
1324
 
 
1325
 
                if ((count + n_flushed) >= n_to_flush) {
1326
 
 
1327
 
                        /* We have already flushed enough pages and
1328
 
                        should call it a day. There is, however, one
1329
 
                        exception. If the page whose neighbors we
1330
 
                        are flushing has not been flushed yet then
1331
 
                        we'll try to flush the victim that we
1332
 
                        selected originally. */
1333
 
                        if (i <= offset) {
1334
 
                                i = offset;
1335
 
                        } else {
1336
 
                                break;
1337
 
                        }
1338
 
                }
1339
 
 
1340
 
                buf_pool = buf_pool_get(space, i);
1341
 
 
1342
 
                buf_pool_mutex_enter(buf_pool);
1343
 
 
1344
 
                /* We only want to flush pages from this buffer pool. */
1345
 
                bpage = buf_page_hash_get(buf_pool, space, i);
1346
 
 
1347
 
                if (!bpage) {
1348
 
 
1349
 
                        buf_pool_mutex_exit(buf_pool);
1350
 
                        continue;
1351
 
                }
1352
 
 
1353
 
                ut_a(buf_page_in_file(bpage));
1354
 
 
1355
 
                /* We avoid flushing 'non-old' blocks in an LRU flush,
1356
 
                because the flushed blocks are soon freed */
1357
 
 
1358
 
                if (flush_type != BUF_FLUSH_LRU
1359
 
                    || i == offset
1360
 
                    || buf_page_is_old(bpage)) {
1361
 
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);
1362
 
 
1363
 
                        mutex_enter(block_mutex);
1364
 
 
1365
 
                        if (buf_flush_ready_for_flush(bpage, flush_type)
1366
 
                            && (i == offset || !bpage->buf_fix_count)) {
1367
 
                                /* We only try to flush those
1368
 
                                neighbors != offset where the buf fix
1369
 
                                count is zero, as we then know that we
1370
 
                                probably can latch the page without a
1371
 
                                semaphore wait. Semaphore waits are
1372
 
                                expensive because we must flush the
1373
 
                                doublewrite buffer before we start
1374
 
                                waiting. */
1375
 
 
1376
 
                                buf_flush_page(buf_pool, bpage, flush_type);
1377
 
                                ut_ad(!mutex_own(block_mutex));
1378
 
                                ut_ad(!buf_pool_mutex_own(buf_pool));
1379
 
                                count++;
1380
 
                                continue;
1381
 
                        } else {
1382
 
                                mutex_exit(block_mutex);
1383
 
                        }
1384
 
                }
1385
 
                buf_pool_mutex_exit(buf_pool);
1386
 
        }
1387
 
 
1388
 
        return(count);
1389
 
}
1390
 
 
1391
 
/********************************************************************//**
1392
 
Check if the block is modified and ready for flushing. If the the block
1393
 
is ready to flush then flush the page and try o flush its neighbors.
1394
 
 
1395
 
@return TRUE if buf_pool mutex was not released during this function.
1396
 
This does not guarantee that some pages were written as well.
1397
 
Number of pages written are incremented to the count. */
1398
 
static
1399
 
ibool
1400
 
buf_flush_page_and_try_neighbors(
1401
 
/*=============================*/
1402
 
        buf_page_t*     bpage,          /*!< in: buffer control block,
1403
 
                                        must be
1404
 
                                        buf_page_in_file(bpage) */
1405
 
        enum buf_flush  flush_type,     /*!< in: BUF_FLUSH_LRU
1406
 
                                        or BUF_FLUSH_LIST */
1407
 
        ulint           n_to_flush,     /*!< in: number of pages to
1408
 
                                        flush */
1409
 
        ulint*          count)          /*!< in/out: number of pages
1410
 
                                        flushed */
1411
 
{
1412
 
        mutex_t*        block_mutex;
1413
 
        ibool           flushed = FALSE;
1414
 
#ifdef UNIV_DEBUG
1415
 
        buf_pool_t*     buf_pool = buf_pool_from_bpage(bpage);
1416
 
#endif /* UNIV_DEBUG */
1417
 
 
1418
 
        ut_ad(buf_pool_mutex_own(buf_pool));
1419
 
 
1420
 
        block_mutex = buf_page_get_mutex(bpage);
1421
 
        mutex_enter(block_mutex);
1422
 
 
1423
 
        ut_a(buf_page_in_file(bpage));
1424
 
 
1425
 
        if (buf_flush_ready_for_flush(bpage, flush_type)) {
1426
 
                ulint           space;
1427
 
                ulint           offset;
1428
 
                buf_pool_t*     buf_pool;
1429
 
 
1430
 
                buf_pool = buf_pool_from_bpage(bpage);
1431
 
 
1432
 
                buf_pool_mutex_exit(buf_pool);
1433
 
 
1434
 
                /* These fields are protected by both the
1435
 
                buffer pool mutex and block mutex. */
1436
 
                space = buf_page_get_space(bpage);
1437
 
                offset = buf_page_get_page_no(bpage);
1438
 
 
1439
 
                mutex_exit(block_mutex);
1440
 
 
1441
 
                /* Try to flush also all the neighbors */
1442
 
                *count += buf_flush_try_neighbors(space,
1443
 
                                                  offset,
1444
 
                                                  flush_type,
1445
 
                                                  *count,
1446
 
                                                  n_to_flush);
1447
 
 
1448
 
                buf_pool_mutex_enter(buf_pool);
1449
 
                flushed = TRUE;
1450
 
        } else {
1451
 
                mutex_exit(block_mutex);
1452
 
        }
1453
 
 
1454
 
        ut_ad(buf_pool_mutex_own(buf_pool));
1455
 
 
1456
 
        return(flushed);
1457
 
}
1458
 
 
1459
 
/*******************************************************************//**
1460
 
This utility flushes dirty blocks from the end of the LRU list.
1461
 
In the case of an LRU flush the calling thread may own latches to
1462
 
pages: to avoid deadlocks, this function must be written so that it
1463
 
cannot end up waiting for these latches!
1464
 
@return number of blocks for which the write request was queued. */
1465
 
static
1466
 
ulint
1467
 
buf_flush_LRU_list_batch(
1468
 
/*=====================*/
1469
 
        buf_pool_t*     buf_pool,       /*!< in: buffer pool instance */
1470
 
        ulint           max)            /*!< in: max of blocks to flush */
1471
 
{
1472
 
        buf_page_t*     bpage;
1473
 
        ulint           count = 0;
1474
 
 
1475
 
        ut_ad(buf_pool_mutex_own(buf_pool));
1476
 
 
1477
 
        do {
1478
 
                /* Start from the end of the list looking for a
1479
 
                suitable block to be flushed. */
1480
 
                bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1481
 
 
1482
 
                /* Iterate backwards over the flush list till we find
1483
 
                a page that isn't ready for flushing. */
1484
 
                while (bpage != NULL
1485
 
                       && !buf_flush_page_and_try_neighbors(
1486
 
                                bpage, BUF_FLUSH_LRU, max, &count)) {
1487
 
 
1488
 
                        bpage = UT_LIST_GET_PREV(LRU, bpage);
1489
 
                }
1490
 
        } while (bpage != NULL && count < max);
1491
 
 
1492
 
        /* We keep track of all flushes happening as part of LRU
1493
 
        flush. When estimating the desired rate at which flush_list
1494
 
        should be flushed, we factor in this value. */
1495
 
        buf_lru_flush_page_count += count;
1496
 
 
1497
 
        ut_ad(buf_pool_mutex_own(buf_pool));
1498
 
 
1499
 
        return(count);
1500
 
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already
running */
static
ulint
buf_flush_flush_list_batch(
/*=======================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n,		/*!< in: wished minimum number
					of blocks flushed (it is not
					guaranteed that the actual
					number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: all blocks whose
					oldest_modification is smaller
					than this should be flushed (if
					their number does not exceed
					min_n) */
{
	ulint		len;
	buf_page_t*	bpage;
	ulint		count = 0;

	ut_ad(buf_pool_mutex_own(buf_pool));

	/* If we have flushed enough, leave the loop */
	do {
		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		buf_flush_list_mutex_enter(buf_pool);

		/* We use len here because theoretically insertions can
		happen in the flush_list below while we are traversing
		it for a suitable candidate for flushing. We'd like to
		set a limit on how far we are willing to traverse
		the list. */
		len = UT_LIST_GET_LEN(buf_pool->flush_list);
		bpage = UT_LIST_GET_LAST(buf_pool->flush_list);

		if (bpage) {
			ut_a(bpage->oldest_modification > 0);
		}

		if (!bpage || bpage->oldest_modification >= lsn_limit) {

			/* We have flushed enough */
			buf_flush_list_mutex_exit(buf_pool);
			break;
		}

		ut_a(bpage->oldest_modification > 0);

		ut_ad(bpage->in_flush_list);

		buf_flush_list_mutex_exit(buf_pool);

		/* The list may change during the flushing and we cannot
		safely preserve within this function a pointer to a
		block in the list! */
		while (bpage != NULL
		       && len > 0
		       && !buf_flush_page_and_try_neighbors(
				bpage, BUF_FLUSH_LIST, min_n, &count)) {

			buf_flush_list_mutex_enter(buf_pool);

			/* If we are here that means that buf_pool->mutex
			was not released in buf_flush_page_and_try_neighbors()
			above and this guarantees that bpage didn't get
			relocated since we released the flush_list
			mutex above. There is a chance, however, that
			the bpage got removed from flush_list (not
			currently possible because flush_list_remove()
			also obtains buf_pool mutex but that may change
			in the future). To avoid this scenario we check
			the oldest_modification and if it is zero
			we start all over again. */
			if (bpage->oldest_modification == 0) {
				buf_flush_list_mutex_exit(buf_pool);
				break;
			}

			bpage = UT_LIST_GET_PREV(list, bpage);

			ut_ad(!bpage || bpage->in_flush_list);

			buf_flush_list_mutex_exit(buf_pool);

			--len;
		}

	} while (count < min_n && bpage != NULL && len > 0);

	ut_ad(buf_pool_mutex_own(buf_pool));

	return(count);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
static
ulint
buf_flush_batch(
/*============*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
					then the caller must not own any
					latches on pages */
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
					all blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	ulint		count	= 0;

	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */

	buf_pool_mutex_enter(buf_pool);

	/* Note: The buffer pool mutex is released and reacquired within
	the flush functions. */
	switch(flush_type) {
	case BUF_FLUSH_LRU:
		count = buf_flush_LRU_list_batch(buf_pool, min_n);
		break;
	case BUF_FLUSH_LIST:
		count = buf_flush_flush_list_batch(buf_pool, min_n, lsn_limit);
		break;
	default:
		ut_error;
	}

	buf_pool_mutex_exit(buf_pool);

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += count;

	return(count);
}

/******************************************************************//**
Gather the aggregated stats for both flush list and LRU list flushing */
static
void
buf_flush_common(
/*=============*/
	enum buf_flush	flush_type,	/*!< in: type of flush */
	ulint		page_count)	/*!< in: number of pages flushed */
{
	buf_flush_buffered_writes();

	ut_a(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	if (flush_type == BUF_FLUSH_LRU) {
		/* We keep track of all flushes happening as part of LRU
		flush. When estimating the desired rate at which flush_list
		should be flushed we factor in this value. */
		buf_lru_flush_page_count += page_count;
	}
}

/******************************************************************//**
Start a buffer flush batch for LRU or flush list */
static
ibool
buf_flush_start(
/*============*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_flush[flush_type] > 0
	   || buf_pool->init_flush[flush_type] == TRUE) {

		/* There is already a flush batch of the same type running */

		buf_pool_mutex_exit(buf_pool);

		return(FALSE);
	}

	buf_pool->init_flush[flush_type] = TRUE;

	buf_pool_mutex_exit(buf_pool);

	return(TRUE);
}

/******************************************************************//**
End a buffer flush batch for LRU or flush list */
static
void
buf_flush_end(
/*==========*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	buf_pool_mutex_enter(buf_pool);

	buf_pool->init_flush[flush_type] = FALSE;

	if (buf_pool->n_flush[flush_type] == 0) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	buf_pool_mutex_exit(buf_pool);
}
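
/* Callers bracket a batch with buf_flush_start() and buf_flush_end():
buf_flush_start() returns FALSE when a batch of the same type is already
marked as running, and buf_flush_end() signals the no_flush event once
buf_pool->n_flush[flush_type] has dropped to zero, which is what
buf_flush_wait_batch_end() below waits on. See buf_flush_LRU() and
buf_flush_list() for the typical start/batch/end/common sequence. */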

/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
	buf_pool_t*	buf_pool,	/*!< buffer pool instance */
	enum buf_flush	type)		/*!< in: BUF_FLUSH_LRU
					or BUF_FLUSH_LIST */
{
	ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
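
	/* A NULL buf_pool means: wait for the batch to end in every
	buffer pool instance. */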
	if (buf_pool == NULL) {
		ulint	i;

		for (i = 0; i < srv_buf_pool_instances; ++i) {
			buf_pool_t*	buf_pool;

			buf_pool = buf_pool_from_array(i);

			os_event_wait(buf_pool->no_flush[type]);
		}
	} else {
		os_event_wait(buf_pool->no_flush[type]);
	}
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
NOTE: The calling thread may own latches to pages: to avoid deadlocks,
this function must be written so that it cannot end up waiting for these
latches!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_LRU(
/*==========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	ulint		min_n)		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
{
	ulint		page_count;

	if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
		return(ULINT_UNDEFINED);
	}

	page_count = buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0);

	buf_flush_end(buf_pool, BUF_FLUSH_LRU);

	buf_flush_common(BUF_FLUSH_LRU, page_count);

	return(page_count);
}

/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush list of
all buffer pool instances.
NOTE: The calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_list(
/*===========*/
	ulint		min_n,		/*!< in: wished minimum number of blocks
					flushed (it is not guaranteed that the
					actual number is that big, though) */
	ib_uint64_t	lsn_limit)	/*!< in: in the case of BUF_FLUSH_LIST
					all blocks whose oldest_modification is
					smaller than this should be flushed
					(if their number does not exceed
					min_n), otherwise ignored */
{
	ulint		i;
	ulint		total_page_count = 0;
	ibool		skipped = FALSE;

	if (min_n != ULINT_MAX) {
		/* Ensure that flushing is spread evenly amongst the
		buffer pool instances. When min_n is ULINT_MAX
		we need to flush everything up to the lsn limit
		so no limit here. */
		min_n = (min_n + srv_buf_pool_instances - 1)
			 / srv_buf_pool_instances;
	}
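
	/* For example, assuming min_n == 1000 and
	srv_buf_pool_instances == 8, each instance would be asked to
	flush ceil(1000 / 8) = (1000 + 7) / 8 = 125 blocks. */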

	/* Flush to lsn_limit in all buffer pool instances */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;
		ulint		page_count = 0;

		buf_pool = buf_pool_from_array(i);

		if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
			/* We have two choices here. If lsn_limit was
			specified then skipping an instance of buffer
			pool means we cannot guarantee that all pages
			up to lsn_limit have been flushed. We can
			return right now with failure or we can try
			to flush remaining buffer pools up to the
			lsn_limit. We attempt to flush other buffer
			pools based on the assumption that it will
			help in the retry which will follow the
			failure. */
			skipped = TRUE;

			continue;
		}

		page_count = buf_flush_batch(
			buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit);

		buf_flush_end(buf_pool, BUF_FLUSH_LIST);

		buf_flush_common(BUF_FLUSH_LIST, page_count);

		total_page_count += page_count;
	}
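
	/* If any instance was skipped while a finite lsn_limit was
	requested, we cannot claim that everything up to lsn_limit has
	been flushed, so report ULINT_UNDEFINED and let the caller
	retry; otherwise report the total number of queued writes. */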

	return(lsn_limit != IB_ULONGLONG_MAX && skipped
	       ? ULINT_UNDEFINED : total_page_count);
}

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(
/*=========================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	buf_page_t*	bpage;
	ulint		n_replaceable;
	ulint		distance	= 0;

	buf_pool_mutex_enter(buf_pool);

	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

	bpage = UT_LIST_GET_LAST(buf_pool->LRU);

	while ((bpage != NULL)
	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
		   + BUF_FLUSH_EXTRA_MARGIN(buf_pool))
	       && (distance < BUF_LRU_FREE_SEARCH_LEN(buf_pool))) {

		mutex_t* block_mutex = buf_page_get_mutex(bpage);

		mutex_enter(block_mutex);

		if (buf_flush_ready_for_replace(bpage)) {
			n_replaceable++;
		}

		mutex_exit(block_mutex);

		distance++;

		bpage = UT_LIST_GET_PREV(LRU, bpage);
	}

	buf_pool_mutex_exit(buf_pool);

	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)) {

		return(0);
	}
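
	/* Otherwise recommend flushing the shortfall with respect to the
	free block margin plus the extra margin. For example, assuming
	BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool) were 400 blocks,
	BUF_FLUSH_EXTRA_MARGIN(buf_pool) 100 blocks and n_replaceable
	350, the recommendation would be 400 + 100 - 350 = 150 blocks. */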

	return(BUF_FLUSH_FREE_BLOCK_MARGIN(buf_pool)
	       + BUF_FLUSH_EXTRA_MARGIN(buf_pool)
	       - n_replaceable);
}

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(
/*==================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	ulint	n_to_flush;

	n_to_flush = buf_flush_LRU_recommendation(buf_pool);

	if (n_to_flush > 0) {
		ulint	n_flushed;

		n_flushed = buf_flush_LRU(buf_pool, n_to_flush);

		if (n_flushed == ULINT_UNDEFINED) {
			/* There was an LRU type flush batch already running;
			let us wait for it to end */

			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
		}
	}
}

/*********************************************************************//**
Flushes pages from the end of all the LRU lists. */
UNIV_INTERN
void
buf_flush_free_margins(void)
/*========================*/
{
	ulint	i;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_free_margin(buf_pool);
	}
}

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;

	lsn = log_get_lsn();
	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		and return. */
		buf_flush_stat_cur.redo = lsn;
		return;
	}

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		    - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;
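
	/* This keeps buf_flush_stat_sum equal to the sum of the last
	BUF_FLUSH_STAT_N_INTERVAL per-interval values: the entry about
	to be overwritten (item) drops out of the window and the value
	for the interval that just ended takes its place. */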

	/* put the current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}

/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
	ulint		i;
	lint		rate;
	ulint		redo_avg;
	ulint		n_dirty = 0;
	ulint		n_flush_req;
	ulint		lru_flush_avg;
	ib_uint64_t	lsn = log_get_lsn();
	ulint		log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of the log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get the total number of dirty pages. It is OK to access the
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		n_dirty += UT_LIST_GET_LEN(buf_pool->flush_list);
	}

	/* An overflow can happen if we generate more than 2^32 bytes
	of redo in this interval, i.e.: 4G of redo in 1 second. We can
	safely consider this as infinity because if we ever come close
	to 4G we'll start a synchronous flush of dirty pages. */
	/* redo_avg below is the average rate at which redo is generated
	over the past BUF_FLUSH_STAT_N_INTERVAL intervals, plus the redo
	generated in the current interval. */
	redo_avg = (ulint) (buf_flush_stat_sum.redo
			    / BUF_FLUSH_STAT_N_INTERVAL
			    + (lsn - buf_flush_stat_cur.redo));

	/* An overflow can possibly happen if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL intervals. This is a very
	unlikely scenario. Even if it happens it only means that our
	flush rate will be off the mark. It won't affect the correctness
	of any subsystem. */
	/* lru_flush_avg below is the rate at which pages have been
	flushed as part of LRU flushes over the past
	BUF_FLUSH_STAT_N_INTERVAL intervals, plus the number of pages
	flushed in the current interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
			/ BUF_FLUSH_STAT_N_INTERVAL
			+ (buf_lru_flush_page_count
			   - buf_flush_stat_cur.n_flushed);

	n_flush_req = (n_dirty * redo_avg) / log_capacity;
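
	/* In other words, the heuristic requests flushing the same
	fraction of the dirty pages per second as the fraction of the
	log capacity that the current redo rate consumes per second.
	For example, assuming n_dirty == 10000 pages, redo_avg == 1 MB/s
	and log_capacity == 100 MB, n_flush_req would be
	(10000 * 1) / 100 = 100 pages per second. */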

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the
	LRU list */
	rate = n_flush_req - lru_flush_avg;
	return(rate > 0 ? (ulint) rate : 0);
}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(
/*===================*/
	buf_pool_t*	buf_pool)		/*!< in: Buffer pool instance */
{
	buf_page_t*		bpage;
	const ib_rbt_node_t*	rnode = NULL;

	ut_ad(buf_flush_list_mutex_own(buf_pool));

	UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
			 ut_ad(ut_list_node_313->in_flush_list));

	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

	/* If we are in recovery mode i.e.: flush_rbt != NULL
	then each block in the flush_list must also be present
	in the flush_rbt. */
	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
		rnode = rbt_first(buf_pool->flush_rbt);
	}
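
	/* Walk the flush_list from its head and check that the blocks
	are ordered by descending oldest_modification and that, during
	recovery, the flush_rbt visits exactly the same blocks in the
	same order. */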
2127
 
 
2128
 
        while (bpage != NULL) {
2129
 
                const ib_uint64_t om = bpage->oldest_modification;
2130
 
 
2131
 
                ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
2132
 
 
2133
 
                ut_ad(bpage->in_flush_list);
2134
 
 
2135
 
                /* A page in flush_list can be in BUF_BLOCK_REMOVE_HASH
2136
 
                state. This happens when a page is in the middle of
2137
 
                being relocated. In that case the original descriptor
2138
 
                can have this state and still be in the flush list
2139
 
                waiting to acquire the flush_list_mutex to complete
2140
 
                the relocation. */
2141
 
                ut_a(buf_page_in_file(bpage)
2142
 
                     || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
2143
 
                ut_a(om > 0);
2144
 
 
2145
 
                if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
2146
 
                        buf_page_t** prpage;
2147
 
 
2148
 
                        ut_a(rnode);
2149
 
                        prpage = rbt_value(buf_page_t*, rnode);
2150
 
 
2151
 
                        ut_a(*prpage);
2152
 
                        ut_a(*prpage == bpage);
2153
 
                        rnode = rbt_next(buf_pool->flush_rbt, rnode);
2154
 
                }
2155
 
 
2156
 
                bpage = UT_LIST_GET_NEXT(list, bpage);
2157
 
 
2158
 
                ut_a(!bpage || om >= bpage->oldest_modification);
2159
 
        }
2160
 
 
2161
 
        /* By this time we must have exhausted the traversal of
2162
 
        flush_rbt (if active) as well. */
2163
 
        ut_a(rnode == NULL);
2164
 
 
2165
 
        return(TRUE);
2166
 
}
2167
 
 
2168
 
/******************************************************************//**
2169
 
Validates the flush list.
2170
 
@return TRUE if ok */
2171
 
UNIV_INTERN
2172
 
ibool
2173
 
buf_flush_validate(
2174
 
/*===============*/
2175
 
        buf_pool_t*     buf_pool)       /*!< buffer pool instance */
2176
 
{
2177
 
        ibool   ret;
2178
 
 
2179
 
        buf_flush_list_mutex_enter(buf_pool);
2180
 
 
2181
 
        ret = buf_flush_validate_low(buf_pool);
2182
 
 
2183
 
        buf_flush_list_mutex_exit(buf_pool);
2184
 
 
2185
 
        return(ret);
2186
 
}
2187
 
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2188
 
#endif /* !UNIV_HOTBACKUP */