~drizzle-trunk/drizzle/development


Viewing changes to plugin/innobase/buf/buf0flu.c

  • Committer: Brian Aker
  • Date: 2010-01-22 00:53:13 UTC
  • Revision ID: brian@gaz-20100122005313-jmizcbcdi1lt4tcx
Revert db patch.

 
/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values buf_flush_stat_cur.
Not protected by any mutex.  Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint            buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;

/* @} */
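
/* Illustrative sketch (not part of the original source): the variables
above form a 20-slot ring buffer yielding a moving average. Once per
second buf_flush_stat_update() stores the per-interval deltas in
buf_flush_stat_arr[buf_flush_stat_arr_ind], adds them to
buf_flush_stat_sum and subtracts the slot being overwritten, so that
buf_flush_stat_sum always holds the totals for the last
BUF_FLUSH_STAT_N_INTERVAL seconds, and
buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL is the average
amount of redo generated per second over the last 20 seconds. */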
 
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= block->page.oldest_modification));

        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);
        UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);

        prev_b = NULL;
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (b && b->oldest_modification > block->page.oldest_modification) {
                ut_ad(b->in_flush_list);
                prev_b = b;
                b = UT_LIST_GET_NEXT(list, b);
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, &block->page);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
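
/* Illustrative sketch (not part of the original source): the flush
list is ordered by descending oldest_modification. If the list
currently holds blocks with oldest_modification values [90, 70, 40]
and recovery inserts a block with value 60, the loop above walks past
90 and 70 (both > 60), stops at 40, and inserts the block after the
node holding 70, yielding [90, 70, 60, 40]. */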
 
/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
@return TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
        buf_page_t*     bpage)  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                "  InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));
        putc('\n', stderr);

        return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
        buf_page_t*     bpage,  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_a(buf_page_in_file(bpage));
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}
 
/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);
        ut_d(bpage->in_flush_list = FALSE);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        bpage->oldest_modification = 0;

        ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                              ut_ad(ut_list_node_313->in_flush_list)));
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        enum buf_flush  flush_type;

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}
 
/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                os_aio_simulated_wake_handler_threads();

                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written += trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*)block->page.zip.data,
                               (void*)block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->page.buf_fix_count,
                                (ulong)buf_block_get_io_fix(block),
                                (ulong)buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */

        os_aio_simulated_wake_handler_threads();

        /* Wait that all async writes to tablespaces have been posted to
        the OS */

        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */

        fil_flush_file_spaces(FIL_TABLESPACE);

        /* We can now reuse the doublewrite memory buffer: */

        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}
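
/* Illustrative summary (not part of the original source) of the write
ordering enforced above:

   1. Page images were copied into trx_doublewrite->write_buf by
      buf_flush_post_to_doublewrite_buf(); the buffer holds up to
      2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages split over block1 and
      block2.
   2. write_buf is written synchronously to the doublewrite area of
      the system tablespace (block1, then block2 if needed).
   3. fil_flush(TRX_SYS_SPACE) fsyncs the doublewrite area.
   4. Only then are asynchronous writes posted to the pages' real
      positions, followed by fil_flush_file_spaces(FIL_TABLESPACE).

If the server crashes during step 4, recovery finds an intact copy of
every in-flight page in the doublewrite area, so a torn write in a
data file can be repaired. */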
 
/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
 
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /*!< in/out: page */
        void*           page_zip_,      /*!< in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /*!< in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_ull(page_zip->data
                                       + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_print_timestamp(stderr);
                fputs("  InnoDB: ERROR: The compressed page to be written"
                      " seems corrupt:", stderr);
                ut_print_buf(stderr, page, zip_size);
                fputs("\nInnoDB: Possibly older version of the page:", stderr);
                ut_print_buf(stderr, page_zip->data, zip_size);
                putc('\n', stderr);
                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                       newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
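
/* Illustrative layout (not part of the original source) of the fields
this function writes into an uncompressed page, assuming the usual
fil0fil.h offsets (FIL_PAGE_SPACE_OR_CHKSUM = 0, FIL_PAGE_LSN = 16,
FIL_PAGE_END_LSN_OLD_CHKSUM = 8 bytes from the page end):

   bytes 0..3                    new-formula checksum
   bytes 16..23                  newest_lsn (8 bytes)
   ...
   bytes UNIV_PAGE_SIZE-8..-5    old-formula checksum (overwrites the
                                 high half of the trailer lsn)
   bytes UNIV_PAGE_SIZE-4..-1    low 4 bytes of newest_lsn

The corruption checks in buf_flush_buffered_writes() compare bytes
20..23 (the low half of the header lsn) with the last 4 bytes of the
page; both come from newest_lsn, so they match on any page that went
through this function. */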
 
#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size        = buf_page_get_zip_size(bpage);
        page_t* frame           = NULL;
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

        ut_ad(buf_page_in_file(bpage));

        /* We are not holding buf_pool_mutex or block_mutex here.
        Nevertheless, it is safe to access bpage, because it is
        io_fixed and oldest_modification != 0.  Thus, it cannot be
        relocated in the buffer pool or removed from flush_list or
        LRU_list. */
        ut_ad(!buf_pool_mutex_own());
        ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
        ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
        ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block */
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                break;
        case BUF_BLOCK_ZIP_DIRTY:
                frame = bpage->zip.data;
                if (UNIV_LIKELY(srv_use_checksums)) {
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
                             == page_zip_calc_checksum(frame, zip_size));
                }
                mach_write_ull(frame + FIL_PAGE_LSN,
                               bpage->newest_modification);
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                break;
        case BUF_BLOCK_FILE_PAGE:
                frame = bpage->zip.data;
                if (!frame) {
                        frame = ((buf_block_t*) bpage)->frame;
                }

                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
                                           bpage->zip.data
                                           ? &bpage->zip : NULL,
                                           bpage->newest_modification);
                break;
        }

        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_page_get_space(bpage), zip_size,
                       buf_page_get_page_no(bpage), 0,
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
                       frame, bpage);
        } else {
                buf_flush_post_to_doublewrite_buf(bpage);
        }
}
 
/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
        buf_page_t*     bpage,          /*!< in: buffer control block */
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU
                                        or BUF_FLUSH_LIST */
{
        mutex_t*        block_mutex;
        ibool           is_uncompressed;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_page_in_file(bpage));

        block_mutex = buf_page_get_mutex(bpage);
        ut_ad(mutex_own(block_mutex));

        ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

        buf_page_set_io_fix(bpage, BUF_IO_WRITE);

        buf_page_set_flush_type(bpage, flush_type);

        if (buf_pool->n_flush[flush_type] == 0) {

                os_event_reset(buf_pool->no_flush[flush_type]);
        }

        buf_pool->n_flush[flush_type]++;

        is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
        ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));

        switch (flush_type) {
                ibool   is_s_latched;
        case BUF_FLUSH_LIST:
                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                is_s_latched = (bpage->buf_fix_count == 0);
                if (is_s_latched && is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                /* Even though bpage is not protected by any mutex at
                this point, it is safe to access bpage, because it is
                io_fixed and oldest_modification != 0.  Thus, it
                cannot be relocated in the buffer pool or removed from
                flush_list or LRU_list. */

                if (!is_s_latched) {
                        buf_flush_buffered_writes();

                        if (is_uncompressed) {
                                rw_lock_s_lock_gen(&((buf_block_t*) bpage)
                                                   ->lock, BUF_IO_WRITE);
                        }
                }

                break;

        case BUF_FLUSH_LRU:
                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because buf_flush_ready_for_flush() must hold,
                and that requires the page not to be bufferfixed. */

                if (is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                break;

        default:
                ut_error;
        }

        /* Even though bpage is not protected by any mutex at this
        point, it is safe to access bpage, because it is io_fixed and
        oldest_modification != 0.  Thus, it cannot be relocated in the
        buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
        if (buf_debug_prints) {
                fprintf(stderr,
                        "Flushing %u space %u page %u\n",
                        flush_type, bpage->space, bpage->offset);
        }
#endif /* UNIV_DEBUG */
        buf_flush_write_block_low(bpage);
}
893
 
 
894
/***********************************************************//**
 
895
Flushes to disk all flushable pages within the flush area.
 
896
@return number of pages flushed */
 
897
static
 
898
ulint
 
899
buf_flush_try_neighbors(
 
900
/*====================*/
 
901
        ulint           space,          /*!< in: space id */
 
902
        ulint           offset,         /*!< in: page offset */
 
903
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU or
 
904
                                        BUF_FLUSH_LIST */
 
905
{
 
906
        buf_page_t*     bpage;
 
907
        ulint           low, high;
 
908
        ulint           count           = 0;
 
909
        ulint           i;
 
910
 
 
911
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
 
912
 
 
913
        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
 
914
                /* If there is little space, it is better not to flush any
 
915
                block except from the end of the LRU list */
 
916
 
 
917
                low = offset;
 
918
                high = offset + 1;
 
919
        } else {
 
920
                /* When flushed, dirty blocks are searched in neighborhoods of
 
921
                this size, and flushed along with the original page. */
 
922
 
 
923
                ulint   buf_flush_area  = ut_min(BUF_READ_AHEAD_AREA,
 
924
                                                 buf_pool->curr_size / 16);
 
925
 
 
926
                low = (offset / buf_flush_area) * buf_flush_area;
 
927
                high = (offset / buf_flush_area + 1) * buf_flush_area;
 
928
        }
 
929
 
 
930
        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
 
931
 
 
932
        if (high > fil_space_get_size(space)) {
 
933
                high = fil_space_get_size(space);
 
934
        }
 
935
 
 
936
        buf_pool_mutex_enter();
 
937
 
 
938
        for (i = low; i < high; i++) {
 
939
 
 
940
                bpage = buf_page_hash_get(space, i);
 
941
 
 
942
                if (!bpage) {
 
943
 
 
944
                        continue;
 
945
                }
 
946
 
 
947
                ut_a(buf_page_in_file(bpage));
 
948
 
 
949
                /* We avoid flushing 'non-old' blocks in an LRU flush,
 
950
                because the flushed blocks are soon freed */
 
951
 
 
952
                if (flush_type != BUF_FLUSH_LRU
 
953
                    || i == offset
 
954
                    || buf_page_is_old(bpage)) {
 
955
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);
 
956
 
 
957
                        mutex_enter(block_mutex);
 
958
 
 
959
                        if (buf_flush_ready_for_flush(bpage, flush_type)
 
960
                            && (i == offset || !bpage->buf_fix_count)) {
 
961
                                /* We only try to flush those
 
962
                                neighbors != offset where the buf fix count is
 
963
                                zero, as we then know that we probably can
 
964
                                latch the page without a semaphore wait.
 
965
                                Semaphore waits are expensive because we must
 
966
                                flush the doublewrite buffer before we start
 
967
                                waiting. */
 
968
 
 
969
                                buf_flush_page(bpage, flush_type);
 
970
                                ut_ad(!mutex_own(block_mutex));
 
971
                                count++;
 
972
 
 
973
                                buf_pool_mutex_enter();
 
974
                        } else {
 
975
                                mutex_exit(block_mutex);
 
976
                        }
 
977
                }
 
978
        }
 
979
 
 
980
        buf_pool_mutex_exit();
 
981
 
 
982
        return(count);
 
983
}
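
/* Illustrative arithmetic (not part of the original source), assuming
BUF_READ_AHEAD_AREA evaluates to 64 and a buffer pool of at least
16 * 64 pages: buf_flush_area = 64, so flushing page offset 133 scans
the aligned neighborhood

        low  = (133 / 64) * 64     = 128
        high = (133 / 64 + 1) * 64 = 192

and every page in [128, 192) that passes buf_flush_ready_for_flush()
is written in the same batch, turning scattered single-page writes
into one mostly sequential burst per neighborhood. */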
 
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
        enum buf_flush  flush_type,     /*!< in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
                                        then the caller must not own any
                                        latches on pages */
        ulint           min_n,          /*!< in: wished minimum number of blocks
                                        flushed (it is not guaranteed that the
                                        actual number is that big, though) */
        ib_uint64_t     lsn_limit)      /*!< in: in the case of
                                        BUF_FLUSH_LIST all blocks whose
                                        oldest_modification is smaller
                                        than this should be flushed (if
                                        their number does not exceed
                                        min_n), otherwise ignored */
{
        buf_page_t*     bpage;
        ulint           page_count      = 0;
        ulint           old_page_count;
        ulint           space;
        ulint           offset;

        ut_ad((flush_type == BUF_FLUSH_LRU)
              || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
        ut_ad((flush_type != BUF_FLUSH_LIST)
              || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
        buf_pool_mutex_enter();

        if ((buf_pool->n_flush[flush_type] > 0)
            || (buf_pool->init_flush[flush_type] == TRUE)) {

                /* There is already a flush batch of the same type running */

                buf_pool_mutex_exit();

                return(ULINT_UNDEFINED);
        }

        buf_pool->init_flush[flush_type] = TRUE;

        bool done_with_loop = false;
        while (!done_with_loop) {
flush_next:
                /* If we have flushed enough, leave the loop */
                if (page_count >= min_n) {

                        break;
                }

                /* Start from the end of the list looking for a suitable
                block to be flushed. */

                if (flush_type == BUF_FLUSH_LRU) {
                        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
                } else {
                        ut_ad(flush_type == BUF_FLUSH_LIST);

                        bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
                        if (!bpage
                            || bpage->oldest_modification >= lsn_limit) {
                                /* We have flushed enough */

                                break;
                        }
                        ut_ad(bpage->in_flush_list);
                }

                /* Note that after finding a single flushable page, we try to
                flush also all its neighbors, and after that start from the
                END of the LRU list or flush list again: the list may change
                during the flushing and we cannot safely preserve within this
                function a pointer to a block in the list! */

                do {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);
                        ibool   ready;

                        ut_a(buf_page_in_file(bpage));

                        mutex_enter(block_mutex);
                        ready = buf_flush_ready_for_flush(bpage, flush_type);
                        mutex_exit(block_mutex);

                        if (ready) {
                                space = buf_page_get_space(bpage);
                                offset = buf_page_get_page_no(bpage);

                                buf_pool_mutex_exit();

                                old_page_count = page_count;

                                /* Try to flush also all the neighbors */
                                page_count += buf_flush_try_neighbors(
                                        space, offset, flush_type);
                                /* fprintf(stderr,
                                "Flush type %lu, page no %lu, neighb %lu\n",
                                flush_type, offset,
                                page_count - old_page_count); */

                                buf_pool_mutex_enter();
                                goto flush_next;

                        } else if (flush_type == BUF_FLUSH_LRU) {
                                bpage = UT_LIST_GET_PREV(LRU, bpage);
                        } else {
                                ut_ad(flush_type == BUF_FLUSH_LIST);

                                bpage = UT_LIST_GET_PREV(list, bpage);
                                ut_ad(!bpage || bpage->in_flush_list);
                        }
                } while (bpage != NULL);

                /* If we could not find anything to flush, leave the loop */

                done_with_loop = true;

        }

        buf_pool->init_flush[flush_type] = FALSE;

        if (buf_pool->n_flush[flush_type] == 0) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }

        buf_pool_mutex_exit();

        buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
        if (buf_debug_prints && page_count > 0) {
                ut_a(flush_type == BUF_FLUSH_LRU
                     || flush_type == BUF_FLUSH_LIST);
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
                        ? "Flushed %lu pages in LRU flush\n"
                        : "Flushed %lu pages in flush list flush\n",
                        (ulong) page_count);
        }
#endif /* UNIV_DEBUG */

        srv_buf_pool_flushed += page_count;

        /* We keep track of all flushes happening as part of LRU
        flush. When estimating the desired rate at which flush_list
        should be flushed we factor in this value. */
        if (flush_type == BUF_FLUSH_LRU) {
                buf_lru_flush_page_count += page_count;
        }

        return(page_count);
}
 
/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
        enum buf_flush  type)   /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

        os_event_wait(buf_pool->no_flush[type]);
}

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
{
        buf_page_t*     bpage;
        ulint           n_replaceable;
        ulint           distance        = 0;

        buf_pool_mutex_enter();

        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

        bpage = UT_LIST_GET_LAST(buf_pool->LRU);

        while ((bpage != NULL)
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
                   + BUF_FLUSH_EXTRA_MARGIN)
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

                mutex_t* block_mutex = buf_page_get_mutex(bpage);

                mutex_enter(block_mutex);

                if (buf_flush_ready_for_replace(bpage)) {
                        n_replaceable++;
                }

                mutex_exit(block_mutex);

                distance++;

                bpage = UT_LIST_GET_PREV(LRU, bpage);
        }

        buf_pool_mutex_exit();

        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

                return(0);
        }

        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
               - n_replaceable);
}
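
/* Illustrative arithmetic (not part of the original source); the
margin constants live in buf0flu.h, so these numbers are made up for
the example. Suppose BUF_FLUSH_FREE_BLOCK_MARGIN = 69,
BUF_FLUSH_EXTRA_MARGIN = 117, and the scan above counts
n_replaceable = 50 (free blocks plus replaceable blocks near the LRU
tail). Since 50 < 69, the function recommends flushing

        69 + 117 - 50 = 136

blocks from the LRU end; had n_replaceable reached 69, it would have
returned 0 and no LRU flush would be started. */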
 
/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
        ulint   n_to_flush;
        ulint   n_flushed;

        n_to_flush = buf_flush_LRU_recommendation();

        if (n_to_flush > 0) {
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
                if (n_flushed == ULINT_UNDEFINED) {
                        /* There was an LRU type flush batch already running;
                        let us wait for it to end */

                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
                }
        }
}

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
        buf_flush_stat_t*       item;
        ib_uint64_t             lsn_diff;
        ib_uint64_t             lsn;
        ulint                   n_flushed;

        lsn = log_get_lsn();
        if (buf_flush_stat_cur.redo == 0) {
                /* First time around. Just update the current LSN
                and return. */
                buf_flush_stat_cur.redo = lsn;
                return;
        }

        item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

        /* values for this interval */
        lsn_diff = lsn - buf_flush_stat_cur.redo;
        n_flushed = buf_lru_flush_page_count
                    - buf_flush_stat_cur.n_flushed;

        /* add the current value and subtract the obsolete entry. */
        buf_flush_stat_sum.redo += lsn_diff - item->redo;
        buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

        /* put current entry in the array. */
        item->redo = lsn_diff;
        item->n_flushed = n_flushed;

        /* update the index */
        buf_flush_stat_arr_ind++;
        buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

        /* reset the current entry. */
        buf_flush_stat_cur.redo = lsn;
        buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
 
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
        ulint                   redo_avg;
        ulint                   lru_flush_avg;
        ulint                   n_dirty;
        ulint                   n_flush_req;
        lint                    rate;
        ib_uint64_t             lsn = log_get_lsn();
        ulint                   log_capacity = log_get_capacity();

        /* log_capacity should never be zero after the initialization
        of log subsystem. */
        ut_ad(log_capacity != 0);

        /* Get total number of dirty pages. It is OK to access
        flush_list without holding any mutex as we are using this
        only for heuristics. */
        n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

        /* An overflow can happen if we generate more than 2^32 bytes
        of redo in this interval i.e.: 4G of redo in 1 second. We can
        safely consider this as infinity because if we ever come close
        to 4G we'll start a synchronous flush of dirty pages. */
        /* redo_avg below is average at which redo is generated in
        past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
        interval. */
        redo_avg = (ulint) (buf_flush_stat_sum.redo
                            / BUF_FLUSH_STAT_N_INTERVAL
                            + (lsn - buf_flush_stat_cur.redo));

        /* An overflow can happen possibly if we flush more than 2^32
        pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
        unlikely scenario. Even when this happens it means that our
        flush rate will be off the mark. It won't affect correctness
        of any subsystem. */
        /* lru_flush_avg below is rate at which pages are flushed as
        part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
        number of pages flushed in the current interval. */
        lru_flush_avg = buf_flush_stat_sum.n_flushed
                        / BUF_FLUSH_STAT_N_INTERVAL
                        + (buf_lru_flush_page_count
                           - buf_flush_stat_cur.n_flushed);

        n_flush_req = (n_dirty * redo_avg) / log_capacity;

        /* The number of pages that we want to flush from the flush
        list is the difference between the required rate and the
        number of pages that we are historically flushing from the
        LRU list */
        rate = n_flush_req - lru_flush_avg;
        return(rate > 0 ? (ulint) rate : 0);
}
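
/* Illustrative arithmetic (not part of the original source), with
made-up round numbers: n_dirty = 10000 pages, redo_avg = 200000
bytes/s of redo, log_capacity = 10000000 bytes. Then

        n_flush_req = (10000 * 200000) / 10000000 = 200 pages/s.

If LRU flushing already averages lru_flush_avg = 50 pages/s, the
flush_list flusher is asked for 200 - 50 = 150 pages/s; if LRU
flushing alone meets or exceeds n_flush_req, the function returns 0. */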
 
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void)
/*========================*/
{
        buf_page_t*     bpage;

        UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                         ut_ad(ut_list_node_313->in_flush_list));

        bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (bpage != NULL) {
                const ib_uint64_t om = bpage->oldest_modification;
                ut_ad(bpage->in_flush_list);
                ut_a(buf_page_in_file(bpage));
                ut_a(om > 0);

                bpage = UT_LIST_GET_NEXT(list, bpage);

                ut_a(!bpage || om >= bpage->oldest_modification);
        }

        return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
{
        ibool   ret;

        buf_pool_mutex_enter();

        ret = buf_flush_validate_low();

        buf_pool_mutex_exit();

        return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */