~drizzle-trunk/drizzle/development

Viewing changes to plugin/innobase/buf/buf0flu.c

Renamed more stuff to drizzle.

/*****************************************************************************

Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0flu.c
The database buffer buf_pool flush algorithm

Created 11/11/1995 Heikki Tuuri
*******************************************************/

#include "buf0flu.h"

#ifdef UNIV_NONINL
#include "buf0flu.ic"
#endif

#include "buf0buf.h"
#include "srv0srv.h"
#include "page0zip.h"
#ifndef UNIV_HOTBACKUP
#include "ut0byte.h"
#include "ut0lst.h"
#include "page0page.h"
#include "fil0fil.h"
#include "buf0lru.h"
#include "buf0rea.h"
#include "ibuf0ibuf.h"
#include "log0log.h"
#include "os0file.h"
#include "trx0sys.h"

/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but it is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */

/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20

/** Sampled values of buf_flush_stat_cur.
Not protected by any mutex.  Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];

/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint            buf_flush_stat_arr_ind;

/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;

/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;

/** Number of pages flushed through non flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;

/* @} */
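
/* Illustrative note on the bookkeeping above: the stats form a
BUF_FLUSH_STAT_N_INTERVAL-slot ring buffer of one-second samples.
buf_flush_stat_sum is maintained incrementally by
buf_flush_stat_update(): each call adds the newest sample and
subtracts the slot it overwrites, so e.g.
buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL is a 20-second
moving average of the redo generation rate, with no rescan of
buf_flush_stat_arr[] needed. */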

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void);
/*========================*/
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

/********************************************************************//**
Inserts a modified block into the flush list. */
UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
              || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
                  <= block->page.oldest_modification));

        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);
        UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}
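
/* Illustrative note: the flush_list is kept ordered by
oldest_modification, largest first, which buf_flush_validate_low()
checks.  Adding at the head is correct here because a block enters the
flush list when it is first modified, so its oldest_modification is
the most recent lsn seen so far; the ut_ad() above asserts exactly
this.  During recovery, modifications are not replayed in lsn order,
which is why the sorted variant below must search for the right
position instead of blindly prepending. */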

/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
necessarily come in the order of lsn's. */
UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
        buf_block_t*    block)  /*!< in/out: block which is modified */
{
        buf_page_t*     prev_b;
        buf_page_t*     b;

        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

        ut_ad(block->page.in_LRU_list);
        ut_ad(block->page.in_page_hash);
        ut_ad(!block->page.in_zip_hash);
        ut_ad(!block->page.in_flush_list);
        ut_d(block->page.in_flush_list = TRUE);

        prev_b = NULL;
        b = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (b && b->oldest_modification > block->page.oldest_modification) {
                ut_ad(b->in_flush_list);
                prev_b = b;
                b = UT_LIST_GET_NEXT(list, b);
        }

        if (prev_b == NULL) {
                UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
        } else {
                UT_LIST_INSERT_AFTER(list, buf_pool->flush_list,
                                     prev_b, &block->page);
        }

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
        ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
}

/********************************************************************//**
Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
@return TRUE if can replace immediately */
UNIV_INTERN
ibool
buf_flush_ready_for_replace(
/*========================*/
        buf_page_t*     bpage)  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) and in the LRU list */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_LRU_list);

        if (UNIV_LIKELY(buf_page_in_file(bpage))) {

                return(bpage->oldest_modification == 0
                       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
                       && bpage->buf_fix_count == 0);
        }

        ut_print_timestamp(stderr);
        fprintf(stderr,
                "  InnoDB: Error: buffer block state %lu"
                " in the LRU list!\n",
                (ulong) buf_page_get_state(bpage));
        ut_print_buf(stderr, bpage, sizeof(buf_page_t));
        putc('\n', stderr);

        return(FALSE);
}

/********************************************************************//**
Returns TRUE if the block is modified and ready for flushing.
@return TRUE if can flush immediately */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
        buf_page_t*     bpage,  /*!< in: buffer control block, must be
                                buf_page_in_file(bpage) */
        enum buf_flush  flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_a(buf_page_in_file(bpage));
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (bpage->oldest_modification != 0
            && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
                ut_ad(bpage->in_flush_list);

                if (flush_type != BUF_FLUSH_LRU) {

                        return(TRUE);

                } else if (bpage->buf_fix_count == 0) {

                        /* If we are flushing the LRU list, to avoid deadlocks
                        we require the block not to be bufferfixed, and hence
                        not latched. */

                        return(TRUE);
                }
        }

        return(FALSE);
}

/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
void
buf_flush_remove(
/*=============*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        ut_ad(buf_pool_mutex_own());
        ut_ad(mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(bpage->in_flush_list);
        ut_d(bpage->in_flush_list = FALSE);

        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_PAGE:
                /* clean compressed pages should not be on the flush list */
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                return;
        case BUF_BLOCK_ZIP_DIRTY:
                buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                buf_LRU_insert_zip_clean(bpage);
                break;
        case BUF_BLOCK_FILE_PAGE:
                UT_LIST_REMOVE(list, buf_pool->flush_list, bpage);
                break;
        }

        bpage->oldest_modification = 0;

        ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                              ut_ad(ut_list_node_313->in_flush_list)));
}

/********************************************************************//**
Updates the flush system data structures when a write is completed. */
UNIV_INTERN
void
buf_flush_write_complete(
/*=====================*/
        buf_page_t*     bpage)  /*!< in: pointer to the block in question */
{
        enum buf_flush  flush_type;

        ut_ad(bpage);

        buf_flush_remove(bpage);

        flush_type = buf_page_get_flush_type(bpage);
        buf_pool->n_flush[flush_type]--;

        if (flush_type == BUF_FLUSH_LRU) {
                /* Put the block to the end of the LRU list to wait to be
                moved to the free list */

                buf_LRU_make_block_old(bpage);

                buf_pool->LRU_flush_ended++;
        }

        /* fprintf(stderr, "n pending flush %lu\n",
        buf_pool->n_flush[flush_type]); */

        if ((buf_pool->n_flush[flush_type] == 0)
            && (buf_pool->init_flush[flush_type] == FALSE)) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }
}

/********************************************************************//**
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
        byte*           write_buf;
        ulint           len;
        ulint           len2;
        ulint           i;

        if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
                os_aio_simulated_wake_handler_threads();

                return;
        }

        mutex_enter(&(trx_doublewrite->mutex));

        /* Write first to doublewrite buffer blocks. We use synchronous
        aio and thus know that file write has been completed when the
        control returns. */

        if (trx_doublewrite->first_free == 0) {

                mutex_exit(&(trx_doublewrite->mutex));

                return;
        }

        for (i = 0; i < trx_doublewrite->first_free; i++) {

                const buf_block_t*      block;

                block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];

                if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
                    || block->page.zip.data) {
                        /* No simple validate for compressed pages exists. */
                        continue;
                }

                if (UNIV_UNLIKELY
                    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
                            block->frame + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                            4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: before posting to the"
                                " doublewrite buffer.\n");
                }

                if (!block->check_index_page_at_flush) {
                } else if (page_is_comp(block->frame)) {
                        if (UNIV_UNLIKELY
                            (!page_simple_validate_new(block->frame))) {
corrupted_page:
                                buf_page_print(block->frame, 0);

                                ut_print_timestamp(stderr);
                                fprintf(stderr,
                                        "  InnoDB: Apparent corruption of an"
                                        " index page n:o %lu in space %lu\n"
                                        "InnoDB: to be written to data file."
                                        " We intentionally crash server\n"
                                        "InnoDB: to prevent corrupt data"
                                        " from ending up in data\n"
                                        "InnoDB: files.\n",
                                        (ulong) buf_block_get_page_no(block),
                                        (ulong) buf_block_get_space(block));

                                ut_error;
                        }
                } else if (UNIV_UNLIKELY
                           (!page_simple_validate_old(block->frame))) {

                        goto corrupted_page;
                }
        }

        /* increment the doublewrite flushed pages counter */
        srv_dblwr_pages_written += trx_doublewrite->first_free;
        srv_dblwr_writes++;

        len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
                     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf;
        i = 0;

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block1, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the doublewrite block1.\n");
                }
        }

        if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                goto flush;
        }

        len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
                * UNIV_PAGE_SIZE;

        write_buf = trx_doublewrite->write_buf
                + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
        ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);

        fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, 0,
               trx_doublewrite->block2, 0, len,
               (void*) write_buf, NULL);

        for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
             len2 += UNIV_PAGE_SIZE, i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                if (UNIV_LIKELY(!block->page.zip.data)
                    && UNIV_LIKELY(buf_block_get_state(block)
                                   == BUF_BLOCK_FILE_PAGE)
                    && UNIV_UNLIKELY
                    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
                            write_buf + len2
                            + (UNIV_PAGE_SIZE
                               - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be"
                                " written seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in"
                                " the doublewrite block2.\n");
                }
        }

flush:
        /* Now flush the doublewrite buffer data to disk */

        fil_flush(TRX_SYS_SPACE);

        /* We know that the writes have been flushed to disk now
        and in recovery we will find them in the doublewrite buffer
        blocks. Next do the writes to the intended positions. */

        for (i = 0; i < trx_doublewrite->first_free; i++) {
                const buf_block_t* block = (buf_block_t*)
                        trx_doublewrite->buf_block_arr[i];

                ut_a(buf_page_in_file(&block->page));
                if (UNIV_LIKELY_NULL(block->page.zip.data)) {
                        fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                               FALSE, buf_page_get_space(&block->page),
                               buf_page_get_zip_size(&block->page),
                               buf_page_get_page_no(&block->page), 0,
                               buf_page_get_zip_size(&block->page),
                               (void*)block->page.zip.data,
                               (void*)block);

                        /* Increment the counter of I/O operations used
                        for selecting LRU policy. */
                        buf_LRU_stat_inc_io();

                        continue;
                }

                ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

                if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
                                         block->frame
                                         + (UNIV_PAGE_SIZE
                                            - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
                                         4))) {
                        ut_print_timestamp(stderr);
                        fprintf(stderr,
                                "  InnoDB: ERROR: The page to be written"
                                " seems corrupt!\n"
                                "InnoDB: The lsn fields do not match!"
                                " Noticed in the buffer pool\n"
                                "InnoDB: after posting and flushing"
                                " the doublewrite buffer.\n"
                                "InnoDB: Page buf fix count %lu,"
                                " io fix %lu, state %lu\n",
                                (ulong)block->page.buf_fix_count,
                                (ulong)buf_block_get_io_fix(block),
                                (ulong)buf_block_get_state(block));
                }

                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_block_get_space(block), 0,
                       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
                       (void*)block->frame, (void*)block);

                /* Increment the counter of I/O operations used
                for selecting LRU policy. */
                buf_LRU_stat_inc_io();
        }

        /* Wake possible simulated aio thread to actually post the
        writes to the operating system */

        os_aio_simulated_wake_handler_threads();

        /* Wait until all async writes to tablespaces have been posted to
        the OS */

        os_aio_wait_until_no_pending_writes();

        /* Now we flush the data to disk (for example, with fsync) */

        fil_flush_file_spaces(FIL_TABLESPACE);

        /* We can now reuse the doublewrite memory buffer: */

        trx_doublewrite->first_free = 0;

        mutex_exit(&(trx_doublewrite->mutex));
}
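
/* Illustrative summary of the doublewrite protocol implemented above,
as this file uses it (two sequential blocks of
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages in the system tablespace):
1) up to 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE dirty pages are first
   copied into trx_doublewrite->write_buf;
2) that buffer is written synchronously to the two doublewrite blocks,
   and fil_flush(TRX_SYS_SPACE) makes the copies durable;
3) only then are the asynchronous writes to the pages' real locations
   posted, and the tablespaces fsync'ed.
A crash between 2) and 3) can tear a data-file page, but recovery will
find an intact copy of every in-flight page in the doublewrite area,
so the torn page can be restored. */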

/********************************************************************//**
Posts a buffer page for writing. If the doublewrite memory buffer is
full, calls buf_flush_buffered_writes and waits for free space to
appear. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size;
try_again:
        mutex_enter(&(trx_doublewrite->mutex));

        ut_a(buf_page_in_file(bpage));

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                goto try_again;
        }

        zip_size = buf_page_get_zip_size(bpage);

        if (UNIV_UNLIKELY(zip_size)) {
                /* Copy the compressed page and clear the rest. */
                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       bpage->zip.data, zip_size);
                memset(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
                       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
        } else {
                ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

                memcpy(trx_doublewrite->write_buf
                       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
                       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
        }

        trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;

        trx_doublewrite->first_free++;

        if (trx_doublewrite->first_free
            >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
                mutex_exit(&(trx_doublewrite->mutex));

                buf_flush_buffered_writes();

                return;
        }

        mutex_exit(&(trx_doublewrite->mutex));
}
#endif /* !UNIV_HOTBACKUP */

/********************************************************************//**
Initializes a page for writing to the tablespace. */
UNIV_INTERN
void
buf_flush_init_for_writing(
/*=======================*/
        byte*           page,           /*!< in/out: page */
        void*           page_zip_,      /*!< in/out: compressed page, or NULL */
        ib_uint64_t     newest_lsn)     /*!< in: newest modification lsn
                                        to the page */
{
        ut_ad(page);

        if (page_zip_) {
                page_zip_des_t* page_zip = page_zip_;
                ulint           zip_size = page_zip_get_size(page_zip);
                ut_ad(zip_size);
                ut_ad(ut_is_2pow(zip_size));
                ut_ad(zip_size <= UNIV_PAGE_SIZE);

                switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
                case FIL_PAGE_TYPE_ALLOCATED:
                case FIL_PAGE_INODE:
                case FIL_PAGE_IBUF_BITMAP:
                case FIL_PAGE_TYPE_FSP_HDR:
                case FIL_PAGE_TYPE_XDES:
                        /* These are essentially uncompressed pages. */
                        memcpy(page_zip->data, page, zip_size);
                        /* fall through */
                case FIL_PAGE_TYPE_ZBLOB:
                case FIL_PAGE_TYPE_ZBLOB2:
                case FIL_PAGE_INDEX:
                        mach_write_ull(page_zip->data
                                       + FIL_PAGE_LSN, newest_lsn);
                        memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                        mach_write_to_4(page_zip->data
                                        + FIL_PAGE_SPACE_OR_CHKSUM,
                                        srv_use_checksums
                                        ? page_zip_calc_checksum(
                                                page_zip->data, zip_size)
                                        : BUF_NO_CHECKSUM_MAGIC);
                        return;
                }

                ut_print_timestamp(stderr);
                fputs("  InnoDB: ERROR: The compressed page to be written"
                      " seems corrupt:", stderr);
                ut_print_buf(stderr, page, zip_size);
                fputs("\nInnoDB: Possibly older version of the page:", stderr);
                ut_print_buf(stderr, page_zip->data, zip_size);
                putc('\n', stderr);
                ut_error;
        }

        /* Write the newest modification lsn to the page header and trailer */
        mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);

        mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                       newest_lsn);

        /* Store the new formula checksum */

        mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_new_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);

        /* We overwrite the first 4 bytes of the end lsn field to store
        the old formula checksum. Since it depends also on the field
        FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
        new formula checksum. */

        mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
                        srv_use_checksums
                        ? buf_calc_page_old_checksum(page)
                        : BUF_NO_CHECKSUM_MAGIC);
}
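
/* Illustrative sketch of the fields written above for an uncompressed
page (symbolic offsets as used in this file):

    page + FIL_PAGE_LSN                : newest_lsn (8 bytes)
    page + FIL_PAGE_SPACE_OR_CHKSUM    : new-formula checksum (4 bytes)
    page + UNIV_PAGE_SIZE
         - FIL_PAGE_END_LSN_OLD_CHKSUM : old-formula checksum (4 bytes),
                                         then the low 4 bytes of newest_lsn

Because the low 4 bytes of the lsn appear both in the header and in the
trailer, a torn (partially written) page can be detected by comparing
the two, which is exactly the memcmp() check performed in
buf_flush_buffered_writes(). */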

#ifndef UNIV_HOTBACKUP
/********************************************************************//**
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
/*======================*/
        buf_page_t*     bpage)  /*!< in: buffer block to write */
{
        ulint   zip_size        = buf_page_get_zip_size(bpage);
        page_t* frame           = NULL;
#ifdef UNIV_LOG_DEBUG
        static ibool univ_log_debug_warned;
#endif /* UNIV_LOG_DEBUG */

        ut_ad(buf_page_in_file(bpage));

        /* We are not holding buf_pool_mutex or block_mutex here.
        Nevertheless, it is safe to access bpage, because it is
        io_fixed and oldest_modification != 0.  Thus, it cannot be
        relocated in the buffer pool or removed from flush_list or
        LRU_list. */
        ut_ad(!buf_pool_mutex_own());
        ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
        ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
        ut_ad(bpage->oldest_modification != 0);

#ifdef UNIV_IBUF_COUNT_DEBUG
        ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
#endif
        ut_ad(bpage->newest_modification != 0);

#ifdef UNIV_LOG_DEBUG
        if (!univ_log_debug_warned) {
                univ_log_debug_warned = TRUE;
                fputs("Warning: cannot force log to disk if"
                      " UNIV_LOG_DEBUG is defined!\n"
                      "Crash recovery will not work!\n",
                      stderr);
        }
#else
        /* Force the log to the disk before writing the modified block */
        log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
        switch (buf_page_get_state(bpage)) {
        case BUF_BLOCK_ZIP_FREE:
        case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
        case BUF_BLOCK_NOT_USED:
        case BUF_BLOCK_READY_FOR_USE:
        case BUF_BLOCK_MEMORY:
        case BUF_BLOCK_REMOVE_HASH:
                ut_error;
                break;
        case BUF_BLOCK_ZIP_DIRTY:
                frame = bpage->zip.data;
                if (UNIV_LIKELY(srv_use_checksums)) {
                        ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
                             == page_zip_calc_checksum(frame, zip_size));
                }
                mach_write_ull(frame + FIL_PAGE_LSN,
                               bpage->newest_modification);
                memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
                break;
        case BUF_BLOCK_FILE_PAGE:
                frame = bpage->zip.data;
                if (!frame) {
                        frame = ((buf_block_t*) bpage)->frame;
                }

                buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
                                           bpage->zip.data
                                           ? &bpage->zip : NULL,
                                           bpage->newest_modification);
                break;
        }

        if (!srv_use_doublewrite_buf || !trx_doublewrite) {
                fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                       FALSE, buf_page_get_space(bpage), zip_size,
                       buf_page_get_page_no(bpage), 0,
                       zip_size ? zip_size : UNIV_PAGE_SIZE,
                       frame, bpage);
        } else {
                buf_flush_post_to_doublewrite_buf(bpage);
        }
}

/********************************************************************//**
Writes a flushable page asynchronously from the buffer pool to a file.
NOTE: in simulated aio we must call
os_aio_simulated_wake_handler_threads after we have posted a batch of
writes! NOTE: buf_pool_mutex and buf_page_get_mutex(bpage) must be
held upon entering this function, and they will be released by this
function. */
static
void
buf_flush_page(
/*===========*/
        buf_page_t*     bpage,          /*!< in: buffer control block */
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU
                                        or BUF_FLUSH_LIST */
{
        mutex_t*        block_mutex;
        ibool           is_uncompressed;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
        ut_ad(buf_pool_mutex_own());
        ut_ad(buf_page_in_file(bpage));

        block_mutex = buf_page_get_mutex(bpage);
        ut_ad(mutex_own(block_mutex));

        ut_ad(buf_flush_ready_for_flush(bpage, flush_type));

        buf_page_set_io_fix(bpage, BUF_IO_WRITE);

        buf_page_set_flush_type(bpage, flush_type);

        if (buf_pool->n_flush[flush_type] == 0) {

                os_event_reset(buf_pool->no_flush[flush_type]);
        }

        buf_pool->n_flush[flush_type]++;

        is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
        ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));

        switch (flush_type) {
                ibool   is_s_latched;
        case BUF_FLUSH_LIST:
                /* If the simulated aio thread is not running, we must
                not wait for any latch, as we may end up in a deadlock:
                if buf_fix_count == 0, then we know we need not wait */

                is_s_latched = (bpage->buf_fix_count == 0);
                if (is_s_latched && is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();

                /* Even though bpage is not protected by any mutex at
                this point, it is safe to access bpage, because it is
                io_fixed and oldest_modification != 0.  Thus, it
                cannot be relocated in the buffer pool or removed from
                flush_list or LRU_list. */

                if (!is_s_latched) {
                        buf_flush_buffered_writes();

                        if (is_uncompressed) {
                                rw_lock_s_lock_gen(&((buf_block_t*) bpage)
                                                   ->lock, BUF_IO_WRITE);
                        }
                }

                break;

        case BUF_FLUSH_LRU:
                /* VERY IMPORTANT:
                Because any thread may call the LRU flush, even when owning
                locks on pages, to avoid deadlocks, we must make sure that the
                s-lock is acquired on the page without waiting: this is
                accomplished because buf_flush_ready_for_flush() must hold,
                and that requires the page not to be bufferfixed. */

                if (is_uncompressed) {
                        rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
                                           BUF_IO_WRITE);
                }

                /* Note that the s-latch is acquired before releasing the
                buf_pool mutex: this ensures that the latch is acquired
                immediately. */

                mutex_exit(block_mutex);
                buf_pool_mutex_exit();
                break;

        default:
                ut_error;
        }

        /* Even though bpage is not protected by any mutex at this
        point, it is safe to access bpage, because it is io_fixed and
        oldest_modification != 0.  Thus, it cannot be relocated in the
        buffer pool or removed from flush_list or LRU_list. */

#ifdef UNIV_DEBUG
        if (buf_debug_prints) {
                fprintf(stderr,
                        "Flushing %u space %u page %u\n",
                        flush_type, bpage->space, bpage->offset);
        }
#endif /* UNIV_DEBUG */
        buf_flush_write_block_low(bpage);
}

/***********************************************************//**
Flushes to disk all flushable pages within the flush area.
@return number of pages flushed */
static
ulint
buf_flush_try_neighbors(
/*====================*/
        ulint           space,          /*!< in: space id */
        ulint           offset,         /*!< in: page offset */
        enum buf_flush  flush_type)     /*!< in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST */
{
        buf_page_t*     bpage;
        ulint           low, high;
        ulint           count           = 0;
        ulint           i;

        ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);

        if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
                /* If there is little space, it is better not to flush any
                block except from the end of the LRU list */

                low = offset;
                high = offset + 1;
        } else {
                /* When flushed, dirty blocks are searched in neighborhoods of
                this size, and flushed along with the original page. */

                ulint   buf_flush_area  = ut_min(BUF_READ_AHEAD_AREA,
                                                 buf_pool->curr_size / 16);

                low = (offset / buf_flush_area) * buf_flush_area;
                high = (offset / buf_flush_area + 1) * buf_flush_area;
        }

        /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */

        if (high > fil_space_get_size(space)) {
                high = fil_space_get_size(space);
        }

        buf_pool_mutex_enter();

        for (i = low; i < high; i++) {

                bpage = buf_page_hash_get(space, i);

                if (!bpage) {

                        continue;
                }

                ut_a(buf_page_in_file(bpage));

                /* We avoid flushing 'non-old' blocks in an LRU flush,
                because the flushed blocks are soon freed */

                if (flush_type != BUF_FLUSH_LRU
                    || i == offset
                    || buf_page_is_old(bpage)) {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);

                        mutex_enter(block_mutex);

                        if (buf_flush_ready_for_flush(bpage, flush_type)
                            && (i == offset || !bpage->buf_fix_count)) {
                                /* We only try to flush those
                                neighbors != offset where the buf fix count is
                                zero, as we then know that we probably can
                                latch the page without a semaphore wait.
                                Semaphore waits are expensive because we must
                                flush the doublewrite buffer before we start
                                waiting. */

                                buf_flush_page(bpage, flush_type);
                                ut_ad(!mutex_own(block_mutex));
                                count++;

                                buf_pool_mutex_enter();
                        } else {
                                mutex_exit(block_mutex);
                        }
                }
        }

        buf_pool_mutex_exit();

        return(count);
}
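
/* Worked example for the neighborhood above (hypothetical numbers):
with buf_flush_area == 64 and offset == 1000,
    low  = (1000 / 64) * 64       = 960
    high = (1000 / 64 + 1) * 64   = 1024
so pages 960..1023 of the space are probed, and every flushable old
neighbor is written in the same batch, turning many small random
writes into one larger, more sequential burst. */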

/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it cannot
end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued;
ULINT_UNDEFINED if there was a flush of the same type already running */
UNIV_INTERN
ulint
buf_flush_batch(
/*============*/
        enum buf_flush  flush_type,     /*!< in: BUF_FLUSH_LRU or
                                        BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
                                        then the caller must not own any
                                        latches on pages */
        ulint           min_n,          /*!< in: wished minimum number of blocks
                                        flushed (it is not guaranteed that the
                                        actual number is that big, though) */
        ib_uint64_t     lsn_limit)      /*!< in: in the case of BUF_FLUSH_LIST
                                        all blocks whose oldest_modification is
                                        smaller than this should be flushed
                                        (if their number does not exceed
                                        min_n), otherwise ignored */
{
        buf_page_t*     bpage;
        ulint           page_count      = 0;
        ulint           old_page_count;
        ulint           space;
        ulint           offset;

        ut_ad((flush_type == BUF_FLUSH_LRU)
              || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
        ut_ad((flush_type != BUF_FLUSH_LIST)
              || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
        buf_pool_mutex_enter();

        if ((buf_pool->n_flush[flush_type] > 0)
            || (buf_pool->init_flush[flush_type] == TRUE)) {

                /* There is already a flush batch of the same type running */

                buf_pool_mutex_exit();

                return(ULINT_UNDEFINED);
        }

        buf_pool->init_flush[flush_type] = TRUE;

        bool done_with_loop = false;
        for (; done_with_loop != true;) {
flush_next:
                /* If we have flushed enough, leave the loop */
                if (page_count >= min_n) {

                        break;
                }

                /* Start from the end of the list looking for a suitable
                block to be flushed. */

                if (flush_type == BUF_FLUSH_LRU) {
                        bpage = UT_LIST_GET_LAST(buf_pool->LRU);
                } else {
                        ut_ad(flush_type == BUF_FLUSH_LIST);

                        bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
                        if (!bpage
                            || bpage->oldest_modification >= lsn_limit) {
                                /* We have flushed enough */

                                break;
                        }
                        ut_ad(bpage->in_flush_list);
                }

                /* Note that after finding a single flushable page, we try to
                flush also all its neighbors, and after that start from the
                END of the LRU list or flush list again: the list may change
                during the flushing and we cannot safely preserve within this
                function a pointer to a block in the list! */

                do {
                        mutex_t* block_mutex = buf_page_get_mutex(bpage);
                        ibool   ready;

                        ut_a(buf_page_in_file(bpage));

                        mutex_enter(block_mutex);
                        ready = buf_flush_ready_for_flush(bpage, flush_type);
                        mutex_exit(block_mutex);

                        if (ready) {
                                space = buf_page_get_space(bpage);
                                offset = buf_page_get_page_no(bpage);

                                buf_pool_mutex_exit();

                                old_page_count = page_count;

                                /* Try to flush also all the neighbors */
                                page_count += buf_flush_try_neighbors(
                                        space, offset, flush_type);
                                /* fprintf(stderr,
                                "Flush type %lu, page no %lu, neighb %lu\n",
                                flush_type, offset,
                                page_count - old_page_count); */

                                buf_pool_mutex_enter();
                                goto flush_next;

                        } else if (flush_type == BUF_FLUSH_LRU) {
                                bpage = UT_LIST_GET_PREV(LRU, bpage);
                        } else {
                                ut_ad(flush_type == BUF_FLUSH_LIST);

                                bpage = UT_LIST_GET_PREV(list, bpage);
                                ut_ad(!bpage || bpage->in_flush_list);
                        }
                } while (bpage != NULL);

                /* If we could not find anything to flush, leave the loop */

                done_with_loop = true;

        }

        buf_pool->init_flush[flush_type] = FALSE;

        if (buf_pool->n_flush[flush_type] == 0) {

                /* The running flush batch has ended */

                os_event_set(buf_pool->no_flush[flush_type]);
        }

        buf_pool_mutex_exit();

        buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
        if (buf_debug_prints && page_count > 0) {
                ut_a(flush_type == BUF_FLUSH_LRU
                     || flush_type == BUF_FLUSH_LIST);
                fprintf(stderr, flush_type == BUF_FLUSH_LRU
                        ? "Flushed %lu pages in LRU flush\n"
                        : "Flushed %lu pages in flush list flush\n",
                        (ulong) page_count);
        }
#endif /* UNIV_DEBUG */

        srv_buf_pool_flushed += page_count;

        /* We keep track of all flushes happening as part of LRU
        flush. When estimating the desired rate at which flush_list
        should be flushed we factor in this value. */
        if (flush_type == BUF_FLUSH_LRU) {
                buf_lru_flush_page_count += page_count;
        }

        return(page_count);
}

/******************************************************************//**
Waits until a flush batch of the given type ends */
UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
        enum buf_flush  type)   /*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
        ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));

        os_event_wait(buf_pool->no_flush[type]);
}

/******************************************************************//**
Gives a recommendation of how many blocks should be flushed to establish
a big enough margin of replaceable blocks near the end of the LRU list
and in the free list.
@return number of blocks which should be flushed from the end of the
LRU list */
static
ulint
buf_flush_LRU_recommendation(void)
/*==============================*/
{
        buf_page_t*     bpage;
        ulint           n_replaceable;
        ulint           distance        = 0;

        buf_pool_mutex_enter();

        n_replaceable = UT_LIST_GET_LEN(buf_pool->free);

        bpage = UT_LIST_GET_LAST(buf_pool->LRU);

        while ((bpage != NULL)
               && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
                   + BUF_FLUSH_EXTRA_MARGIN)
               && (distance < BUF_LRU_FREE_SEARCH_LEN)) {

                mutex_t* block_mutex = buf_page_get_mutex(bpage);

                mutex_enter(block_mutex);

                if (buf_flush_ready_for_replace(bpage)) {
                        n_replaceable++;
                }

                mutex_exit(block_mutex);

                distance++;

                bpage = UT_LIST_GET_PREV(LRU, bpage);
        }

        buf_pool_mutex_exit();

        if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {

                return(0);
        }

        return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
               - n_replaceable);
}

/*********************************************************************//**
Flushes pages from the end of the LRU list if there is too small a margin
of replaceable pages there or in the free list. VERY IMPORTANT: this function
is called also by threads which have locks on pages. To avoid deadlocks, we
flush only pages such that the s-lock required for flushing can be acquired
immediately, without waiting. */
UNIV_INTERN
void
buf_flush_free_margin(void)
/*=======================*/
{
        ulint   n_to_flush;
        ulint   n_flushed;

        n_to_flush = buf_flush_LRU_recommendation();

        if (n_to_flush > 0) {
                n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush, 0);
                if (n_flushed == ULINT_UNDEFINED) {
                        /* There was an LRU type flush batch already running;
                        let us wait for it to end */

                        buf_flush_wait_batch_end(BUF_FLUSH_LRU);
                }
        }
}

/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
        buf_flush_stat_t*       item;
        ib_uint64_t             lsn_diff;
        ib_uint64_t             lsn;
        ulint                   n_flushed;

        lsn = log_get_lsn();
        if (buf_flush_stat_cur.redo == 0) {
                /* First time around. Just update the current LSN
                and return. */
                buf_flush_stat_cur.redo = lsn;
                return;
        }

        item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

        /* values for this interval */
        lsn_diff = lsn - buf_flush_stat_cur.redo;
        n_flushed = buf_lru_flush_page_count
                    - buf_flush_stat_cur.n_flushed;

        /* add the current value and subtract the obsolete entry. */
        buf_flush_stat_sum.redo += lsn_diff - item->redo;
        buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

        /* put current entry in the array. */
        item->redo = lsn_diff;
        item->n_flushed = n_flushed;

        /* update the index */
        buf_flush_stat_arr_ind++;
        buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

        /* reset the current entry. */
        buf_flush_stat_cur.redo = lsn;
        buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
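
/* Worked example of one update step (hypothetical numbers): if this
interval generated lsn_diff == 2 MB of redo and the slot being
overwritten holds item->redo == 1 MB, then buf_flush_stat_sum.redo
changes by +2 MB - 1 MB == +1 MB: the running sum over the window
moves by exactly the difference between the sample entering and the
sample leaving.  n_flushed is maintained the same way. */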

/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
        ulint                   redo_avg;
        ulint                   lru_flush_avg;
        ulint                   n_dirty;
        ulint                   n_flush_req;
        lint                    rate;
        ib_uint64_t             lsn = log_get_lsn();
        ulint                   log_capacity = log_get_capacity();

        /* log_capacity should never be zero after the initialization
        of log subsystem. */
        ut_ad(log_capacity != 0);

        /* Get total number of dirty pages. It is OK to access
        flush_list without holding any mutex as we are using this
        only for heuristics. */
        n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

        /* An overflow can happen if we generate more than 2^32 bytes
        of redo in this interval i.e.: 4G of redo in 1 second. We can
        safely consider this as infinity because if we ever come close
        to 4G we'll start a synchronous flush of dirty pages. */
        /* redo_avg below is the average rate at which redo was generated
        in the past BUF_FLUSH_STAT_N_INTERVAL intervals + the redo
        generated in the current interval. */
        redo_avg = (ulint) (buf_flush_stat_sum.redo
                            / BUF_FLUSH_STAT_N_INTERVAL
                            + (lsn - buf_flush_stat_cur.redo));

        /* An overflow can possibly happen if we flush more than 2^32
        pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very, very
        unlikely scenario. Even when this happens it means that our
        flush rate will be off the mark. It won't affect correctness
        of any subsystem. */
        /* lru_flush_avg below is the rate at which pages were flushed as
        part of LRU flush in the past BUF_FLUSH_STAT_N_INTERVAL intervals
        + the number of pages flushed in the current interval. */
        lru_flush_avg = buf_flush_stat_sum.n_flushed
                        / BUF_FLUSH_STAT_N_INTERVAL
                        + (buf_lru_flush_page_count
                           - buf_flush_stat_cur.n_flushed);

        n_flush_req = (n_dirty * redo_avg) / log_capacity;

        /* The number of pages that we want to flush from the flush
        list is the difference between the required rate and the
        number of pages that we are historically flushing from the
        LRU list */
        rate = n_flush_req - lru_flush_avg;
        return(rate > 0 ? (ulint) rate : 0);
}
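
/* Worked example (hypothetical numbers): with n_dirty == 10000 dirty
pages, redo_avg equal to 1% of log_capacity per second, and
lru_flush_avg == 40 pages/s:
    n_flush_req = (10000 * redo_avg) / log_capacity = 100 pages/s
    rate        = 100 - 40                          = 60 pages/s
so the caller is asked to flush about 60 flush_list pages per second;
if LRU flushing already keeps up (lru_flush_avg >= n_flush_req), the
function returns 0. */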

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
static
ibool
buf_flush_validate_low(void)
/*========================*/
{
        buf_page_t*     bpage;

        UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
                         ut_ad(ut_list_node_313->in_flush_list));

        bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);

        while (bpage != NULL) {
                const ib_uint64_t om = bpage->oldest_modification;
                ut_ad(bpage->in_flush_list);
                ut_a(buf_page_in_file(bpage));
                ut_a(om > 0);

                bpage = UT_LIST_GET_NEXT(list, bpage);

                ut_a(!bpage || om >= bpage->oldest_modification);
        }

        return(TRUE);
}

/******************************************************************//**
Validates the flush list.
@return TRUE if ok */
UNIV_INTERN
ibool
buf_flush_validate(void)
/*====================*/
{
        ibool   ret;

        buf_pool_mutex_enter();

        ret = buf_flush_validate_low();

        buf_pool_mutex_exit();

        return(ret);
}
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
#endif /* !UNIV_HOTBACKUP */